未验证 提交 3616d593 编写于 作者: W whs 提交者: GitHub

Remove fluid API (#1578)

上级 c728e779
......@@ -17,11 +17,9 @@ from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
import paddle
from paddle.nn.initializer import Constant, KaimingUniform
from paddle.nn import Conv2D
from paddle.fluid.dygraph.nn import Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable
from genotypes import PRIMITIVES
from genotypes import Genotype
from operations import *
......@@ -40,7 +38,7 @@ class ConvBN(paddle.nn.Layer):
name=name + "_conv" if name is not None else None,
initializer=KaimingUniform()),
bias_attr=False)
self.bn = BatchNorm(
self.bn = paddle.nn.BatchNorm(
num_channels=c_out,
param_attr=paddle.ParamAttr(
name=name + "_bn_scale" if name is not None else None,
......@@ -61,11 +59,11 @@ class ConvBN(paddle.nn.Layer):
class Classifier(paddle.nn.Layer):
def __init__(self, input_dim, num_classes, name=None):
super(Classifier, self).__init__()
self.pool2d = Pool2D(pool_type='avg', global_pooling=True)
self.fc = Linear(
input_dim=input_dim,
output_dim=num_classes,
param_attr=paddle.ParamAttr(
self.pool2d = paddle.nn.AdaptiveAvgPool2D(output_size=1)
self.fc = paddle.nn.Linear(
input_dim,
num_classes,
weight_attr=paddle.ParamAttr(
name=name + "_fc_weights" if name is not None else None,
initializer=KaimingUniform()),
bias_attr=paddle.ParamAttr(
......@@ -84,7 +82,7 @@ def drop_path(x, drop_prob):
keep_prob = 1. - drop_prob
mask = 1 - np.random.binomial(
1, drop_prob, size=[x.shape[0]]).astype(np.float32)
mask = to_variable(mask)
mask = paddle.to_tensor(mask)
x = paddle.multiply(x / keep_prob, mask)
return x
......@@ -150,8 +148,7 @@ class Cell(paddle.nn.Layer):
class AuxiliaryHeadCIFAR(paddle.nn.Layer):
def __init__(self, C, num_classes):
super(AuxiliaryHeadCIFAR, self).__init__()
self.avgpool = Pool2D(
pool_size=5, pool_stride=3, pool_padding=0, pool_type='avg')
self.avgpool = paddle.nn.AvgPool2D(5, stride=3, padding=0)
self.conv_bn1 = ConvBN(
c_curr=C,
c_out=128,
......@@ -228,8 +225,7 @@ class NetworkCIFAR(paddle.nn.Layer):
class AuxiliaryHeadImageNet(paddle.nn.Layer):
def __init__(self, C, num_classes):
super(AuxiliaryHeadImageNet, self).__init__()
self.avgpool = Pool2D(
pool_size=5, pool_stride=2, pool_padding=0, pool_type='avg')
self.avgpool = paddle.nn.AvgPool2D(5, stride=2, padding=0)
self.conv_bn1 = ConvBN(
c_curr=C,
c_out=128,
......
......@@ -17,10 +17,8 @@ from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
from paddle.nn.initializer import Normal, KaimingUniform, Constant
from paddle.nn import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable
from genotypes import PRIMITIVES
from operations import *
import paddleslim
......@@ -159,9 +157,9 @@ class Network(paddle.nn.Layer):
self.cells = paddle.nn.LayerList(cells)
self.global_pooling = Pool2D(pool_type='avg', global_pooling=True)
self.classifier = Linear(
input_dim=c_prev,
output_dim=num_classes,
param_attr=paddle.ParamAttr(initializer=KaimingUniform()),
c_prev,
num_classes,
weight_attr=paddle.ParamAttr(initializer=KaimingUniform()),
bias_attr=paddle.ParamAttr(initializer=KaimingUniform()))
self._initialize_alphas()
......
......@@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import paddle
from paddle.nn import Conv2D
from paddle.fluid.dygraph.nn import Pool2D, BatchNorm
from paddle.nn import BatchNorm
from paddle.nn.initializer import Constant, KaimingUniform
......@@ -22,17 +22,15 @@ OPS = {
'none':
lambda C, stride, affine: Zero(stride),
'avg_pool_3x3':
lambda C, stride, affine: Pool2D(
pool_size=3,
pool_type="avg",
pool_stride=stride,
pool_padding=1),
lambda C, stride, affine: paddle.nn.AvgPool2D(
3,
stride=stride,
padding=1),
'max_pool_3x3':
lambda C, stride, affine: Pool2D(
pool_size=3,
pool_type="max",
pool_stride=stride,
pool_padding=1),
lambda C, stride, affine: paddle.nn.MaxPool2D(
3,
stride=stride,
padding=1),
'skip_connect':
lambda C, stride, affine: Identity()
if stride == 1 else FactorizedReduce(C, C, affine),
......@@ -67,7 +65,7 @@ class Zero(paddle.nn.Layer):
def __init__(self, stride):
super(Zero, self).__init__()
self.stride = stride
self.pool = Pool2D(pool_size=1, pool_stride=2)
self.pool = paddle.nn.MaxPool2D(1, stride=2)
def forward(self, x):
pooled = self.pool(x)
......
......@@ -22,8 +22,6 @@ import ast
import argparse
import functools
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
import reader
from model_search import Network
from paddleslim.nas.darts import DARTSearch
......@@ -72,26 +70,25 @@ def main(args):
is_shuffle=True,
args=args)
with fluid.dygraph.guard(place):
model = Network(args.init_channels, args.class_num, args.layers,
args.method)
searcher = DARTSearch(
model,
train_reader,
valid_reader,
place,
learning_rate=args.learning_rate,
batchsize=args.batch_size,
num_imgs=args.trainset_num,
arch_learning_rate=args.arch_learning_rate,
unrolled=args.unrolled,
num_epochs=args.epochs,
epochs_no_archopt=args.epochs_no_archopt,
use_multiprocess=args.use_multiprocess,
use_data_parallel=args.use_data_parallel,
save_dir=args.model_save_dir,
log_freq=args.log_freq)
searcher.train()
model = Network(args.init_channels, args.class_num, args.layers,
args.method)
searcher = DARTSearch(
model,
train_reader,
valid_reader,
place,
learning_rate=args.learning_rate,
batchsize=args.batch_size,
num_imgs=args.trainset_num,
arch_learning_rate=args.arch_learning_rate,
unrolled=args.unrolled,
num_epochs=args.epochs,
epochs_no_archopt=args.epochs_no_archopt,
use_multiprocess=args.use_multiprocess,
use_data_parallel=args.use_data_parallel,
save_dir=args.model_save_dir,
log_freq=args.log_freq)
searcher.train()
if __name__ == '__main__':
......
......@@ -23,8 +23,8 @@ import logging
import argparse
import functools
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddleslim.common import AvgrageMeter, get_logger
from paddleslim.nas.darts import count_parameters_in_MB
......@@ -72,8 +72,8 @@ def train(model, train_reader, optimizer, epoch, drop_path_prob, args):
for step_id, data in enumerate(train_reader()):
image_np, label_np = data
image = to_variable(image_np)
label = to_variable(label_np)
image = paddle.to_tensor(image_np)
label = paddle.to_tensor(label_np)
label.stop_gradient = True
logits, logits_aux = model(image, drop_path_prob, True)
......@@ -117,8 +117,8 @@ def valid(model, valid_reader, epoch, args):
for step_id, data in enumerate(valid_reader()):
image_np, label_np = data
image = to_variable(image_np)
label = to_variable(label_np)
image = paddle.to_tensor(image_np)
label = paddle.to_tensor(label_np)
logits, _ = model(image, 0, False)
prec1 = paddle.static.accuracy(input=logits, label=label, k=1)
prec5 = paddle.static.accuracy(input=logits, label=label, k=5)
......@@ -140,83 +140,75 @@ def main(args):
place = paddle.CUDAPlace(paddle.distributed.parallel.ParallelEnv().dev_id) \
if args.use_data_parallel else paddle.CUDAPlace(0)
with fluid.dygraph.guard(place):
genotype = eval("genotypes.%s" % args.arch)
model = Network(
C=args.init_channels,
num_classes=args.class_num,
layers=args.layers,
auxiliary=args.auxiliary,
genotype=genotype)
logger.info("param size = {:.6f}MB".format(
count_parameters_in_MB(model.parameters())))
device_num = paddle.distributed.parallel.ParallelEnv().nranks
step_per_epoch = int(args.trainset_num / (args.batch_size * device_num))
learning_rate = fluid.dygraph.CosineDecay(args.learning_rate,
step_per_epoch, args.epochs)
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=args.grad_clip)
optimizer = paddle.optimizer.Momentum(
learning_rate,
momentum=args.momentum,
regularization=fluid.regularizer.L2Decay(args.weight_decay),
parameter_list=model.parameters(),
grad_clip=clip)
if args.use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy)
train_loader = fluid.io.DataLoader.from_generator(
capacity=64,
use_double_buffer=True,
iterable=True,
return_list=True,
use_multiprocess=args.use_multiprocess)
valid_loader = fluid.io.DataLoader.from_generator(
capacity=64,
use_double_buffer=True,
iterable=True,
return_list=True,
use_multiprocess=args.use_multiprocess)
train_reader = reader.train_valid(
batch_size=args.batch_size,
is_train=True,
is_shuffle=True,
args=args)
valid_reader = reader.train_valid(
batch_size=args.batch_size,
is_train=False,
is_shuffle=False,
args=args)
if args.use_data_parallel:
train_reader = fluid.contrib.reader.distributed_batch_reader(
train_reader)
train_loader.set_batch_generator(train_reader, places=place)
valid_loader.set_batch_generator(valid_reader, places=place)
save_parameters = (not args.use_data_parallel) or (
args.use_data_parallel and
paddle.distributed.parallel.ParallelEnv().local_rank == 0)
best_acc = 0
for epoch in range(args.epochs):
drop_path_prob = args.drop_path_prob * epoch / args.epochs
logger.info('Epoch {}, lr {:.6f}'.format(
epoch, optimizer.current_step_lr()))
train_top1 = train(model, train_loader, optimizer, epoch,
drop_path_prob, args)
logger.info("Epoch {}, train_acc {:.6f}".format(epoch, train_top1))
valid_top1 = valid(model, valid_loader, epoch, args)
if valid_top1 > best_acc:
best_acc = valid_top1
if save_parameters:
paddle.save(model.state_dict(),
args.model_save_dir + "/best_model")
logger.info("Epoch {}, valid_acc {:.6f}, best_valid_acc {:.6f}".
format(epoch, valid_top1, best_acc))
genotype = eval("genotypes.%s" % args.arch)
model = Network(
C=args.init_channels,
num_classes=args.class_num,
layers=args.layers,
auxiliary=args.auxiliary,
genotype=genotype)
logger.info("param size = {:.6f}MB".format(
count_parameters_in_MB(model.parameters())))
device_num = paddle.distributed.parallel.ParallelEnv().nranks
learning_rate = paddle.optimizer.lr.CosineAnnealingDecay(args.learning_rate,
args.epochs / 2)
clip = paddle.nn.ClipGradByGlobalNorm(args.grad_clip)
optimizer = paddle.optimizer.Momentum(
learning_rate,
momentum=args.momentum,
regularization=paddle.regularizer.L2Decay(args.weight_decay),
parameter_list=model.parameters(),
grad_clip=clip)
if args.use_data_parallel:
strategy = paddle.distributed.init_parallel_env()
model = paddle.DataParallel(model, strategy)
train_loader = paddle.io.DataLoader.from_generator(
capacity=64,
use_double_buffer=True,
iterable=True,
return_list=True,
use_multiprocess=args.use_multiprocess)
valid_loader = paddle.io.DataLoader.from_generator(
capacity=64,
use_double_buffer=True,
iterable=True,
return_list=True,
use_multiprocess=args.use_multiprocess)
train_reader = reader.train_valid(
batch_size=args.batch_size, is_train=True, is_shuffle=True, args=args)
valid_reader = reader.train_valid(
batch_size=args.batch_size, is_train=False, is_shuffle=False, args=args)
if args.use_data_parallel:
train_reader = fluid.contrib.reader.distributed_batch_reader(
train_reader)
train_loader.set_batch_generator(train_reader, places=place)
valid_loader.set_batch_generator(valid_reader, places=place)
save_parameters = (not args.use_data_parallel) or (
args.use_data_parallel and
paddle.distributed.parallel.ParallelEnv().local_rank == 0)
best_acc = 0
for epoch in range(args.epochs):
drop_path_prob = args.drop_path_prob * epoch / args.epochs
logger.info('Epoch {}, lr {:.6f}'.format(epoch,
optimizer.current_step_lr()))
train_top1 = train(model, train_loader, optimizer, epoch,
drop_path_prob, args)
logger.info("Epoch {}, train_acc {:.6f}".format(epoch, train_top1))
valid_top1 = valid(model, valid_loader, epoch, args)
if valid_top1 > best_acc:
best_acc = valid_top1
if save_parameters:
paddle.save(model.state_dict(),
args.model_save_dir + "/best_model")
logger.info("Epoch {}, valid_acc {:.6f}, best_valid_acc {:.6f}".format(
epoch, valid_top1, best_acc))
if __name__ == '__main__':
......
......@@ -23,8 +23,8 @@ import logging
import argparse
import functools
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddleslim.common import AvgrageMeter, get_logger
from paddleslim.nas.darts import count_parameters_in_MB
......@@ -68,7 +68,7 @@ add_arg('use_data_parallel', ast.literal_eval, False, "The flag indicating whet
def cross_entropy_label_smooth(preds, targets, epsilon):
preds = paddle.nn.functional.softmax(preds)
targets_one_hot = fluid.one_hot(input=targets, depth=args.class_num)
targets_one_hot = paddle.nn.functional.one_hot(targets, args.class_num)
targets_smooth = paddle.nn.functional.label_smooth(
targets_one_hot, epsilon=epsilon, dtype="float32")
loss = paddle.nn.functional.cross_entropy(
......@@ -84,8 +84,8 @@ def train(model, train_reader, optimizer, epoch, args):
for step_id, data in enumerate(train_reader()):
image_np, label_np = data
image = to_variable(image_np)
label = to_variable(label_np)
image = paddle.to_tensor(image_np)
label = paddle.to_tensor(label_np)
label.stop_gradient = True
logits, logits_aux = model(image, True)
......@@ -130,8 +130,8 @@ def valid(model, valid_reader, epoch, args):
for step_id, data in enumerate(valid_reader()):
image_np, label_np = data
image = to_variable(image_np)
label = to_variable(label_np)
image = paddle.to_tensor(image_np)
label = paddle.to_tensor(label_np)
logits, _ = model(image, False)
prec1 = paddle.static.accuracy(input=logits, label=label, k=1)
prec5 = paddle.static.accuracy(input=logits, label=label, k=5)
......@@ -153,79 +153,72 @@ def main(args):
place = paddle.CUDAPlace(paddle.distributed.parallel.ParallelEnv().dev_id) \
if args.use_data_parallel else paddle.CUDAPlace(0)
with fluid.dygraph.guard(place):
genotype = eval("genotypes.%s" % args.arch)
model = Network(
C=args.init_channels,
num_classes=args.class_num,
layers=args.layers,
auxiliary=args.auxiliary,
genotype=genotype)
logger.info("param size = {:.6f}MB".format(
count_parameters_in_MB(model.parameters())))
device_num = paddle.distributed.parallel.ParallelEnv().nranks
step_per_epoch = int(args.trainset_num / (args.batch_size * device_num))
learning_rate = fluid.dygraph.ExponentialDecay(
args.learning_rate, step_per_epoch, args.decay_rate, staircase=True)
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=args.grad_clip)
optimizer = paddle.optimizer.Momentum(
learning_rate,
momentum=args.momentum,
regularization=fluid.regularizer.L2Decay(args.weight_decay),
parameter_list=model.parameters(),
grad_clip=clip)
if args.use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy)
train_loader = fluid.io.DataLoader.from_generator(
capacity=64,
use_double_buffer=True,
iterable=True,
return_list=True)
valid_loader = fluid.io.DataLoader.from_generator(
capacity=64,
use_double_buffer=True,
iterable=True,
return_list=True)
train_reader = fluid.io.batch(
reader.imagenet_reader(args.data_dir, 'train'),
batch_size=args.batch_size,
drop_last=True)
valid_reader = fluid.io.batch(
reader.imagenet_reader(args.data_dir, 'val'),
batch_size=args.batch_size)
if args.use_data_parallel:
train_reader = fluid.contrib.reader.distributed_batch_reader(
train_reader)
train_loader.set_sample_list_generator(train_reader, places=place)
valid_loader.set_sample_list_generator(valid_reader, places=place)
save_parameters = (not args.use_data_parallel) or (
args.use_data_parallel and
paddle.distributed.parallel.ParallelEnv().local_rank == 0)
best_top1 = 0
for epoch in range(args.epochs):
logger.info('Epoch {}, lr {:.6f}'.format(epoch, optimizer.get_lr()))
train_top1, train_top5 = train(model, train_loader, optimizer,
epoch, args)
logger.info("Epoch {}, train_top1 {:.6f}, train_top5 {:.6f}".format(
epoch, train_top1, train_top5))
valid_top1, valid_top5 = valid(model, valid_loader, epoch, args)
if valid_top1 > best_top1:
best_top1 = valid_top1
if save_parameters:
paddle.save(model.state_dict(),
args.model_save_dir + "/best_model")
logger.info(
"Epoch {}, valid_top1 {:.6f}, valid_top5 {:.6f}, best_valid_top1 {:6f}".
format(epoch, valid_top1, valid_top5, best_top1))
genotype = eval("genotypes.%s" % args.arch)
model = Network(
C=args.init_channels,
num_classes=args.class_num,
layers=args.layers,
auxiliary=args.auxiliary,
genotype=genotype)
logger.info("param size = {:.6f}MB".format(
count_parameters_in_MB(model.parameters())))
device_num = paddle.distributed.parallel.ParallelEnv().nranks
step_per_epoch = int(args.trainset_num / (args.batch_size * device_num))
learning_rate = paddle.optimizer.lr.ExponentialDecay(args.learning_rate,
args.decay_rate)
clip = paddle.nn.ClipGradByGlobalNorm(args.grad_clip)
optimizer = paddle.optimizer.Momentum(
learning_rate,
momentum=args.momentum,
regularization=paddle.regularizer.L2Decay(args.weight_decay),
parameter_list=model.parameters(),
grad_clip=clip)
if args.use_data_parallel:
strategy = paddle.distributed.init_parallel_env()
model = paddle.DataParallel(model, strategy)
train_loader = paddle.io.DataLoader.from_generator(
capacity=64, use_double_buffer=True, iterable=True, return_list=True)
valid_loader = paddle.io.DataLoader.from_generator(
capacity=64, use_double_buffer=True, iterable=True, return_list=True)
train_reader = paddle.batch(
reader.imagenet_reader(args.data_dir, 'train'),
batch_size=args.batch_size,
drop_last=True)
valid_reader = paddle.batch(
reader.imagenet_reader(args.data_dir, 'val'),
batch_size=args.batch_size)
if args.use_data_parallel:
train_reader = fluid.contrib.reader.distributed_batch_reader(
train_reader)
train_loader.set_sample_list_generator(train_reader, places=place)
valid_loader.set_sample_list_generator(valid_reader, places=place)
save_parameters = (not args.use_data_parallel) or (
args.use_data_parallel and
paddle.distributed.parallel.ParallelEnv().local_rank == 0)
best_top1 = 0
for epoch in range(args.epochs):
logger.info('Epoch {}, lr {:.6f}'.format(epoch, optimizer.get_lr()))
train_top1, train_top5 = train(model, train_loader, optimizer, epoch,
args)
logger.info("Epoch {}, train_top1 {:.6f}, train_top5 {:.6f}".format(
epoch, train_top1, train_top5))
valid_top1, valid_top5 = valid(model, valid_loader, epoch, args)
if valid_top1 > best_top1:
best_top1 = valid_top1
if save_parameters:
paddle.save(model.state_dict(),
args.model_save_dir + "/best_model")
logger.info(
"Epoch {}, valid_top1 {:.6f}, valid_top5 {:.6f}, best_valid_top1 {:6f}".
format(epoch, valid_top1, valid_top5, best_top1))
if __name__ == '__main__':
......
......@@ -22,7 +22,6 @@ import six
import numpy as np
import time
import paddle
import paddle.fluid as fluid
from paddle.fluid.framework import IrGraph
from paddle.framework import core
......@@ -244,7 +243,7 @@ class SampleTester(unittest.TestCase):
return outputs, acc1_avg, acc5_avg, fps_avg, latency_avg
def test_graph_transformation(self):
if not paddle.fluid.core.is_compiled_with_mkldnn():
if not paddle.framework.core.is_compiled_with_mkldnn():
return
infer_model_path = test_case_args.infer_model
......
......@@ -22,13 +22,8 @@ class PVANet():
def net(self, input, include_last_bn_relu=True, class_dim=1000):
conv1 = self._conv_bn_crelu(input, 16, 7, stride=2, name="conv1_1")
pool1 = fluid.layers.pool2d(
input=conv1,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max',
name='pool1')
pool1 = paddle.nn.functional.max_pool2d(
conv1, 3, stride=2, padding=1, name='pool1')
end_points = {}
conv2 = self._conv_stage(
......@@ -182,13 +177,8 @@ class PVANet():
paths.append(path_net)
if stride > 1:
path_net = fluid.layers.pool2d(
input,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max',
name=name + '_pool')
path_net = paddle.nn.functional.max_pool2d(
input, 3, stride=2, padding=1, name=name + '_pool')
path_net = self._conv_bn_relu(path_net, pool_path_outputs, 1,
name + '_poolproj')
paths.append(path_net)
......
......@@ -2,7 +2,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import math
__all__ = ["ResNet", "ResNet34", "ResNet50", "ResNet101", "ResNet152"]
......@@ -50,12 +49,7 @@ class ResNet():
stride=2,
act='relu',
name=prefix_name + conv1_name)
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
conv = paddle.nn.functional.max_pool2d(conv, 3, stride=2, padding=1)
if layers >= 50:
for block in range(len(depth)):
......@@ -74,8 +68,7 @@ class ResNet():
stride=2 if i == 0 and block != 0 else 1,
name=conv_name)
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
pool = paddle.nn.functional.adaptive_avg_pool2d(conv, 1)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
fc_name = fc_name if fc_name is None else prefix_name + fc_name
out = paddle.static.nn.fc(
......@@ -97,8 +90,7 @@ class ResNet():
is_first=block == i == 0,
name=conv_name)
pool = fluid.layers.pool2d(
input=conv, pool_type='avg', global_pooling=True)
pool = paddle.nn.functional.adaptive_avg_pool2d(conv, 1)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
fc_name = fc_name if fc_name is None else prefix_name + fc_name
out = paddle.static.nn.fc(
......
......@@ -19,7 +19,6 @@ from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
__all__ = [
"ResNet", "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd",
......@@ -80,12 +79,7 @@ class ResNet():
act='relu',
name='conv1_3')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
conv = paddle.nn.functional.max_pool2d(conv, 3, stride=2, padding=1)
if layers >= 50:
for block in range(len(depth)):
......@@ -114,8 +108,7 @@ class ResNet():
if_first=block == i == 0,
name=conv_name)
pool = fluid.layers.pool2d(
input=conv, pool_type='avg', global_pooling=True)
pool = paddle.nn.functional.adaptive_avg_pool2d(conv, 1)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = paddle.static.nn.fc(
......@@ -164,13 +157,8 @@ class ResNet():
groups=1,
act=None,
name=None):
pool = fluid.layers.pool2d(
input=input,
pool_size=2,
pool_stride=2,
pool_padding=0,
pool_type='avg',
ceil_mode=True)
pool = paddle.nn.functional.avg_pool2d(
input, 2, stride=2, padding=0, ceil_mode=True)
conv = paddle.static.nn.conv2d(
input=pool,
......
......@@ -17,7 +17,6 @@ import datetime
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.nn.initializer import KaimingUniform
......@@ -154,7 +153,7 @@ class SlimFaceNet():
param_attr=paddle.ParamAttr(
name='linear_conv1x1_weights',
initializer=KaimingUniform(),
regularizer=fluid.regularizer.L2Decay(4e-4)),
regularizer=paddle.regularizer.L2Decay(4e-4)),
bias_attr=False)
bn_name = 'linear_conv1x1_bn'
x = paddle.static.nn.batch_norm(
......@@ -233,8 +232,7 @@ class SlimFaceNet():
def se_block(self, input, num_out_filter, ratio=4, name=None):
num_mid_filter = int(num_out_filter // ratio)
pool = fluid.layers.pool2d(
input=input, pool_type='avg', global_pooling=True, use_cudnn=False)
paddle.nn.functional.adaptive_avg_pool2d(input, 1)
conv1 = paddle.static.nn.conv2d(
input=pool,
filter_size=1,
......@@ -247,7 +245,7 @@ class SlimFaceNet():
mode='channel',
param_attr=paddle.ParamAttr(
name=name + '_prelu',
regularizer=fluid.regularizer.L2Decay(0.0)))
regularizer=paddle.regularizer.L2Decay(0.0)))
conv2 = paddle.static.nn.conv2d(
input=conv1,
filter_size=1,
......@@ -293,7 +291,7 @@ class SlimFaceNet():
mode='channel',
param_attr=paddle.ParamAttr(
name=name + '_prelu',
regularizer=fluid.regularizer.L2Decay(0.0)))
regularizer=paddle.regularizer.L2Decay(0.0)))
else:
return bn
......@@ -307,12 +305,12 @@ class SlimFaceNet():
name='weight_norm',
attr=paddle.ParamAttr(
initializer=paddle.nn.initializer.Xavier(),
regularizer=fluid.regularizer.L2Decay(4e-4)))
regularizer=paddle.regularizer.L2Decay(4e-4)))
weight_norm = paddle.sqrt(paddle.sum(paddle.square(weight), dim=1))
weight = paddle.divide(weight, weight_norm, axis=0)
weight = paddle.transpose(weight, perm=[1, 0])
cosine = fluid.layers.mul(input, weight)
cosine = paddle.matmul(input, weight)
sine = paddle.sqrt(1.0 - paddle.square(cosine))
cos_m = math.cos(m)
......@@ -329,7 +327,7 @@ class SlimFaceNet():
else:
pass
one_hot = fluid.layers.one_hot(input=label, depth=out_dim)
one_hot = paddle.nn.functional.one_hot(label, out_dim)
output = paddle.multiply(one_hot, phi) + paddle.multiply(
(1.0 - one_hot), cosine)
output = output * s
......
......@@ -15,16 +15,13 @@
import os
import numpy as np
import paddle
import paddle.fluid as F
import paddle.fluid.dygraph as FD
import paddle.fluid.layers as L
def compute_neuron_head_importance(args, model, dev_ds, place, model_cfg):
n_layers, n_heads = model_cfg['num_hidden_layers'], model_cfg[
'num_attention_heads']
head_importance = L.zeros(shape=[n_layers, n_heads], dtype='float32')
head_mask = L.ones(shape=[n_layers, n_heads], dtype='float32')
head_importance = paddle.zeros(shape=[n_layers, n_heads], dtype='float32')
head_mask = paddle.ones(shape=[n_layers, n_heads], dtype='float32')
head_mask.stop_gradient = False
intermediate_weight = []
......@@ -60,7 +57,8 @@ def compute_neuron_head_importance(args, model, dev_ds, place, model_cfg):
num_layers=model_cfg['num_hidden_layers'])
loss = out[0]
loss.backward()
head_importance += L.abs(FD.to_variable(head_mask.gradient()))
head_importance += paddle.abs(
paddle.to_tensor(head_mask.gradient()))
for w1, b1, w2, current_importance in zip(
intermediate_weight, intermediate_bias, output_weight,
......@@ -78,34 +76,36 @@ def reorder_neuron_head(model, head_importance, neuron_importance):
# reorder heads and ffn neurons
for layer, current_importance in enumerate(neuron_importance):
# reorder heads
idx = L.argsort(head_importance[layer], descending=True)[-1]
idx = paddle.argsort(head_importance[layer], descending=True)[-1]
#model.encoder_stack.block[layer].attn.reorder_heads(idx)
reorder_head(model.encoder_stack.block[layer].attn, idx)
# reorder neurons
idx = L.argsort(FD.to_variable(current_importance), descending=True)[-1]
idx = paddle.argsort(
paddle.to_tensor(current_importance), descending=True)[-1]
#model.encoder_stack.block[layer].ffn.reorder_neurons(idx)
reorder_neuron(model.encoder_stack.block[layer].ffn, idx)
def reorder_head(layer, idx):
n, a = layer.n_head, layer.d_key
index = L.reshape(
L.index_select(
L.reshape(
L.arange(
index = paddle.reshape(
paddle.index_select(
paddle.reshape(
paddle.arange(
0, n * a, dtype='int64'), shape=[n, a]),
idx,
dim=0),
axis=0),
shape=[-1])
def reorder_head_matrix(linearLayer, index, dim=1):
W = L.index_select(linearLayer.weight, index, dim=dim).detach()
W = paddle.index_select(linearLayer.weight, index, axis=dim).detach()
if linearLayer.bias is not None:
if dim == 0:
b = L.assign(linearLayer.bias).detach()
b = paddle.assign(linearLayer.bias).detach()
else:
b = L.assign(L.index_select(
linearLayer.bias, index, dim=0)).detach()
b = paddle.assign(
L.index_select(
linearLayer.bias, index, dim=0)).detach()
linearLayer.weight.stop_gradient = True
linearLayer.weight.set_value(W)
......@@ -127,13 +127,14 @@ def reorder_head(layer, idx):
def reorder_neuron(layer, index, dim=0):
def reorder_neurons_matrix(linearLayer, index, dim):
W = L.index_select(linearLayer.weight, index, dim=dim).detach()
W = paddle.index_select(linearLayer.weight, index, axis=dim).detach()
if linearLayer.bias is not None:
if dim == 0:
b = L.assign(linearLayer.bias).detach()
b = paddle.assign(linearLayer.bias).detach()
else:
b = L.assign(L.index_select(
linearLayer.bias, index, dim=0)).detach()
b = paddle.assign(
L.index_select(
linearLayer.bias, index, dim=0)).detach()
linearLayer.weight.stop_gradient = True
linearLayer.weight.set_value(W)
linearLayer.weight.stop_gradient = False
......
......@@ -32,9 +32,6 @@ else:
from pathlib import Path
import paddle
import paddle.fluid.dygraph as D
import paddle.fluid as F
import paddle.fluid.layers as L
from ernie.file_utils import _fetch_from_remote
from ernie.modeling_ernie import AttentionLayer, ErnieBlock, ErnieModel, ErnieEncoderStack, ErnieModelForSequenceClassification
......@@ -66,8 +63,8 @@ def _attn_forward(self,
cache = (k, v)
if past_cache is not None:
cached_k, cached_v = past_cache
k = L.concat([cached_k, k], 1)
v = L.concat([cached_v, v], 1)
k = paddle.concat([cached_k, k], 1)
v = paddle.concat([cached_v, v], 1)
if hasattr(self.q, 'fn') and self.q.fn.cur_config['expand_ratio'] != None:
n_head = int(self.n_head * self.q.fn.cur_config['expand_ratio'])
......@@ -84,19 +81,19 @@ def _attn_forward(self,
paddle.reshape(v, [0, 0, n_head, v.shape[-1] // n_head]),
[0, 2, 1, 3]) #[batch, head, seq, dim]
q = L.scale(q, scale=self.d_key**-0.5)
score = L.matmul(q, k, transpose_y=True)
q = paddle.scale(q, scale=self.d_key**-0.5)
score = paddle.matmul(q, k, transpose_y=True)
if attn_bias is not None:
score += attn_bias
score = L.softmax(score, use_cudnn=True)
score = paddle.nn.functional.softmax(score, use_cudnn=True)
score = self.dropout(score)
if head_mask is not None:
score = score * head_mask
out = L.matmul(score, v)
out = L.transpose(out, [0, 2, 1, 3])
out = L.reshape(out, [0, 0, out.shape[2] * out.shape[3]])
out = paddle.matmul(score, v)
out = paddle.transpose(out, [0, 2, 1, 3])
out = paddle.reshape(out, [0, 0, out.shape[2] * out.shape[3]])
out = self.o(out)
return out, cache
......@@ -188,23 +185,25 @@ def _ernie_model_forward(self,
) == 2, 'expect src_ids.shape = [batch, sequecen], got %s' % (
repr(src_ids.shape))
assert attn_bias is not None if past_cache else True, 'if `past_cache` is specified; attn_bias should not be None'
d_batch = L.shape(src_ids)[0]
d_seqlen = L.shape(src_ids)[1]
d_batch = paddle.shape(src_ids)[0]
d_seqlen = paddle.shape(src_ids)[1]
if pos_ids is None:
pos_ids = L.reshape(L.range(0, d_seqlen, 1, dtype='int32'), [1, -1])
pos_ids = L.cast(pos_ids, 'int64')
pos_ids = paddle.reshape(
L.range(
0, d_seqlen, 1, dtype='int32'), [1, -1])
pos_ids = paddle.cast(pos_ids, 'int64')
if attn_bias is None:
if input_mask is None:
input_mask = L.cast(src_ids != 0, 'float32')
input_mask = paddle.cast(src_ids != 0, 'float32')
assert len(input_mask.shape) == 2
input_mask = L.unsqueeze(input_mask, axes=[-1])
attn_bias = L.matmul(input_mask, input_mask, transpose_y=True)
input_mask = paddle.unsqueeze(input_mask, axis=[-1])
attn_bias = paddle.matmul(input_mask, input_mask, transpose_y=True)
if use_causal_mask:
sequence = L.reshape(
L.range(
sequence = paddle.reshape(
paddle.arange(
0, d_seqlen, 1, dtype='float32') + 1., [1, 1, -1, 1])
causal_mask = L.cast(
(L.matmul(
causal_mask = paddle.cast(
(paddle.matmul(
sequence, 1. / sequence, transpose_y=True) >= 1.),
'float32')
attn_bias *= causal_mask
......@@ -213,21 +212,23 @@ def _ernie_model_forward(self,
attn_bias.shape
) == 3, 'expect attn_bias tobe rank 3, got %r' % attn_bias.shape
attn_bias = (1. - attn_bias) * -10000.0
attn_bias = L.unsqueeze(attn_bias, [1])
attn_bias = paddle.unsqueeze(attn_bias, [1])
attn_bias.stop_gradient = True
if sent_ids is None:
sent_ids = L.zeros_like(src_ids)
sent_ids = paddle.zeros_like(src_ids)
if head_mask is not None:
if len(head_mask.shape) == 1:
head_mask = L.unsqueeze(
L.unsqueeze(L.unsqueeze(L.unsqueeze(head_mask, 0), 0), -1), -1)
head_mask = L.expand(
head_mask, expand_times=[num_layers, 1, 1, 1, 1])
head_mask = paddle.unsqueeze(
paddle.unsqueeze(
paddle.unsqueeze(paddle.unsqueeze(head_mask, 0), 0), -1),
-1)
head_mask = paddle.expand(
head_mask, shape=[head_mask.shape[0] * num_layers, 1, 1, 1, 1])
elif len(head_mask.shape) == 2:
head_mask = L.unsqueeze(
L.unsqueeze(L.unsqueeze(head_mask, 1), -1), -1)
head_mask = paddle.unsqueeze(
paddle.unsqueeze(paddle.unsqueeze(head_mask, 1), -1), -1)
else:
head_mask = [None] * num_layers
......@@ -274,8 +275,8 @@ def _seqence_forward(self, *args, **kwargs):
if labels is not None:
if len(labels.shape) == 1:
labels = L.reshape(labels, [-1, 1])
loss = L.softmax_with_cross_entropy(logits, labels)
labels = paddle.reshape(labels, [-1, 1])
loss = paddle.nn.functional.softmax_with_cross_entropy(logits, labels)
loss = paddle.mean(loss)
else:
loss = None
......
......@@ -19,12 +19,10 @@ from __future__ import unicode_literals
from __future__ import absolute_import
import re
import paddle.fluid as F
import paddle.fluid.layers as L
import paddle.fluid.dygraph as D
import paddle
class AdamW(F.optimizer.AdamOptimizer):
class AdamW(padlde.optimizer.Adam):
"""AdamW object for dygraph"""
def __init__(self, *args, **kwargs):
......@@ -39,5 +37,6 @@ class AdamW(F.optimizer.AdamOptimizer):
super(AdamW, self).apply_optimize(loss, startup_program, params_grads)
for p, g in params_grads:
if not self.pat.match(p.name):
with D.no_grad():
L.assign(p * (1. - self.wd * self.current_step_lr()), p)
with paddle.no_grad():
paddle.assign(p * (1. - self.wd * self.current_step_lr()),
p)
......@@ -26,9 +26,6 @@ import logging
import argparse
import paddle
import paddle.fluid as F
import paddle.fluid.dygraph as FD
import paddle.fluid.layers as L
from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig, utils
from propeller import log
......@@ -44,9 +41,9 @@ from paddleslim.nas.ofa.convert_super import Convert, supernet
def soft_cross_entropy(inp, target):
inp_likelihood = L.log_softmax(inp, axis=-1)
target_prob = L.softmax(target, axis=-1)
return -1. * L.mean(paddle.sum(inp_likelihood * target_prob, dim=-1))
inp_likelihood = paddle.nn.functional.log_softmax(inp, axis=-1)
target_prob = paddle.nn.functional.softmax(target, axis=-1)
return -1. * paddle.mean(paddle.sum(inp_likelihood * target_prob, dim=-1))
if __name__ == '__main__':
......@@ -194,200 +191,193 @@ if __name__ == '__main__':
dev_ds.data_shapes = shapes
dev_ds.data_types = types
place = F.CUDAPlace(0)
with FD.guard(place):
model = ErnieModelForSequenceClassification.from_pretrained(
args.from_pretrained, num_labels=3, name='')
setattr(model, 'return_additional_info', True)
origin_weights = {}
for name, param in model.named_parameters():
origin_weights[name] = param
sp_config = supernet(expand_ratio=args.width_mult_list)
model = Convert(sp_config).convert(model)
utils.set_state_dict(model, origin_weights)
del origin_weights
teacher_model = ErnieModelForSequenceClassification.from_pretrained(
args.from_pretrained, num_labels=3, name='teacher')
setattr(teacher_model, 'return_additional_info', True)
default_run_config = {
'n_epochs': [[4 * args.epoch], [6 * args.epoch]],
'init_learning_rate': [[args.lr], [args.lr]],
'elastic_depth': args.depth_mult_list,
'dynamic_batch_size': [[1, 1], [1, 1]]
}
run_config = RunConfig(**default_run_config)
model_cfg = get_config(args.from_pretrained)
default_distill_config = {'teacher_model': teacher_model}
distill_config = DistillConfig(**default_distill_config)
ofa_model = OFA(model,
run_config,
distill_config=distill_config,
elastic_order=['width', 'depth'])
### suppose elastic width first
if args.reorder_weight:
head_importance, neuron_importance = compute_neuron_head_importance(
args, ofa_model.model, dev_ds, place, model_cfg)
reorder_neuron_head(ofa_model.model, head_importance,
neuron_importance)
#################
if args.init_checkpoint is not None:
log.info('loading checkpoint from %s' % args.init_checkpoint)
sd, _ = FD.load_dygraph(args.init_checkpoint)
ofa_model.model.set_dict(sd)
g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
if args.use_lr_decay:
opt = AdamW(
learning_rate=LinearDecay(args.lr,
int(args.warmup_proportion *
args.max_steps), args.max_steps),
parameter_list=ofa_model.model.parameters(),
weight_decay=args.wd,
grad_clip=g_clip)
place = paddle.CUDAPlace(0)
model = ErnieModelForSequenceClassification.from_pretrained(
args.from_pretrained, num_labels=3, name='')
setattr(model, 'return_additional_info', True)
origin_weights = {}
for name, param in model.named_parameters():
origin_weights[name] = param
sp_config = supernet(expand_ratio=args.width_mult_list)
model = Convert(sp_config).convert(model)
utils.set_state_dict(model, origin_weights)
del origin_weights
teacher_model = ErnieModelForSequenceClassification.from_pretrained(
args.from_pretrained, num_labels=3, name='teacher')
setattr(teacher_model, 'return_additional_info', True)
default_run_config = {
'n_epochs': [[4 * args.epoch], [6 * args.epoch]],
'init_learning_rate': [[args.lr], [args.lr]],
'elastic_depth': args.depth_mult_list,
'dynamic_batch_size': [[1, 1], [1, 1]]
}
run_config = RunConfig(**default_run_config)
model_cfg = get_config(args.from_pretrained)
default_distill_config = {'teacher_model': teacher_model}
distill_config = DistillConfig(**default_distill_config)
ofa_model = OFA(model,
run_config,
distill_config=distill_config,
elastic_order=['width', 'depth'])
### suppose elastic width first
if args.reorder_weight:
head_importance, neuron_importance = compute_neuron_head_importance(
args, ofa_model.model, dev_ds, place, model_cfg)
reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)
#################
if args.init_checkpoint is not None:
log.info('loading checkpoint from %s' % args.init_checkpoint)
sd, _ = paddle.load(args.init_checkpoint)
ofa_model.model.set_dict(sd)
g_clip = paddle.nn.ClipGradByGlobalNorm(1.0) #experimental
if args.use_lr_decay:
opt = AdamW(
learning_rate=LinearDecay(args.lr,
int(args.warmup_proportion *
args.max_steps), args.max_steps),
parameter_list=ofa_model.model.parameters(),
weight_decay=args.wd,
grad_clip=g_clip)
else:
opt = AdamW(
args.lr,
parameter_list=ofa_model.model.parameters(),
weight_decay=args.wd,
grad_clip=g_clip)
for epoch in range(max(run_config.n_epochs[-1])):
ofa_model.set_epoch(epoch)
if epoch <= int(max(run_config.n_epochs[0])):
ofa_model.set_task('width')
depth_mult_list = [1.0]
else:
opt = AdamW(
args.lr,
parameter_list=ofa_model.model.parameters(),
weight_decay=args.wd,
grad_clip=g_clip)
for epoch in range(max(run_config.n_epochs[-1])):
ofa_model.set_epoch(epoch)
if epoch <= int(max(run_config.n_epochs[0])):
ofa_model.set_task('width')
depth_mult_list = [1.0]
else:
ofa_model.set_task('depth')
depth_mult_list = run_config.elastic_depth
for step, d in enumerate(
tqdm(
train_ds.start(place), desc='training')):
ids, sids, label = d
accumulate_gradients = dict()
for param in opt._parameter_list:
accumulate_gradients[param.name] = 0.0
ofa_model.set_task('depth')
depth_mult_list = run_config.elastic_depth
for step, d in enumerate(tqdm(train_ds.start(place), desc='training')):
ids, sids, label = d
accumulate_gradients = dict()
for param in opt._parameter_list:
accumulate_gradients[param.name] = 0.0
for depth_mult in depth_mult_list:
for width_mult in args.width_mult_list:
net_config = utils.dynabert_config(
ofa_model, width_mult, depth_mult=depth_mult)
ofa_model.set_net_config(net_config)
student_output, teacher_output = ofa_model(
ids,
sids,
labels=label,
num_layers=model_cfg['num_hidden_layers'])
loss, student_logit, student_reps = student_output[
0], student_output[1], student_output[2]['hiddens']
teacher_logit, teacher_reps = teacher_output[
1], teacher_output[2]['hiddens']
if ofa_model.task == 'depth':
depth_mult = ofa_model.current_config['depth']
depth = round(model_cfg['num_hidden_layers'] *
depth_mult)
kept_layers_index = []
for i in range(1, depth + 1):
kept_layers_index.append(
math.floor(i / depth_mult) - 1)
if mode == 'classification':
logit_loss = soft_cross_entropy(
student_logit, teacher_logit.detach())
else:
logit_loss = 0.0
### hidden_states distillation loss
rep_loss = 0.0
for stu_rep, tea_rep in zip(
student_reps,
list(teacher_reps[i]
for i in kept_layers_index)):
tmp_loss = paddle.nn.functional.mse_loss(
stu_rep, tea_rep.detach())
rep_loss += tmp_loss
loss = args.width_lambda1 * logit_loss + args.width_lambda2 * rep_loss
else:
### logit distillation loss
if mode == 'classification':
logit_loss = soft_cross_entropy(
student_logit, teacher_logit.detach())
else:
logit_loss = 0.0
### hidden_states distillation loss
rep_loss = 0.0
for stu_rep, tea_rep in zip(student_reps, teacher_reps):
tmp_loss = paddle.nn.functional.mse_loss(
stu_rep, tea_rep.detach())
rep_loss += tmp_loss
loss = args.width_lambda1 * logit_loss + args.width_lambda2 * rep_loss
if step % 10 == 0:
print('train loss %.5f lr %.3e' %
(loss.numpy(), opt.current_step_lr()))
loss.backward()
param_grads = opt.backward(loss)
for param in opt._parameter_list:
accumulate_gradients[param.name] += param.gradient()
for k, v in param_grads:
assert k.name in accumulate_gradients.keys(
), "{} not in accumulate_gradients".format(k.name)
v.set_value(accumulate_gradients[k.name])
opt.apply_optimize(
loss, startup_program=None, params_grads=param_grads)
ofa_model.model.clear_gradients()
if step % 100 == 0:
for depth_mult in depth_mult_list:
for width_mult in args.width_mult_list:
net_config = utils.dynabert_config(
ofa_model, width_mult, depth_mult=depth_mult)
ofa_model.set_net_config(net_config)
student_output, teacher_output = ofa_model(
ids,
sids,
labels=label,
num_layers=model_cfg['num_hidden_layers'])
loss, student_logit, student_reps = student_output[
0], student_output[1], student_output[2]['hiddens']
teacher_logit, teacher_reps = teacher_output[
1], teacher_output[2]['hiddens']
if ofa_model.task == 'depth':
depth_mult = ofa_model.current_config['depth']
depth = round(model_cfg['num_hidden_layers'] *
depth_mult)
kept_layers_index = []
for i in range(1, depth + 1):
kept_layers_index.append(
math.floor(i / depth_mult) - 1)
if mode == 'classification':
logit_loss = soft_cross_entropy(
student_logit, teacher_logit.detach())
else:
logit_loss = 0.0
### hidden_states distillation loss
rep_loss = 0.0
for stu_rep, tea_rep in zip(
student_reps,
list(teacher_reps[i]
for i in kept_layers_index)):
tmp_loss = L.mse_loss(stu_rep, tea_rep.detach())
rep_loss += tmp_loss
loss = args.width_lambda1 * logit_loss + args.width_lambda2 * rep_loss
else:
### logit distillation loss
if mode == 'classification':
logit_loss = soft_cross_entropy(
student_logit, teacher_logit.detach())
else:
logit_loss = 0.0
### hidden_states distillation loss
rep_loss = 0.0
for stu_rep, tea_rep in zip(student_reps,
teacher_reps):
tmp_loss = L.mse_loss(stu_rep, tea_rep.detach())
rep_loss += tmp_loss
loss = args.width_lambda1 * logit_loss + args.width_lambda2 * rep_loss
if step % 10 == 0:
print('train loss %.5f lr %.3e' %
(loss.numpy(), opt.current_step_lr()))
loss.backward()
param_grads = opt.backward(loss)
for param in opt._parameter_list:
accumulate_gradients[param.name] += param.gradient()
for k, v in param_grads:
assert k.name in accumulate_gradients.keys(
), "{} not in accumulate_gradients".format(k.name)
v.set_value(accumulate_gradients[k.name])
opt.apply_optimize(
loss, startup_program=None, params_grads=param_grads)
ofa_model.model.clear_gradients()
if step % 100 == 0:
for depth_mult in depth_mult_list:
for width_mult in args.width_mult_list:
net_config = utils.dynabert_config(
ofa_model, width_mult, depth_mult=depth_mult)
ofa_model.set_net_config(net_config)
acc = []
tea_acc = []
with FD.base._switch_tracer_mode_guard_(
is_train=False):
ofa_model.model.eval()
for step, d in enumerate(
tqdm(
dev_ds.start(place),
desc='evaluating %d' % epoch)):
ids, sids, label = d
[loss, logits,
_], [_, tea_logits, _] = ofa_model(
ids,
sids,
labels=label,
num_layers=model_cfg[
'num_hidden_layers'])
a = L.argmax(logits, -1) == label
acc.append(a.numpy())
ta = L.argmax(tea_logits, -1) == label
tea_acc.append(ta.numpy())
ofa_model.model.train()
print(
'width_mult: %f, depth_mult: %f: acc %.5f, teacher acc %.5f'
% (width_mult, depth_mult,
np.concatenate(acc).mean(),
np.concatenate(tea_acc).mean()))
acc = []
tea_acc = []
ofa_model.model.eval()
for step, d in enumerate(
tqdm(
dev_ds.start(place),
desc='evaluating %d' % epoch)):
ids, sids, label = d
[loss, logits, _], [_, tea_logits, _] = ofa_model(
ids,
sids,
labels=label,
num_layers=model_cfg['num_hidden_layers'])
a = paddle.argmax(logits, -1) == label
acc.append(a.numpy())
ta = paddle.argmax(tea_logits, -1) == label
tea_acc.append(ta.numpy())
ofa_model.model.train()
print(
'width_mult: %f, depth_mult: %f: acc %.5f, teacher acc %.5f'
% (width_mult, depth_mult,
np.concatenate(acc).mean(),
np.concatenate(tea_acc).mean()))
if args.save_dir is not None:
if not os.path.exists(args.save_dir):
os.makedirs(args.save_dir)
F.save_dygraph(ofa_model.model.state_dict(), args.save_dir)
paddle.save(ofa_model.model.state_dict(), args.save_dir)
......@@ -107,8 +107,8 @@ def test_ofa():
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape(-1, 1)
img = paddle.dygraph.to_variable(dy_x_data)
label = paddle.dygraph.to_variable(y_data)
img = paddle.to_tensor(dy_x_data)
label = paddle.to_tensor(y_data)
label.stop_gradient = True
for model_no in range(run_config.dynamic_batch_size[idx]):
......
......@@ -4,11 +4,11 @@ from webbrowser import get
import paddle
from paddle import tensor
from paddle.autograd import PyLayer
from paddle.fluid import layers
from paddle.nn import functional as F
from paddle.nn.layer.common import Linear, Embedding
from paddle.nn.layer.transformer import MultiHeadAttention, _convert_attention_mask
class BinaryQuantizer(PyLayer):
@staticmethod
def forward(ctx, input):
......@@ -24,6 +24,7 @@ class BinaryQuantizer(PyLayer):
grad_input[input <= -1] = 0
return grad_input.clone()
class ZMeanBinaryQuantizer(PyLayer):
@staticmethod
def forward(ctx, input):
......@@ -39,43 +40,86 @@ class ZMeanBinaryQuantizer(PyLayer):
grad_input[input <= -1] = 0
return grad_input.clone()
class BiLinear(Linear):
def __init__(self, in_features, out_features, weight_attr=None, bias_attr=None, name=None):
super(BiLinear, self).__init__(in_features, out_features, weight_attr=weight_attr, bias_attr=bias_attr, name=name)
def __init__(self,
in_features,
out_features,
weight_attr=None,
bias_attr=None,
name=None):
super(BiLinear, self).__init__(
in_features,
out_features,
weight_attr=weight_attr,
bias_attr=bias_attr,
name=name)
def forward(self, input):
scaling_factor = paddle.mean(self.weight.abs(), axis=1).unsqueeze(1).detach()
real_weights = self.weight - paddle.mean(self.weight, axis=-1).unsqueeze(-1)
scaling_factor = paddle.mean(
self.weight.abs(), axis=1).unsqueeze(1).detach()
real_weights = self.weight - paddle.mean(
self.weight, axis=-1).unsqueeze(-1)
binary_weights_no_grad = scaling_factor * paddle.sign(real_weights)
cliped_weights = paddle.clip(real_weights, -1.0, 1.0)
weight = binary_weights_no_grad.detach() - cliped_weights.detach() + cliped_weights
weight = binary_weights_no_grad.detach() - cliped_weights.detach(
) + cliped_weights
binary_input_no_grad = paddle.sign(input)
cliped_input = paddle.clip(input, -1.0, 1.0)
ba = binary_input_no_grad.detach() - cliped_input.detach() + cliped_input
ba = binary_input_no_grad.detach() - cliped_input.detach(
) + cliped_input
out = F.linear(x=ba, weight=weight, bias=self.bias, name=self.name)
return out
class BiEmbedding(Embedding):
def __init__(self, num_embeddings, embedding_dim, padding_idx=None, sparse=False, weight_attr=None, name=None):
super(BiEmbedding, self).__init__(num_embeddings, embedding_dim, padding_idx, sparse, weight_attr, name)
def __init__(self,
num_embeddings,
embedding_dim,
padding_idx=None,
sparse=False,
weight_attr=None,
name=None):
super(BiEmbedding,
self).__init__(num_embeddings, embedding_dim, padding_idx, sparse,
weight_attr, name)
def forward(self, x):
scaling_factor = paddle.mean(self.weight.abs(), axis=1, keepdim=True)
scaling_factor = scaling_factor.detach()
real_weights = self.weight - paddle.mean(self.weight, axis=-1, keepdim=True)
real_weights = self.weight - paddle.mean(
self.weight, axis=-1, keepdim=True)
binary_weights_no_grad = scaling_factor * paddle.sign(real_weights)
cliped_weights = paddle.clip(real_weights, -1.0, 1.0)
weight = binary_weights_no_grad.detach() - cliped_weights.detach() + cliped_weights
return F.embedding(x, weight=weight, padding_idx=self._padding_idx, sparse=self._sparse, name=self._name)
weight = binary_weights_no_grad.detach() - cliped_weights.detach(
) + cliped_weights
return F.embedding(
x,
weight=weight,
padding_idx=self._padding_idx,
sparse=self._sparse,
name=self._name)
class BiMultiHeadAttention(MultiHeadAttention):
# fork from paddle.nn.layer.transformer.MultiHeadAttention
Cache = collections.namedtuple("Cache", ["k", "v"])
StaticCache = collections.namedtuple("StaticCache", ["k", "v"])
def __init__(self, embed_dim, num_heads, dropout=0., kdim=None, vdim=None, need_weights=False, weight_attr=None, bias_attr=None):
super(BiMultiHeadAttention, self).__init__(embed_dim, num_heads, dropout, kdim, vdim, need_weights, weight_attr, bias_attr)
def __init__(self,
embed_dim,
num_heads,
dropout=0.,
kdim=None,
vdim=None,
need_weights=False,
weight_attr=None,
bias_attr=None):
super(BiMultiHeadAttention,
self).__init__(embed_dim, num_heads, dropout, kdim, vdim,
need_weights, weight_attr, bias_attr)
def forward(self, query, key=None, value=None, attn_mask=None, cache=None):
key = query if key is None else key
......@@ -85,14 +129,12 @@ class BiMultiHeadAttention(MultiHeadAttention):
q, k, v = self._prepare_qkv(query, key, value, cache)
else:
q, k, v, cache = self._prepare_qkv(query, key, value, cache)
q = BinaryQuantizer.apply(q)
k = BinaryQuantizer.apply(k)
# scale dot product attention
# TODO(guosheng): use tensor.matmul, however it doesn't support `alpha`
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
product = paddle.matmul(x=q, y=k, transpose_y=True)
product = paddle.scale(product, scale=self.head_dim**-0.5)
if attn_mask is not None:
# Support bool or int mask
attn_mask = _convert_attention_mask(attn_mask, product.dtype)
......@@ -123,17 +165,14 @@ class BiMultiHeadAttention(MultiHeadAttention):
outs.append(cache)
return out if len(outs) == 1 else tuple(outs)
def _to_bi_function(model):
for name, layer in model.named_children():
if isinstance(layer, MultiHeadAttention):
new_layer = BiMultiHeadAttention(layer.embed_dim,
layer.num_heads,
layer.dropout,
layer.kdim,
layer.vdim,
layer.need_weights,
layer.q_proj._weight_attr,
layer.q_proj._bias_attr)
new_layer = BiMultiHeadAttention(
layer.embed_dim, layer.num_heads, layer.dropout, layer.kdim,
layer.vdim, layer.need_weights, layer.q_proj._weight_attr,
layer.q_proj._bias_attr)
new_layer.q_proj = layer.q_proj
new_layer.k_proj = layer.k_proj
new_layer.v_proj = layer.v_proj
......@@ -141,27 +180,30 @@ def _to_bi_function(model):
model._sub_layers[name] = new_layer
elif isinstance(layer, Embedding):
if name != "word_embeddings": continue
new_layer = BiEmbedding(layer._num_embeddings,
layer._embedding_dim,
layer._padding_idx,
layer._sparse,
layer._weight_attr,
layer._name)
new_layer = BiEmbedding(layer._num_embeddings, layer._embedding_dim,
layer._padding_idx, layer._sparse,
layer._weight_attr, layer._name)
new_layer.weight = layer.weight
model._sub_layers[name] = new_layer
elif isinstance(layer, Linear):
if name == "classifier": continue
new_layer = BiLinear(layer.weight.shape[0],
layer.weight.shape[1],
layer._weight_attr,
layer._bias_attr,
layer.name)
new_layer = BiLinear(layer.weight.shape[0], layer.weight.shape[1],
layer._weight_attr, layer._bias_attr,
layer.name)
new_layer.weight = layer.weight
new_layer.bias = layer.bias
model._sub_layers[name] = new_layer
import math
def _MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=None, cache=None):
def _MultiHeadAttention_forward(self,
query,
key=None,
value=None,
attn_mask=None,
cache=None):
key = query if key is None else key
value = query if value is None else value
# compute q ,k ,v
......@@ -169,18 +211,16 @@ def _MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=Non
q, k, v = self._prepare_qkv(query, key, value, cache)
else:
q, k, v, cache = self._prepare_qkv(query, key, value, cache)
# distill qxq
query_scores = paddle.matmul(q, tensor.transpose(x=q, perm=[0, 1, 3, 2]))
query_scores = query_scores / math.sqrt(self.head_dim)
# distill kxk
key_scores = paddle.matmul(k, tensor.transpose(x=k, perm=[0, 1, 3, 2]))
key_scores = key_scores / math.sqrt(self.head_dim)
# scale dot product attention
# TODO(guosheng): use tensor.matmul, however it doesn't support `alpha`
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
product = paddle.matmul(x=q, y=k, transpose_y=True)
product = paddle.scale(product, scale=self.head_dim**-0.5)
if attn_mask is not None:
# Support bool or int mask
attn_mask = _convert_attention_mask(attn_mask, product.dtype)
......@@ -192,7 +232,7 @@ def _MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=Non
self.dropout,
training=self.training,
mode="upscale_in_train")
# distil vxv
value_scores = paddle.matmul(v, tensor.transpose(x=v, perm=[0, 1, 3, 2]))
value_scores = value_scores / math.sqrt(self.head_dim)
......@@ -210,13 +250,19 @@ def _MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=Non
outs.append(weights)
if cache is not None:
outs.append(cache)
self.query_scores = query_scores
self.key_scores = key_scores
self.value_scores = value_scores
return out if len(outs) == 1 else tuple(outs)
def _Bi_MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=None, cache=None):
def _Bi_MultiHeadAttention_forward(self,
query,
key=None,
value=None,
attn_mask=None,
cache=None):
key = query if key is None else key
value = query if value is None else value
# compute q ,k ,v
......@@ -224,25 +270,24 @@ def _Bi_MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=
q, k, v = self._prepare_qkv(query, key, value, cache)
else:
q, k, v, cache = self._prepare_qkv(query, key, value, cache)
# distill qxq
# distill qxq
query_scores = paddle.matmul(q, tensor.transpose(x=q, perm=[0, 1, 3, 2]))
query_scores = query_scores / math.sqrt(self.head_dim)
# distill kxk
key_scores = paddle.matmul(k, tensor.transpose(x=k, perm=[0, 1, 3, 2]))
key_scores = key_scores / math.sqrt(self.head_dim)
q = BinaryQuantizer.apply(q)
k = BinaryQuantizer.apply(k)
# scale dot product attention
# TODO(guosheng): use tensor.matmul, however it doesn't support `alpha`
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
product = paddle.matmul(x=q, y=k, transpose_y=True)
product = paddle.scale(product, scale=self.head_dim**-0.5)
if attn_mask is not None:
# Support bool or int mask
attn_mask = _convert_attention_mask(attn_mask, product.dtype)
product = product + attn_mask
# weights = F.softmax(product)
weights = product
if self.dropout:
......@@ -251,7 +296,7 @@ def _Bi_MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=
self.dropout,
training=self.training,
mode="upscale_in_train")
# distil vxv
value_scores = paddle.matmul(v, tensor.transpose(x=v, perm=[0, 1, 3, 2]))
value_scores = value_scores / math.sqrt(self.head_dim)
......@@ -279,6 +324,7 @@ def _Bi_MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=
self.value_scores = value_scores
return out if len(outs) == 1 else tuple(outs)
def _TransformerEncoderLayer_forward(self, src, src_mask=None, cache=None):
src_mask = _convert_attention_mask(src_mask, src.dtype)
......@@ -289,8 +335,7 @@ def _TransformerEncoderLayer_forward(self, src, src_mask=None, cache=None):
if cache is None:
src = self.self_attn(src, src, src, src_mask)
else:
src, incremental_cache = self.self_attn(src, src, src, src_mask,
cache)
src, incremental_cache = self.self_attn(src, src, src, src_mask, cache)
src = residual + self.dropout1(src)
if not self.normalize_before:
......@@ -306,6 +351,7 @@ def _TransformerEncoderLayer_forward(self, src, src_mask=None, cache=None):
self.rep = src
return src if cache is None else (src, incremental_cache)
def _get_attr(model, attr):
res = []
if hasattr(model, attr):
......@@ -314,6 +360,7 @@ def _get_attr(model, attr):
res.extend(_get_attr(layer, attr))
return res
def _to_distill_function(model):
from types import MethodType
for layer in model.children():
......@@ -321,6 +368,6 @@ def _to_distill_function(model):
layer.forward = MethodType(_Bi_MultiHeadAttention_forward, layer)
elif isinstance(layer, MultiHeadAttention):
layer.forward = MethodType(_MultiHeadAttention_forward, layer)
elif isinstance(layer, paddle.nn.layer.transformer.TransformerEncoderLayer):
elif isinstance(layer,
paddle.nn.layer.transformer.TransformerEncoderLayer):
layer.forward = MethodType(_TransformerEncoderLayer_forward, layer)
......@@ -8,7 +8,6 @@ import math
import time
import random
import numpy as np
import paddle.fluid as fluid
sys.path[0] = os.path.join(
os.path.dirname("__file__"), os.path.pardir, os.path.pardir)
from paddleslim.common import get_logger
......
......@@ -29,8 +29,6 @@
```text
.
├── cluster_train.py # 分布式训练函数
├── cluster_train.sh # 本地模拟多机脚本
├── train.py # 训练函数
├── infer.py # 预测脚本
├── net.py # 网络结构
......@@ -119,12 +117,6 @@ python train.py -h
OPENBLAS_NUM_THREADS=1 CPU_NUM=5 python train.py --train_data_dir data/convert_text8 --dict_path data/test_build_dict --num_passes 10 --batch_size 100 --model_output_dir v1_cpu5_b100_lr1dir --base_lr 1.0 --print_batch 1000 --with_speed --is_sparse
```
本地单机模拟多机训练
```bash
sh cluster_train.sh
```
本示例中按照单机多线程训练的命令进行训练,训练完毕后,可看到在当前文件夹下保存模型的路径为: ``v1_cpu5_b100_lr1dir``, 运行 ``ls v1_cpu5_b100_lr1dir``可看到该文件夹下保存了训练的10个epoch的模型文件。
```
pass-0 pass-1 pass-2 pass-3 pass-4 pass-5 pass-6 pass-7 pass-8 pass-9
......
from __future__ import print_function
import argparse
import logging
import os
import time
import math
import random
import numpy as np
import paddle
import six
import reader
from net import skip_gram_word2vec
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def parse_args():
parser = argparse.ArgumentParser(
description="PaddlePaddle Word2vec example")
parser.add_argument(
'--train_data_dir',
type=str,
default='./data/text',
help="The path of taining dataset")
parser.add_argument(
'--base_lr',
type=float,
default=0.01,
help="The number of learing rate (default: 0.01)")
parser.add_argument(
'--save_step',
type=int,
default=500000,
help="The number of step to save (default: 500000)")
parser.add_argument(
'--print_batch',
type=int,
default=100,
help="The number of print_batch (default: 10)")
parser.add_argument(
'--dict_path',
type=str,
default='./data/1-billion_dict',
help="The path of data dict")
parser.add_argument(
'--batch_size',
type=int,
default=500,
help="The size of mini-batch (default:500)")
parser.add_argument(
'--num_passes',
type=int,
default=10,
help="The number of passes to train (default: 10)")
parser.add_argument(
'--model_output_dir',
type=str,
default='models',
help='The path for model to store (default: models)')
parser.add_argument('--nce_num', type=int, default=5, help='nce_num')
parser.add_argument(
'--embedding_size',
type=int,
default=64,
help='sparse feature hashing space for index processing')
parser.add_argument(
'--is_sparse',
action='store_true',
required=False,
default=False,
help='embedding and nce will use sparse or not, (default: False)')
parser.add_argument(
'--with_speed',
action='store_true',
required=False,
default=False,
help='print speed or not , (default: False)')
parser.add_argument(
'--role', type=str, default='pserver', help='trainer or pserver')
parser.add_argument(
'--endpoints',
type=str,
default='127.0.0.1:6000',
help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
parser.add_argument(
'--current_endpoint',
type=str,
default='127.0.0.1:6000',
help='The current_endpoint')
parser.add_argument(
'--trainer_id',
type=int,
default=0,
help='trainer id ,only trainer_id=0 save model')
parser.add_argument(
'--trainers',
type=int,
default=1,
help='The num of trianers, (default: 1)')
return parser.parse_args()
def convert_python_to_tensor(weight, batch_size, sample_reader):
def __reader__():
cs = np.array(weight).cumsum()
result = [[], []]
for sample in sample_reader():
for i, fea in enumerate(sample):
result[i].append(fea)
if len(result[0]) == batch_size:
tensor_result = []
for tensor in result:
t = paddle.fluid.Tensor()
dat = np.array(tensor, dtype='int64')
if len(dat.shape) > 2:
dat = dat.reshape((dat.shape[0], dat.shape[2]))
elif len(dat.shape) == 1:
dat = dat.reshape((-1, 1))
t.set(dat, paddle.CPUPlace())
tensor_result.append(t)
tt = paddle.fluid.Tensor()
neg_array = cs.searchsorted(np.random.sample(args.nce_num))
neg_array = np.tile(neg_array, batch_size)
tt.set(
neg_array.reshape((batch_size, args.nce_num)),
paddle.CPUPlace())
tensor_result.append(tt)
yield tensor_result
result = [[], []]
return __reader__
def train_loop(args, train_program, reader, py_reader, loss, trainer_id, weight,
lr):
py_reader.decorate_tensor_provider(
convert_python_to_tensor(weight, args.batch_size, reader.train()))
place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
exe.run(paddle.static.default_startup_program())
print("CPU_NUM:" + str(os.getenv("CPU_NUM")))
train_exe = exe
for pass_id in range(args.num_passes):
py_reader.start()
time.sleep(10)
epoch_start = time.time()
batch_id = 0
start = time.time()
try:
while True:
loss_val = train_exe.run(fetch_list=[loss.name])
loss_val = np.mean(loss_val)
if batch_id % args.print_batch == 0:
logger.info(
"TRAIN --> pass: {} batch: {} loss: {} reader queue:{}".
format(pass_id, batch_id,
loss_val.mean(), py_reader.queue.size()))
if args.with_speed:
if batch_id % 500 == 0 and batch_id != 0:
elapsed = (time.time() - start)
start = time.time()
samples = 1001 * args.batch_size * int(
os.getenv("CPU_NUM"))
logger.info("Time used: {}, Samples/Sec: {}".format(
elapsed, samples / elapsed))
lr.step()
if batch_id % args.save_step == 0 and batch_id != 0:
model_dir = args.model_output_dir + '/pass-' + str(
pass_id) + ('/batch-' + str(batch_id))
if trainer_id == 0:
paddle.static.save(exe, model_dir, train_program)
print("model saved in %s" % model_dir)
batch_id += 1
except paddle.framework.core.EOFException:
py_reader.reset()
epoch_end = time.time()
logger.info("Epoch: {0}, Train total expend: {1} ".format(
pass_id, epoch_end - epoch_start))
model_dir = args.model_output_dir + '/pass-' + str(pass_id)
if trainer_id == 0:
paddle.static.save(exe, model_dir, train_program)
print("model saved in %s" % model_dir)
def GetFileList(data_path):
return os.listdir(data_path)
def train(args):
if not os.path.isdir(args.model_output_dir) and args.trainer_id == 0:
os.mkdir(args.model_output_dir)
filelist = GetFileList(args.train_data_dir)
word2vec_reader = reader.Word2VecReader(args.dict_path, args.train_data_dir,
filelist, 0, 1)
logger.info("dict_size: {}".format(word2vec_reader.dict_size))
np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
id_frequencys_pow = np_power / np_power.sum()
loss, py_reader = skip_gram_word2vec(
word2vec_reader.dict_size,
args.embedding_size,
is_sparse=args.is_sparse,
neg_num=args.nce_num)
learning_rate = paddle.optimizer.lr.ExponentialDecay(
args.base_lr, gama=0.999)
optimizer = paddle.optimizer.SGD(learning_rate=learning_rate)
optimizer.minimize(loss)
logger.info("run dist training")
t = paddle.fluid.DistributeTranspiler()
t.transpile(
args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
if args.role == "pserver":
print("run psever")
pserver_prog = t.get_pserver_program(args.current_endpoint)
pserver_startup = t.get_startup_program(args.current_endpoint,
pserver_prog)
exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(pserver_startup)
exe.run(pserver_prog)
elif args.role == "trainer":
print("run trainer")
train_loop(args,
t.get_trainer_program(), word2vec_reader, py_reader, loss,
args.trainer_id, id_frequencys_pow, learning_rate)
if __name__ == '__main__':
args = parse_args()
train(args)
#!/bin/bash
#export GLOG_v=30
#export GLOG_logtostderr=1
# start pserver0
export CPU_NUM=5
export FLAGS_rpc_deadline=3000000
python cluster_train.py \
--train_data_dir data/convert_text8 \
--dict_path data/test_build_dict \
--batch_size 100 \
--model_output_dir dis_model \
--base_lr 1.0 \
--print_batch 1 \
--is_sparse \
--with_speed \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6000 \
--trainers 2 \
> pserver0.log 2>&1 &
python cluster_train.py \
--train_data_dir data/convert_text8 \
--dict_path data/test_build_dict \
--batch_size 100 \
--model_output_dir dis_model \
--base_lr 1.0 \
--print_batch 1 \
--is_sparse \
--with_speed \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6001 \
--trainers 2 \
> pserver1.log 2>&1 &
# start trainer0
python cluster_train.py \
--train_data_dir data/convert_text8 \
--dict_path data/test_build_dict \
--batch_size 100 \
--model_output_dir dis_model \
--base_lr 1.0 \
--print_batch 1000 \
--is_sparse \
--with_speed \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 0 \
> trainer0.log 2>&1 &
# start trainer1
python cluster_train.py \
--train_data_dir data/convert_text8 \
--dict_path data/test_build_dict \
--batch_size 100 \
--model_output_dir dis_model \
--base_lr 1.0 \
--print_batch 1000 \
--is_sparse \
--with_speed \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 1 \
> trainer1.log 2>&1 &
......@@ -89,21 +89,21 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5):
paddle.multiply(input_emb, true_emb_w), keepdim=True),
true_emb_b)
input_emb_re = paddle.reshape(input_emb, shape=[-1, 1, embedding_size])
neg_matmul = fluid.layers.matmul(
input_emb_re, neg_emb_w_re, transpose_y=True)
neg_matmul = paddle.matmul(input_emb_re, neg_emb_w_re, transpose_y=True)
neg_matmul_re = paddle.reshape(neg_matmul, shape=[-1, neg_num])
neg_logits = paddle.add(neg_matmul_re, neg_emb_b_vec)
#nce loss
# TODO: replaced by paddle.tensor.creation.fill_constant_batch_size_like
label_ones = fluid.layers.fill_constant_batch_size_like(
true_logits, shape=[-1, 1], value=1.0, dtype='float32')
label_zeros = fluid.layers.fill_constant_batch_size_like(
true_logits, shape=[-1, neg_num], value=0.0, dtype='float32')
true_xent = fluid.layers.sigmoid_cross_entropy_with_logits(true_logits,
label_ones)
neg_xent = fluid.layers.sigmoid_cross_entropy_with_logits(neg_logits,
label_zeros)
true_xent = paddle.nn.functional.binary_cross_entropy(true_logits,
label_ones)
neg_xent = paddle.nn.functional.binary_cross_entropy(neg_logits,
label_zeros)
cost = paddle.add(paddle.sum(true_xent, axis=1),
paddle.sum(neg_xent, axis=1))
avg_cost = paddle.mean(cost)
......@@ -133,7 +133,7 @@ def infer_network(vocab_size, emb_size):
emb_c = paddle.static.nn.embedding(
input=analogy_c, size=[vocab_size, emb_size], param_attr="emb")
target = paddle.add(paddle.add(emb_b, -emb_a), emb_c)
emb_all_label_l2 = fluid.layers.l2_normalize(x=emb_all_label, axis=1)
dist = fluid.layers.matmul(x=target, y=emb_all_label_l2, transpose_y=True)
emb_all_label_l2 = paddle.linalg.norm(emb_all_label, p=2, axis=1)
dist = paddle.matmul(x=target, y=emb_all_label_l2, transpose_y=True)
values, pred_idx = paddle.topk(x=dist, k=4)
return values, pred_idx
......@@ -97,7 +97,7 @@ def convert_python_to_tensor(weight, batch_size, sample_reader):
if len(result[0]) == batch_size:
tensor_result = []
for tensor in result:
t = paddle.fluid.Tensor()
t = paddle.Tensor()
dat = np.array(tensor, dtype='int64')
if len(dat.shape) > 2:
dat = dat.reshape((dat.shape[0], dat.shape[2]))
......@@ -105,7 +105,7 @@ def convert_python_to_tensor(weight, batch_size, sample_reader):
dat = dat.reshape((-1, 1))
t.set(dat, paddle.CPUPlace())
tensor_result.append(t)
tt = paddle.fluid.Tensor()
tt = paddle.Tensor()
neg_array = cs.searchsorted(np.random.sample(args.nce_num))
neg_array = np.tile(neg_array, batch_size)
tt.set(
......
......@@ -66,8 +66,7 @@ def compress(args):
def if_exist(var):
return os.path.exists(os.path.join(args.pretrained_model, var.name))
paddle.fluid.io.load_vars(
exe, args.pretrained_model, predicate=if_exist)
paddle.static.load_vars(exe, args.pretrained_model, predicate=if_exist)
valid_loader = paddle.io.DataLoader(
val_dataset,
......
......@@ -20,7 +20,6 @@ else:
import imageio as imgreader
import os
import paddle
from paddle import fluid
class CASIA_Face(object):
......@@ -79,19 +78,17 @@ if __name__ == '__main__':
data_dir = 'PATH to CASIA dataset'
place = paddle.CPUPlace()
with fluid.dygraph.guard(place):
dataset = CASIA_Face(root=data_dir)
print(len(dataset))
print(dataset.class_nums)
trainloader = paddle.fluid.io.batch(
dataset.reader, batch_size=1, drop_last=False)
for i in range(10):
for data in trainloader():
img = np.array([x[0] for x in data]).astype('float32')
img = fluid.dygraph.to_variable(img)
print(img.shape)
label = np.array([x[1] for x in data]).astype('int64').reshape(
-1, 1)
label = fluid.dygraph.to_variable(label)
print(label.shape)
print(len(dataset))
dataset = CASIA_Face(root=data_dir)
print(len(dataset))
print(dataset.class_nums)
trainloader = paddle.batch(dataset.reader, batch_size=1, drop_last=False)
for i in range(10):
for data in trainloader():
img = np.array([x[0] for x in data]).astype('float32')
img = paddle.to_tensor(img)
print(img.shape)
label = np.array([x[1] for x in data]).astype('int64').reshape(-1,
1)
label = paddle.to_tensor(label)
print(label.shape)
print(len(dataset))
......@@ -18,8 +18,6 @@ if six.PY2:
import scipy.misc as imgreader
else:
import imageio as imgreader
import paddle
from paddle import fluid
class LFW(object):
......
......@@ -19,8 +19,6 @@ import scipy.io
import numpy as np
import paddle
from paddle import fluid
from dataloader.casia import CASIA_Face
from dataloader.lfw import LFW
from paddleslim import models
......@@ -116,10 +114,7 @@ def test(test_reader, flods, flags, net, args):
data_list[1].append(data[_][1])
data_list[2].append(data[_][2])
data_list[3].append(data[_][3])
res = [
net(fluid.dygraph.to_variable(np.array(d))).numpy()
for d in data_list
]
res = [net(paddle.to_tensor(np.array(d))).numpy() for d in data_list]
featureL = np.concatenate((res[0], res[1]), 1)
featureR = np.concatenate((res[2], res[3]), 1)
if featureLs is None:
......@@ -154,21 +149,18 @@ if __name__ == "__main__":
args = parser.parse_args()
place = paddle.CPUPlace() if args.use_gpu == 0 else paddle.CUDAPlace(0)
with fluid.dygraph.guard(place):
train_dataset = CASIA_Face(root=args.train_data_dir)
nl, nr, flods, flags = parse_filelist(args.test_data_dir)
test_dataset = LFW(nl, nr)
test_reader = paddle.fluid.io.batch(
test_dataset.reader,
batch_size=args.test_batchsize,
drop_last=False)
net = models.__dict__[args.model](class_dim=train_dataset.class_nums)
if args.resume:
assert os.path.exists(args.resume + ".pdparams"
), "Given dir {}.pdparams not exist.".format(
args.resume)
para_dict, opti_dict = fluid.dygraph.load_dygraph(args.resume)
net.set_dict(para_dict)
test(test_reader, flods, flags, net, args)
train_dataset = CASIA_Face(root=args.train_data_dir)
nl, nr, flods, flags = parse_filelist(args.test_data_dir)
test_dataset = LFW(nl, nr)
test_reader = paddle.batch(
test_dataset.reader, batch_size=args.test_batchsize, drop_last=False)
net = models.__dict__[args.model](class_dim=train_dataset.class_nums)
if args.resume:
assert os.path.exists(
args.resume +
".pdparams"), "Given dir {}.pdparams not exist.".format(args.resume)
para_dict, opti_dict = paddle.load(args.resume)
net.set_dict(para_dict)
test(test_reader, flods, flags, net, args)
......@@ -21,7 +21,6 @@ import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.compiler as compiler
from dataloader.casia import CASIA_Face
from dataloader.lfw import LFW
......@@ -46,19 +45,19 @@ def creat_optimizer(args, trainset_scale):
]
lr = [float(e) for e in args.lr_list.strip().split(',')]
assert len(bd) == len(lr) - 1
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
optimizer = paddle.optimizer.Momentum(
learning_rate=paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(args.l2_decay))
weight_decay=args.l2_decay)
elif args.lr_strategy == 'cosine_decay':
lr = args.lr
step_each_epoch = trainset_scale // args.train_batchsize
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.cosine_decay(lr, step_each_epoch,
args.total_epoch),
optimizer = paddle.optimizer.Momentum(
learning_rate=paddle.optimizer.lr.CosineAnnealingDecay(
lr, args.total_epoch / 2),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(args.l2_decay))
weight_decay=args.l2_decay)
else:
print('Wrong learning rate strategy')
exit()
......@@ -117,9 +116,9 @@ def test(test_exe, test_program, test_out, args):
def train(exe, train_program, train_out, test_program, test_out, args):
loss, acc, global_lr, train_reader = train_out
fetch_list_train = [loss.name, acc.name, global_lr.name]
build_strategy = fluid.BuildStrategy()
build_strategy = paddle.static.BuildStrategy()
build_strategy.fuse_all_optimizer_ops = True
compiled_prog = compiler.CompiledProgram(
compiled_prog = paddle.static.CompiledProgram(
train_program, build_strategy=build_strategy).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy)
best_ave = 0
......@@ -136,8 +135,7 @@ def train(exe, train_program, train_out, test_program, test_out, args):
float(np.mean(np.array(global_lr)))))
if batch_id % args.save_frequency == 0:
model_path = os.path.join(args.save_ckpt, str(epoch_id))
fluid.io.save_persistables(
executor=exe, dirname=model_path, main_program=train_program)
paddle.static.save(train_program, model_path)
temp_ave = test(exe, test_program, test_out, args)
if temp_ave > best_ave:
best_ave = temp_ave
......@@ -171,11 +169,11 @@ def build_program(program, startup, args, is_train=True):
name='image', shape=[-1, 3, 112, 96], dtype='float32')
label = paddle.static.data(
name='label', shape=[-1, 1], dtype='int64')
train_reader = fluid.io.batch(
train_reader = paddle.batch(
train_dataset.reader,
batch_size=args.train_batchsize // num_trainers,
drop_last=False)
reader = fluid.io.DataLoader.from_generator(
reader = paddle.io.DataLoader.from_generator(
feed_list=[image, label],
capacity=64,
iterable=True,
......@@ -192,7 +190,7 @@ def build_program(program, startup, args, is_train=True):
else:
nl, nr, flods, flags = parse_filelist(args.test_data_dir)
test_dataset = LFW(nl, nr)
test_reader = fluid.io.batch(
test_reader = paddle.batch(
test_dataset.reader,
batch_size=args.test_batchsize,
drop_last=False)
......@@ -206,7 +204,7 @@ def build_program(program, startup, args, is_train=True):
name='image_test3', shape=[-1, 3, 112, 96], dtype='float32')
image_test4 = paddle.static.data(
name='image_test4', shape=[-1, 3, 112, 96], dtype='float32')
reader = fluid.io.DataLoader.from_generator(
reader = paddle.io.DataLoader.from_generator(
feed_list=[
image_test1, image_test2, image_test3, image_test4
],
......@@ -228,7 +226,7 @@ def build_program(program, startup, args, is_train=True):
def quant_val_reader_batch():
nl, nr, flods, flags = parse_filelist(args.test_data_dir)
test_dataset = LFW(nl, nr)
test_reader = fluid.io.batch(
test_reader = paddle.batch(
test_dataset.reader, batch_size=1, drop_last=False)
shuffle_reader = fluid.io.shuffle(test_reader, 3)
......@@ -296,7 +294,7 @@ def main():
args = parser.parse_args()
if args.use_gpu:
num_trainers = paddle.fluid.core.get_cuda_device_count()
num_trainers = paddle.framework.core.get_cuda_device_count()
else:
num_trainers = int(os.environ.get('CPU_NUM', 1))
print(args)
......@@ -345,7 +343,7 @@ def main():
executor=exe)
nl, nr, flods, flags = parse_filelist(args.test_data_dir)
test_dataset = LFW(nl, nr)
test_reader = fluid.io.batch(
test_reader = paddle.batch(
test_dataset.reader,
batch_size=args.test_batchsize,
drop_last=False)
......@@ -359,7 +357,7 @@ def main():
name='image_test3', shape=[-1, 3, 112, 96], dtype='float32')
image_test4 = paddle.static.data(
name='image_test4', shape=[-1, 3, 112, 96], dtype='float32')
reader = fluid.io.DataLoader.from_generator(
reader = paddle.io.DataLoader.from_generator(
feed_list=[image_test1, image_test2, image_test3, image_test4],
capacity=64,
iterable=True,
......
......@@ -7,7 +7,6 @@ import functools
import math
import time
import numpy as np
import paddle.fluid as fluid
sys.path.append(os.path.join(os.path.dirname("__file__"), os.path.pardir))
from paddleslim.prune.unstructured_pruner import UnstructuredPruner
from paddleslim.common import get_logger
......@@ -90,7 +89,7 @@ def compress(args):
return os.path.exists(os.path.join(args.pruned_model, var.name))
_logger.info("Load pruned model from {}".format(args.pruned_model))
paddle.fluid.io.load_vars(exe, args.pruned_model, predicate=if_exist)
paddle.static.load_vars(exe, args.pruned_model, predicate=if_exist)
def test(epoch, program):
acc_top1_ns = []
......
......@@ -7,15 +7,14 @@ import functools
import time
import random
import numpy as np
import paddle.fluid as fluid
from paddleslim.prune.unstructured_pruner import UnstructuredPruner, GMPUnstructuredPruner
from paddleslim.common import get_logger
sys.path.append(os.path.join(os.path.dirname("__file__"), os.path.pardir))
import models
from utility import add_arguments, print_arguments
import paddle.vision.transforms as T
from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
from paddle.fluid.incubate.fleet.base import role_maker
from paddle.distributed import fleet
from paddle.distributed.fleet import DistributedStrategy
_logger = get_logger(__name__, level=logging.INFO)
......@@ -133,7 +132,7 @@ def compress(args):
if use_data_parallel:
# Fleet step 1: initialize the distributed environment
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
role = fleet.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
train_reader = None
......@@ -225,7 +224,7 @@ def compress(args):
if use_data_parallel:
dist_strategy = DistributedStrategy()
dist_strategy.sync_batch_norm = False
dist_strategy.exec_strategy = paddle.static.ExecutionStrategy()
dist_strategy.execution_strategy = paddle.static.ExecutionStrategy()
dist_strategy.fuse_all_reduce_ops = False
train_program = paddle.static.default_main_program()
......@@ -256,8 +255,7 @@ def compress(args):
if args.last_epoch > -1:
assert args.checkpoint is not None and os.path.exists(
args.checkpoint), "Please specify a valid checkpoint path."
paddle.fluid.io.load_persistables(
executor=exe, dirname=args.checkpoint, main_program=train_program)
paddle.static.load(train_program, args.checkpoint)
elif args.pretrained_model:
assert os.path.exists(
......@@ -270,10 +268,9 @@ def compress(args):
_logger.info("Load pretrained model from {}".format(
args.pretrained_model))
# NOTE: We are using fluid.io.load_vars() because the pretrained model is from an older version which requires this API.
# NOTE: We are using paddle.static.load_vars() because the pretrained model is from an older version which requires this API.
# Please consider using paddle.static.load(program, model_path) when possible
paddle.fluid.io.load_vars(
exe, args.pretrained_model, predicate=if_exist)
paddle.static.load_vars(exe, args.pretrained_model, predicate=if_exist)
def test(epoch, program):
acc_top1_ns = []
......@@ -336,12 +333,8 @@ def compress(args):
learning_rate.step()
reader_start = time.time()
if use_data_parallel:
# Fleet step 4: get the compiled program from fleet
compiled_train_program = fleet.main_program
else:
compiled_train_program = paddle.static.CompiledProgram(
paddle.static.default_main_program())
compiled_train_program = paddle.static.CompiledProgram(
paddle.static.default_main_program())
for i in range(args.last_epoch + 1, args.num_epochs):
train(i, compiled_train_program)
......@@ -358,8 +351,8 @@ def compress(args):
if use_data_parallel:
fleet.save_persistables(executor=exe, dirname=args.model_path)
else:
paddle.fluid.io.save_persistables(
executor=exe, dirname=args.model_path)
paddle.static.save(paddle.static.default_main_program(),
args.model_path)
def main():
......
......@@ -198,9 +198,9 @@ class TableLatencyPredictor(LatencyPredictor):
paddle.enable_static()
with open(pbmodel_file, "rb") as f:
fluid_program = paddle.static.Program.parse_from_string(f.read())
_program = paddle.static.Program.parse_from_string(f.read())
graph = GraphWrapper(fluid_program)
graph = GraphWrapper(_program)
if input_shape != None:
ori_shape = self._get_input_shape(graph)
......
......@@ -23,7 +23,7 @@ def model_size(program):
Get total value numbers of all parameters.
Args:
program(fluid.Program): The program used to calculate model size.
program(paddle.static.Program): The program used to calculate model size.
Returns:
int: The total count of all parameters.
......
......@@ -432,8 +432,8 @@ class ProgramInfo:
"""
ProgramInfo Config.
Args:
startup_program(paddle.static.Program): Startup program, the means of startup program can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/static/default_startup_program_cn.html#cn-api-fluid-default-startup-program>`_.
program(paddle.static.Program): main program, the means of main program can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/static/default_main_program_cn.html#cn-api-fluid-default-main-program>`_.
startup_program(paddle.static.Program): Startup program, the means of startup program can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/static/default_startup_program_cn.html#default-startup-program>`_.
program(paddle.static.Program): main program, the means of main program can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/static/default_main_program_cn.html#default-main-program>`_.
feed_target_names(list(str)): The name of feed tensor in the program.
fetch_targets(list(Variable)): The fetch variable in the program.
optimizer(Optimizer, optional): Optimizer in training. Default: None.
......
......@@ -57,15 +57,12 @@ def _recover_param_attr(program):
Params in infermodel are stored in the form of variable, which can not be trained."""
all_weights = [param for param in program.list_vars() \
if param.persistable is True and param.name != 'feed' and param.name != 'fetch']
for w in all_weights:
new_w = paddle.fluid.framework.Parameter(
block=program.block(0),
shape=w.shape,
dtype=w.dtype,
type=w.type,
name=w.name)
new_w.set_value(w.get_value())
program.block(0).vars[w.name] = new_w
with paddle.static.program_guard(program):
for w in all_weights:
new_w = paddle.create_parameter(
shape=w.shape, dtype=w.dtype, name=w.name)
new_w.set_value(w.get_value())
program.block(0).vars[w.name] = new_w
return program
......
......@@ -16,31 +16,35 @@ import math
import logging
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid import ParamAttr
from paddle.fluid.layers import RNNCell, LSTMCell, rnn
from paddle.fluid.contrib.layers import basic_lstm
from paddle.nn import LSTMCell
from ...controller import RLBaseController
from ...log_helper import get_logger
from ..utils import RLCONTROLLER
_logger = get_logger(__name__, level=logging.INFO)
uniform_initializer = lambda x: fluid.initializer.UniformInitializer(low=-x, high=x)
uniform_initializer = lambda x: paddle.nn.initializer.Uniform(low=-x, high=x)
class lstm_cell(RNNCell):
class lstm_cell(paddle.nn.RNNCellBase):
def __init__(self, num_layers, hidden_size):
self.num_layers = num_layers
self.hidden_size = hidden_size
self.lstm_cells = []
param_attr = ParamAttr(initializer=uniform_initializer(
param_attr = paddle.ParamAttr(initializer=uniform_initializer(
1.0 / math.sqrt(hidden_size)))
bias_attr = ParamAttr(initializer=uniform_initializer(
bias_attr = paddle.ParamAttr(initializer=uniform_initializer(
1.0 / math.sqrt(hidden_size)))
for i in range(num_layers):
self.lstm_cells.append(LSTMCell(hidden_size, param_attr, bias_attr))
self.lstm_cells.append(
LSTMCell(
hidden_size,
hidden_size,
weight_ih_attr=param_attr,
weight_hh_attr=param_attr,
bias_ih_attr=bias_attr,
bias_hh_attr=bias_attr))
def call(self, inputs, states):
new_states = []
......@@ -100,7 +104,7 @@ class LSTM(RLBaseController):
shape=(self.controller_batch_size, self.hidden_size),
dtype='float32',
default_initializer=uniform_initializer(1.0))
self.baseline = fluid.layers.create_global_var(
self.baseline = paddle.static.create_global_var(
shape=[1],
value=0.0,
dtype='float32',
......@@ -134,7 +138,10 @@ class LSTM(RLBaseController):
action = paddle.squeeze(action, axis=[1])
action.stop_gradient = True
else:
action = fluid.layers.sampling_id(probs)
multinomial = paddle.distribution.Multinomial(1, probs)
action = paddle.argmax(
multinomial.sample((1, )), axis=-1)
action = paddle.flatten(action)
actions.append(action)
log_prob = paddle.nn.functional.softmax_with_cross_entropy(
logits,
......@@ -171,22 +178,25 @@ class LSTM(RLBaseController):
dtype='float32',
default_initializer=uniform_initializer(1.0))
paddle.assign(
fluid.layers.uniform_random(shape=self.g_emb.shape), self.g_emb)
hidden = fluid.data(name='hidden', shape=[None, self.hidden_size])
cell = fluid.data(name='cell', shape=[None, self.hidden_size])
paddle.assign(paddle.uniform(shape=self.g_emb.shape), self.g_emb)
hidden = paddle.static.data(
name='hidden', shape=[None, self.hidden_size])
cell = paddle.static.data(
name='cell', shape=[None, self.hidden_size])
self.tokens = self._network(hidden, cell, is_inference=is_inference)
with paddle.static.program_guard(self.learn_program):
hidden = fluid.data(name='hidden', shape=[None, self.hidden_size])
cell = fluid.data(name='cell', shape=[None, self.hidden_size])
init_actions = fluid.data(
hidden = paddle.static.data(
name='hidden', shape=[None, self.hidden_size])
cell = paddle.static.data(
name='cell', shape=[None, self.hidden_size])
init_actions = paddle.static.data(
name='init_actions',
shape=[None, len(self.range_tables)],
dtype='int64')
self._network(hidden, cell, init_actions=init_actions)
rewards = fluid.data(name='rewards', shape=[None])
rewards = paddle.static.data(name='rewards', shape=[None])
self.rewards = paddle.mean(rewards)
if self.weight_entropy is not None:
......@@ -197,7 +207,7 @@ class LSTM(RLBaseController):
paddle.assign(self.baseline - (1.0 - self.decay) *
(self.baseline - self.rewards), self.baseline)
self.loss = self.sample_log_probs * (self.rewards - self.baseline)
clip = fluid.clip.GradientClipByNorm(clip_norm=5.0)
clip = paddle.nn.ClipGradByNorm(clip_norm=5.0)
if self.decay_steps is not None:
lr = paddle.optimizer.lr.ExponentialDecay(
learning_rate=self.controller_lr,
......@@ -287,4 +297,4 @@ class LSTM(RLBaseController):
_logger.info("Controller: current reward is {}, loss is {}".format(
rewards, loss))
params_dict = self.get_params(self.learn_program)
return params_dict
\ No newline at end of file
return params_dict
......@@ -94,7 +94,6 @@ def to_variables(inputs):
return ret
@paddle.fluid.framework.dygraph_only
def dygraph2program(layer, inputs, dtypes=None):
assert isinstance(layer, paddle.nn.Layer)
return _dy2prog(layer, inputs, dtypes)
......
......@@ -220,7 +220,7 @@ class OpWrapper(object):
class GraphWrapper(object):
"""
It is a wrapper of paddle.fluid.framework.IrGraph with some special functions
It is a wrapper of paddle.framework.IrGraph with some special functions
for paddle slim framework.
Args:
......
......@@ -189,7 +189,7 @@ class DARTSearch(object):
learning_rate = paddle.optimizer.lr.CosineAnnealingDecay(
self.learning_rate, self.num_epochs // 2)
clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
clip = paddle.nn.ClipGradByGlobalNorm(5.0)
optimizer = paddle.optimizer.Momentum(
learning_rate,
0.9,
......
......@@ -1024,7 +1024,7 @@ class SuperBatchNorm2D(paddle.nn.BatchNorm2D):
return batch_norm_out
paddle.fluid.data_feeder.check_variable_and_dtype(
paddle.common_ops_import.check_variable_and_dtype(
input, 'input', ['float16', 'float32', 'float64'], 'BatchNorm')
# for static need dict
......@@ -1111,7 +1111,7 @@ class SuperSyncBatchNorm(paddle.nn.SyncBatchNorm):
"use_mkldnn", False, "fuse_with_relu", False,
"use_global_stats", False, 'trainable_statistics', False)
if paddle.fluid.framework._non_static_mode():
if paddle.in_dynamic_mode():
if feature_dim != self._mean.shape[0]:
sync_batch_norm_out, _, _, _, _, _ = paddle._legacy_C_ops.sync_batch_norm(
input, weight, bias, self._mean, self._variance, mean_out,
......@@ -1128,10 +1128,7 @@ class SuperSyncBatchNorm(paddle.nn.SyncBatchNorm):
return sync_batch_norm_out
print(
f"hit static check_variable_and_dtype in ofa-----------------------------------"
)
paddle.fluid.data_feeder.check_variable_and_dtype(
paddle.common_ops_import.check_variable_and_dtype(
input, 'input', ['float16', 'float32', 'float64'], 'SyncBatchNorm')
attrs = {
......@@ -1308,7 +1305,7 @@ class SuperLayerNorm(paddle.nn.LayerNorm):
out, _, _ = paddle._C_ops.layer_norm(
input, weight, bias, self._epsilon, begin_norm_axis, False)
else:
paddle.fluid.data_feeder.check_variable_and_dtype(
paddle.common_ops_import.check_variable_and_dtype(
input, 'input', ['float32', 'float64'], 'LayerNorm')
inputs = dict()
......
......@@ -17,7 +17,7 @@ from .mobilenetv1 import MobileNetV1Space
from .resnet import ResNetSpace
from .mobilenet_block import MobileNetV1BlockSpace, MobileNetV2BlockSpace
from .resnet_block import ResNetBlockSpace
from .inception_block import InceptionABlockSpace, InceptionCBlockSpace
from .inception_block import InceptionABlockSpace
from .darts_space import DartsSpace
from .search_space_registry import SEARCHSPACE
from .search_space_factory import SearchSpaceFactory
......@@ -25,6 +25,6 @@ from .search_space_base import SearchSpaceBase
__all__ = [
'MobileNetV1Space', 'MobileNetV2Space', 'ResNetSpace', 'DartsSpace',
'MobileNetV1BlockSpace', 'MobileNetV2BlockSpace', 'ResNetBlockSpace',
'InceptionABlockSpace', 'InceptionCBlockSpace', 'SearchSpaceBase',
'SearchSpaceFactory', 'SEARCHSPACE'
'InceptionABlockSpace', 'SearchSpaceBase', 'SearchSpaceFactory',
'SEARCHSPACE'
]
......@@ -107,8 +107,7 @@ class DartsSpace(SearchSpaceBase):
return net_arch
def _classifier(self, x, num_classes, name):
out = paddle.fluid.layers.pool2d(
x, pool_type='avg', global_pooling=True)
out = paddle.nn.functional.adaptive_avg_pool2d(x, 1)
out = paddle.squeeze(x=out, axis=[2, 3])
k = (1. / out.shape[1])**0.5
out = paddle.static.nn.fc(out,
......@@ -125,8 +124,7 @@ class DartsSpace(SearchSpaceBase):
def _auxiliary_cifar(self, x, num_classes, name):
x = paddle.nn.functional.relu(x)
pooled = paddle.fluid.layers.pool2d(
x, pool_size=5, pool_stride=3, pool_padding=0, pool_type='avg')
pooled = paddle.nn.functional.avg_pool2d(x, 5, stride=3, padding=0)
conv1 = self._conv_bn(
x=pooled,
c_out=128,
......@@ -309,13 +307,8 @@ class DartsSpace(SearchSpaceBase):
drop_path_cell,
is_train,
name=None):
hidden0_0 = paddle.fluid.layers.pool2d(
input=s0,
pool_size=3,
pool_type="max",
pool_stride=2,
pool_padding=1,
name=name + '_reduction_cell_hidden0_0')
hidden0_0 = paddle.nn.functional.max_pool2d(
s0, 3, stride=2, padding=1, name=name + '_reduction_cell_hidden0_0')
hidden0_1 = self._factorized_reduce(
s1,
filter_num,
......@@ -328,14 +321,8 @@ class DartsSpace(SearchSpaceBase):
drop_path_cell[:, 0, 0],
name=name + '_reduction_cell_hidden0_0')
r0 = hidden0_0 + hidden0_1
hidden1_0 = paddle.fluid.layers.pool2d(
input=s1,
pool_size=3,
pool_type="max",
pool_stride=2,
pool_padding=1,
name=name + '_reduction_cell_hidden1_0')
hidden1_0 = paddle.nn.functional.max_pool2d(
s1, 3, stride=2, padding=1, name=name + '_reduction_cell_hidden1_0')
hidden1_1 = r0
if is_train:
hidden1_0 = self._drop_path(
......@@ -364,13 +351,8 @@ class DartsSpace(SearchSpaceBase):
r2 = hidden2_0 + hidden2_1
hidden3_0 = r0
hidden3_1 = paddle.fluid.layers.pool2d(
input=s1,
pool_size=3,
pool_type="max",
pool_stride=2,
pool_padding=1,
name=name + '_reduction_cell_hidden3_1')
hidden3_1 = paddle.nn.functional.max_pool2d(
s1, 3, stride=2, padding=1, name=name + '_reduction_cell_hidden3_1')
if is_train:
hidden3_1 = self._drop_path(
hidden3_1,
......
......@@ -193,13 +193,9 @@ class InceptionABlockSpace(SearchSpaceBase):
stride,
pool_type,
name=None):
print(f"hit _inceptionA----------------------------")
pool1 = paddle.fluid.layers.pool2d(
input=data,
pool_size=filter_size,
pool_padding='SAME',
pool_type=pool_type,
name=name + '_pool2d')
pool_op = paddle.nn.functional.avg_pool2d if pool_type == "avg" else paddle.nn.functional.max_pool2d
pool1 = pool_op(
data, filter_size, padding='SAME', stride=1, name=name + '_pool2d')
conv1 = conv_bn_layer(
input=pool1,
filter_size=1,
......@@ -256,258 +252,3 @@ class InceptionABlockSpace(SearchSpaceBase):
concat = paddle.concat(
[conv1, conv2, conv3, conv4], axis=1, name=name + '_concat')
return concat
@SEARCHSPACE.register
class InceptionCBlockSpace(SearchSpaceBase):
def __init__(self, input_size, output_size, block_num, block_mask):
super(InceptionCBlockSpace, self).__init__(input_size, output_size,
block_num, block_mask)
if self.block_mask == None:
# use input_size and output_size to compute self.downsample_num
self.downsample_num = compute_downsample_num(self.input_size,
self.output_size)
if self.block_num != None:
assert self.downsample_num <= self.block_num, 'downsample numeber must be LESS THAN OR EQUAL TO block_num, but NOW: downsample numeber is {}, block_num is {}'.format(
self.downsample_num, self.block_num)
### self.filter_num means filter nums
self.filter_num = np.array([
3, 4, 8, 12, 16, 24, 32, 48, 64, 80, 96, 128, 144, 160, 192, 224,
256, 320, 384, 448, 480, 512, 1024
])
### self.k_size means kernel_size
self.k_size = np.array([3, 5])
### self.pool_type means pool type, 0 means avg, 1 means max
self.pool_type = np.array([0, 1])
### self.repeat means repeat of 1x1 conv in branch of inception
### self.repeat = np.array([0,1])
def init_tokens(self):
"""
The initial token.
"""
return get_random_tokens(self.range_table())
def range_table(self):
"""
Get range table of current search space, constrains the range of tokens.
"""
range_table_base = []
if self.block_mask != None:
range_table_length = len(self.block_mask)
else:
range_table_length = self.block_num
for i in range(range_table_length):
range_table_base.append(len(self.filter_num))
range_table_base.append(len(self.filter_num))
range_table_base.append(len(self.filter_num))
range_table_base.append(len(self.filter_num))
range_table_base.append(len(self.filter_num))
range_table_base.append(len(self.filter_num))
range_table_base.append(len(self.filter_num))
range_table_base.append(len(self.k_size))
range_table_base.append(len(self.pool_type))
return range_table_base
def token2arch(self, tokens=None):
"""
return net_arch function
"""
#assert self.block_num
if tokens is None:
tokens = self.init_tokens()
self.bottleneck_params_list = []
if self.block_mask != None:
for i in range(len(self.block_mask)):
self.bottleneck_params_list.append(
(self.filter_num[tokens[i * 11]],
self.filter_num[tokens[i * 11 + 1]],
self.filter_num[tokens[i * 11 + 2]],
self.filter_num[tokens[i * 11 + 3]],
self.filter_num[tokens[i * 11 + 4]],
self.filter_num[tokens[i * 11 + 5]],
self.filter_num[tokens[i * 11 + 6]],
self.filter_num[tokens[i * 11 + 7]],
self.filter_num[tokens[i * 11 + 8]],
self.k_size[tokens[i * 11 + 9]], 2 if self.block_mask == 1
else 1, self.pool_type[tokens[i * 11 + 10]]))
else:
repeat_num = int(self.block_num / self.downsample_num)
num_minus = self.block_num % self.downsample_num
### if block_num > downsample_num, add stride=1 block at last (block_num-downsample_num) layers
for i in range(self.downsample_num):
self.bottleneck_params_list.append(
(self.filter_num[tokens[i * 11]],
self.filter_num[tokens[i * 11 + 1]],
self.filter_num[tokens[i * 11 + 2]],
self.filter_num[tokens[i * 11 + 3]],
self.filter_num[tokens[i * 11 + 4]],
self.filter_num[tokens[i * 11 + 5]],
self.filter_num[tokens[i * 11 + 6]],
self.filter_num[tokens[i * 11 + 7]],
self.filter_num[tokens[i * 11 + 8]],
self.k_size[tokens[i * 11 + 9]], 2,
self.pool_type[tokens[i * 11 + 10]]))
### if block_num / downsample_num > 1, add (block_num / downsample_num) times stride=1 block
for k in range(repeat_num - 1):
kk = k * self.downsample_num + i
self.bottleneck_params_list.append(
(self.filter_num[tokens[kk * 11]],
self.filter_num[tokens[kk * 11 + 1]],
self.filter_num[tokens[kk * 11 + 2]],
self.filter_num[tokens[kk * 11 + 3]],
self.filter_num[tokens[kk * 11 + 4]],
self.filter_num[tokens[kk * 11 + 5]],
self.filter_num[tokens[kk * 11 + 6]],
self.filter_num[tokens[kk * 11 + 7]],
self.filter_num[tokens[kk * 11 + 8]],
self.k_size[tokens[kk * 11 + 9]], 1,
self.pool_type[tokens[kk * 11 + 10]]))
if self.downsample_num - i <= num_minus:
j = self.downsample_num * (repeat_num - 1) + i
self.bottleneck_params_list.append(
(self.filter_num[tokens[j * 11]],
self.filter_num[tokens[j * 11 + 1]],
self.filter_num[tokens[j * 11 + 2]],
self.filter_num[tokens[j * 11 + 3]],
self.filter_num[tokens[j * 11 + 4]],
self.filter_num[tokens[j * 11 + 5]],
self.filter_num[tokens[j * 11 + 6]],
self.filter_num[tokens[j * 11 + 7]],
self.filter_num[tokens[j * 11 + 8]],
self.k_size[tokens[j * 11 + 9]], 1,
self.pool_type[tokens[j * 11 + 10]]))
if self.downsample_num == 0 and self.block_num != 0:
for i in range(len(self.block_num)):
self.bottleneck_params_list.append(
(self.filter_num[tokens[i * 11]],
self.filter_num[tokens[i * 11 + 1]],
self.filter_num[tokens[i * 11 + 2]],
self.filter_num[tokens[i * 11 + 3]],
self.filter_num[tokens[i * 11 + 4]],
self.filter_num[tokens[i * 11 + 5]],
self.filter_num[tokens[i * 11 + 6]],
self.filter_num[tokens[i * 11 + 7]],
self.filter_num[tokens[i * 11 + 8]],
self.k_size[tokens[i * 11 + 9]], 1,
self.pool_type[tokens[i * 11 + 10]]))
def net_arch(input, return_mid_layer=False, return_block=None):
layer_count = 0
mid_layer = dict()
for i, layer_setting in enumerate(self.bottleneck_params_list):
filter_nums = layer_setting[0:9]
filter_size = layer_setting[9]
stride = layer_setting[10]
pool_type = 'avg' if layer_setting[11] == 0 else 'max'
if stride == 2:
layer_count += 1
if check_points((layer_count - 1), return_block):
mid_layer[layer_count - 1] = input
input = self._inceptionC(
input,
C_tokens=filter_nums,
filter_size=int(filter_size),
stride=stride,
pool_type=pool_type,
name='inceptionC_{}'.format(i + 1))
if return_mid_layer:
return input, mid_layer
else:
return input,
return net_arch
def _inceptionC(self,
data,
C_tokens,
filter_size,
stride,
pool_type,
name=None):
pool1 = paddle.fluid.layers.pool2d(
input=data,
pool_size=filter_size,
pool_padding='SAME',
pool_type=pool_type,
name=name + '_pool2d')
conv1 = conv_bn_layer(
input=pool1,
filter_size=1,
num_filters=C_tokens[0],
stride=stride,
act='relu',
name=name + '_conv1')
conv2 = conv_bn_layer(
input=data,
filter_size=1,
num_filters=C_tokens[1],
stride=stride,
act='relu',
name=name + '_conv2')
conv3 = conv_bn_layer(
input=data,
filter_size=1,
num_filters=C_tokens[2],
stride=1,
act='relu',
name=name + '_conv3_1')
conv3_1 = conv_bn_layer(
input=conv3,
filter_size=filter_size,
num_filters=C_tokens[3],
stride=stride,
act='relu',
name=name + '_conv3_2_1')
conv3_2 = conv_bn_layer(
input=conv3,
filter_size=filter_size,
num_filters=C_tokens[4],
stride=stride,
act='relu',
name=name + '_conv3_2_2')
conv4 = conv_bn_layer(
input=data,
filter_size=1,
num_filters=C_tokens[5],
stride=1,
act='relu',
name=name + '_conv4_1')
conv4 = conv_bn_layer(
input=conv4,
filter_size=filter_size,
num_filters=C_tokens[6],
stride=1,
act='relu',
name=name + '_conv4_2')
conv4_1 = conv_bn_layer(
input=conv4,
filter_size=filter_size,
num_filters=C_tokens[7],
stride=stride,
act='relu',
name=name + '_conv4_3_1')
conv4_2 = conv_bn_layer(
input=conv4,
filter_size=filter_size,
num_filters=C_tokens[8],
stride=stride,
act='relu',
name=name + '_conv4_3_2')
concat = paddle.concat(
[conv1, conv2, conv3_1, conv3_2, conv4_1, conv4_2],
axis=1,
name=name + '_concat')
return concat
......@@ -196,11 +196,8 @@ class MobileNetV1Space(SearchSpaceBase):
if check_points(layer_count, end_points):
return input, decode_ends
input = paddle.fluid.layers.pool2d(
input=input,
pool_type='avg',
global_pooling=True,
name='mobilenetv1_last_pool')
input = paddle.nn.functional.adaptive_avg_pool2d(
input, 1, name='mobilenetv1_last_pool')
return input
......
......@@ -203,11 +203,8 @@ class MobileNetV2Space(SearchSpaceBase):
act='relu6',
name='mobilenetv2_conv' + str(i + 1))
input = paddle.fluid.layers.pool2d(
input=input,
pool_type='avg',
global_pooling=True,
name='mobilenetv2_last_pool')
input = paddle.nn.functional.adaptive_avg_pool2d(
input, 1, name='mobilenetv2_last_pool')
return input
......
......@@ -796,13 +796,12 @@ def pact(x, name=None):
u_param_attr = paddle.ParamAttr(
name=x.name + '_pact',
initializer=paddle.nn.initializer.Constant(value=init_thres),
regularizer=paddle.fluid.regularizer.L2Decay(0.0001),
regularizer=paddle.regularizer.L2Decay(0.0001),
learning_rate=1)
u_param = helper.create_parameter(attr=u_param_attr, shape=[1], dtype=dtype)
x = paddle.subtract(x,
paddle.nn.functional.relu(paddle.subtract(x, u_param)))
x = paddle.paddle.add(
x, paddle.nn.functional.relu(paddle.subtract(-u_param, x)))
x = paddle.add(x, paddle.nn.functional.relu(paddle.subtract(-u_param, x)))
return x
......
......@@ -182,16 +182,18 @@ class TestPruningMul(unittest.TestCase):
for param in net.parameters():
if param.name not in shapes:
shapes[param.name] = param.shape
print(
f"name {param.name}: {param.shape}, excepted: {shapes[param.name]}"
)
self.assertTrue(shapes[param.name] == param.shape)
pruner.restore()
paddle.enable_static()
def add_cases(suite):
suite.addTest(TestStatus())
suite.addTest(TestFilterPruner(param_names=["conv2d_0.w_0"]))
suite.addTest(TestPruningGroupConv2d())
# suite.addTest(TestStatus())
# suite.addTest(TestFilterPruner(param_names=["conv2d_0.w_0"]))
# suite.addTest(TestPruningGroupConv2d())
suite.addTest(TestPruningMul())
......
......@@ -19,10 +19,10 @@ import unittest
import logging
import paddle
from paddleslim.common import get_logger
from paddleslim import PTQ
_logger = paddle.fluid.log_helper.get_logger(
__name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
_logger = get_logger(__name__, level=logging.INFO)
class ImperativeLenet(paddle.nn.Layer):
......
......@@ -19,10 +19,10 @@ import unittest
import logging
import paddle
from paddleslim.common import get_logger
from paddleslim.dygraph.quant import QAT
_logger = paddle.fluid.log_helper.get_logger(
__name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
_logger = get_logger(__name__, level=logging.INFO)
class ImperativeLenet(paddle.nn.Layer):
......
......@@ -113,7 +113,7 @@ class TestSensitivity(unittest.TestCase):
exe = paddle.static.Executor(place)
exe.run(startup_program)
val_reader = paddle.fluid.io.batch(self.val_reader, batch_size=128)
val_reader = paddle.batch(self.val_reader, batch_size=128)
def eval_func(program):
feeder = paddle.fluid.DataFeeder(
......
......@@ -35,12 +35,14 @@ class AnalysisQATDemo(unittest.TestCase):
super(AnalysisQATDemo, self).__init__(*args, **kwargs)
if not os.path.exists('MobileNetV1_infer'):
os.system(
'wget -q https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/MobileNetV1_infer.tar'
'wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/MobileNetV1_infer.tar'
)
os.system('tar -xf MobileNetV1_infer.tar')
if not os.path.exists('ILSVRC2012_data_demo'):
if not os.path.exists(
os.path.join('.', 'ILSVRC2012_data_demo', 'ILSVRC2012',
'train')):
os.system(
'wget -q https://sys-p0.bj.bcebos.com/slim_ci/ILSVRC2012_data_demo.tar.gz'
'wget https://sys-p0.bj.bcebos.com/slim_ci/ILSVRC2012_data_demo.tar.gz'
)
os.system('tar -xf ILSVRC2012_data_demo.tar.gz')
......
......@@ -93,7 +93,7 @@ class ModelCase4(paddle.nn.Layer):
x = paddle.stack([x, y], axis=3)
x = paddle.slice(x, axes=[0], starts=[0], ends=[1])
x = paddle.exp(x)
y += paddle.fluid.layers.uniform_random(y.shape)
y += paddle.uniform(y.shape)
y = paddle.mean(x=y, axis=1, keepdim=True)
return paddle.greater_equal(x, y)
......@@ -286,8 +286,8 @@ class TestCase2(unittest.TestCase):
pred = LatencyPredictor()
paddle.enable_static()
with open(pbmodel_file, "rb") as f:
fluid_program = paddle.static.Program.parse_from_string(f.read())
graph = paddleslim.core.GraphWrapper(fluid_program)
_program = paddle.static.Program.parse_from_string(f.read())
graph = paddleslim.core.GraphWrapper(_program)
graph_keys = pred._get_key_info_from_graph(graph=graph)
assert len(graph_keys) > 0
......@@ -381,8 +381,8 @@ class TestCase6(unittest.TestCase):
paddle.enable_static()
with open(pbmodel_file, "rb") as f:
fluid_program = paddle.static.Program.parse_from_string(f.read())
graph = paddleslim.core.GraphWrapper(fluid_program)
_program = paddle.static.Program.parse_from_string(f.read())
graph = paddleslim.core.GraphWrapper(_program)
graph_keys = predictor._get_key_info_from_graph(graph=graph)
assert len(graph_keys) > 0
......@@ -404,8 +404,8 @@ class TestCase7(unittest.TestCase):
paddle.enable_static()
with open(pbmodel_file, "rb") as f:
fluid_program = paddle.static.Program.parse_from_string(f.read())
graph = paddleslim.core.GraphWrapper(fluid_program)
_program = paddle.static.Program.parse_from_string(f.read())
graph = paddleslim.core.GraphWrapper(_program)
graph_keys = predictor._get_key_info_from_graph(graph=graph)
assert len(graph_keys) > 0
......
......@@ -51,7 +51,7 @@ class TestPrune(StaticCase):
flag = paddle.full(shape=[1], fill_value=1, dtype='int32')
rand_flag = paddle.randint(2, dtype='int32')
cond = paddle.less_than(x=flag, y=rand_flag)
cond_output = paddle.fluid.layers.create_global_var(
cond_output = paddle.static.create_global_var(
shape=[1],
value=0.0,
dtype='float32',
......@@ -355,7 +355,6 @@ class TestPruneWorker(unittest.TestCase):
cls = PRUNE_WORKER.get(self.op.type())
if cls is None:
cls = PRUNE_WORKER.get("default_worker")
# pruning input of conv op
for _var, _axis, _ret in self.cases:
pruned_params = []
......@@ -370,6 +369,7 @@ class TestPruneWorker(unittest.TestCase):
if var.name() not in ret:
ret[var.name()] = []
ret[var.name()].append(axis)
print(f"excepted: {_ret}; actual: {ret}")
self.assertTrue(ret == _ret)
......@@ -444,12 +444,6 @@ class TestActivation(TestPruneWorker):
act_suite = unittest.TestSuite()
act_suite.addTest(
TestActivation(
op=paddle.fluid.layers.resize_bilinear, scale=2.))
act_suite.addTest(
TestActivation(
op=paddle.fluid.layers.resize_nearest, scale=2.))
act_suite.addTest(TestActivation(op=paddle.floor))
act_suite.addTest(TestActivation(op=paddle.scale))
......@@ -774,8 +768,6 @@ class TestAverageAccumulates(TestPruneWorker):
out = paddle.mean(conv1)
opt = paddle.optimizer.Adam()
opt.minimize(out)
model_average = paddle.fluid.optimizer.ModelAverage(
0.15, min_average_window=10000, max_average_window=12500)
def set_cases(self):
weight_var = self.graph.var('conv1.w_0')
......@@ -783,9 +775,6 @@ class TestAverageAccumulates(TestPruneWorker):
'conv1.w_0': [0],
'conv1.w_0_moment1_0': [0],
'conv1.w_0_moment2_0': [0],
'conv1.w_0_sum_1_0': [0],
'conv1.w_0_sum_2_0': [0],
'conv1.w_0_sum_3_0': [0]
}))
def test_prune(self):
......
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os
sys.path.append("../")
import unittest
import tempfile
......@@ -102,14 +103,12 @@ class ReconPTQ(unittest.TestCase):
format(iter, cost, top1, top5))
train(main_program)
paddle.fluid.io.save_inference_model(
dirname=self.tmpdir.name,
feeded_var_names=[image.name],
target_vars=[out],
main_program=val_program,
executor=exe,
model_filename='model.pdmodel',
params_filename='params.pdiparams')
paddle.static.save_inference_model(
os.path.join(self.tmpdir.name, "infer"),
feed_vars=[image],
fetch_vars=[out],
program=val_program,
executor=exe)
print(f"saved infer model to [{self.tmpdir.name}]")
self.data_loader = sample_generator_creator()
......@@ -130,8 +129,8 @@ class TestReconRegion(ReconPTQ):
self.tmpdir.name,
quantize_model_path='output_region',
sample_generator=self.data_loader,
model_filename='model.pdmodel',
params_filename='params.pdiparams',
model_filename='infer.pdmodel',
params_filename='infer.pdiparams',
batch_nums=1,
epochs=1,
algo='abs_max',
......@@ -154,8 +153,8 @@ class TestReconLayer(ReconPTQ):
self.tmpdir.name,
quantize_model_path='output_layer',
sample_generator=self.data_loader,
model_filename='model.pdmodel',
params_filename='params.pdiparams',
model_filename='infer.pdmodel',
params_filename='infer.pdiparams',
batch_nums=1,
epochs=1,
algo='KL',
......
......@@ -24,6 +24,9 @@ import numpy as np
class TestDartsSpace(StaticCase):
def __init__(self, methodNmae="test_search_space"):
super(TestDartsSpace, self).__init__(methodNmae)
def setUp(self):
paddle.enable_static()
self.init_test_case()
......@@ -89,6 +92,7 @@ search_space_suite.addTest(
search_space_suite.addTest(TestSearchSpace(search_sapce_name="ResNetSpace"))
search_space_suite.addTest(
TestSearchSpace(search_sapce_name="ResNetBlockSpace"))
search_space_suite.addTest(TestDartsSpace())
if __name__ == '__main__':
runner = unittest.TextTestRunner(verbosity=2)
......
......@@ -45,8 +45,7 @@ class TestSensitivity(StaticCase):
exe = paddle.static.Executor(place)
exe.run(startup_program)
val_reader = paddle.fluid.io.batch(
paddle.dataset.mnist.test(), batch_size=128)
val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
def eval_func(program):
feeder = paddle.fluid.DataFeeder(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册