Commit 8331e835 authored by Yang Yu

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_CudnnHolder_bug

@@ -213,9 +213,11 @@ include(configure)  # add paddle env configuration
 if(WITH_GPU)
     include(cuda)
     include(tensorrt)
+endif()
+if(WITH_MKL OR WITH_MKLML)
     include(external/anakin)
 elseif()
-    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
+    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE)
 endif()
 include(generic)  # simplify cmake module
...
@@ -11,6 +11,7 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s
 # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
 # example: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
 RUN pip install -U pip
 RUN pip install -U kubernetes paddlepaddle
@@ -27,5 +28,6 @@ ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl
 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
+ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh imagenet_reader.py /workspace/
 ADD models/ /workspace/models/
@@ -17,7 +17,8 @@ import argparse
 __all__ = ['parse_args', ]

 BENCHMARK_MODELS = [
-    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+    "machine_translation", "resnet", "se_resnext", "vgg", "mnist",
+    "stacked_dynamic_lstm", "resnet_with_preprocess"
 ]
@@ -67,12 +68,12 @@ def parse_args():
         '--cpus',
         type=int,
         default=1,
-        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
+        help='If cpus > 1, will set ParallelExecutor to use multiple threads.')
     parser.add_argument(
         '--data_set',
         type=str,
         default='flowers',
-        choices=['cifar10', 'flowers'],
+        choices=['cifar10', 'flowers', 'imagenet'],
         help='Optional dataset for benchmark.')
     parser.add_argument(
         '--infer_only', action='store_true', help='If set, run forward only.')
@@ -122,6 +123,11 @@ def parse_args():
         type=str,
         default="",
         help='Directory that contains all the training recordio files.')
+    parser.add_argument(
+        '--test_data_path',
+        type=str,
+        default="",
+        help='Directory that contains all the test data (NOT recordio).')
     parser.add_argument(
         '--use_inference_transpiler',
         action='store_true',
@@ -130,5 +136,9 @@ def parse_args():
         '--no_random',
         action='store_true',
         help='If set, keep the random seed and do not shuffle the data.')
+    parser.add_argument(
+        '--use_lars',
+        action='store_true',
+        help='If set, use LARS for the optimizer; only supported by the resnet model.')
     args = parser.parse_args()
     return args
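Taken together, these hunks extend the benchmark CLI with a new model, a new dataset choice, and two new flags. A minimal sketch of driving the parser with them (the flag names come from the hunks above; the invocation itself is illustrative, the path is a placeholder, and it assumes the script's existing --model flag accepts the new model names):

# Illustrative only: exercises the options added in this diff.
import sys
from args import parse_args

sys.argv = [
    "fluid_benchmark.py",
    "--model", "se_resnext",             # new BENCHMARK_MODELS entry
    "--data_set", "imagenet",            # new choice
    "--test_data_path", "/path/to/val",  # new flag; placeholder path
    "--use_lars",                        # new flag
]
args = parse_args()
print(args.data_set, args.test_data_path, args.use_lars)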
@@ -16,6 +16,7 @@ import argparse
 import cProfile
 import time
 import os
+import traceback

 import numpy as np
@@ -27,7 +28,7 @@ import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
 from args import *


-def append_nccl2_prepare(trainer_id):
+def append_nccl2_prepare(trainer_id, startup_prog):
     if trainer_id >= 0:
         # append gen_nccl_id at the end of startup program
         trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
@@ -40,11 +41,11 @@ def append_nccl2_prepare(trainer_id):
         current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
         worker_endpoints.remove(current_endpoint)

-        nccl_id_var = fluid.default_startup_program().global_block().create_var(
+        nccl_id_var = startup_prog.global_block().create_var(
             name="NCCLID",
             persistable=True,
             type=fluid.core.VarDesc.VarType.RAW)
-        fluid.default_startup_program().global_block().append_op(
+        startup_prog.global_block().append_op(
             type="gen_nccl_id",
             inputs={},
             outputs={"NCCLID": nccl_id_var},
@@ -59,7 +60,7 @@ def append_nccl2_prepare(trainer_id):
             "nccl-based dist train.")


-def dist_transpile(trainer_id, args):
+def dist_transpile(trainer_id, args, train_prog, startup_prog):
     if trainer_id < 0:
         return None, None
@@ -80,133 +81,69 @@ def dist_transpile(trainer_id, args):
     # the role, should be either PSERVER or TRAINER
     training_role = os.getenv("PADDLE_TRAINING_ROLE")

-    t = distribute_transpiler.DistributeTranspiler()
+    config = distribute_transpiler.DistributeTranspilerConfig()
+    config.slice_var_up = not args.no_split_var
+    t = distribute_transpiler.DistributeTranspiler(config=config)
     t.transpile(
         trainer_id,
+        # NOTE: *MUST* use train_prog, for we are using with guard to
+        # generate different program for train and test.
+        program=train_prog,
         pservers=pserver_endpoints,
         trainers=trainers,
         sync_mode=not args.async_mode)
     if training_role == "PSERVER":
         pserver_program = t.get_pserver_program(current_endpoint)
-        pserver_startup_program = t.get_startup_program(current_endpoint,
-                                                        pserver_program)
+        pserver_startup_program = t.get_startup_program(
+            current_endpoint, pserver_program, startup_program=startup_prog)
         return pserver_program, pserver_startup_program
     elif training_role == "TRAINER":
         train_program = t.get_trainer_program()
-        return train_program, fluid.default_startup_program()
+        return train_program, startup_prog
     else:
         raise ValueError(
             'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
         )
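The transpile path above is driven entirely by environment variables. A sketch of the minimum environment a trainer appears to need (only names that occur in this diff are used; all values are placeholders):

# Sketch only: variable names are taken from the surrounding diff;
# the pserver endpoint variables are read in lines elided from this
# hunk and are not reproduced here. All values are placeholders.
import os

os.environ["PADDLE_TRAINING_ROLE"] = "TRAINER"  # or "PSERVER"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_CURRENT_IP"] = "127.0.0.1"
os.environ["PADDLE_TRAINERS"] = "1"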
-def test(exe, inference_program, test_reader, feeder, batch_acc):
-    accuracy_evaluator = fluid.metrics.Accuracy()
-    for batch_id, data in enumerate(test_reader()):
-        acc = exe.run(inference_program,
-                      feed=feeder.feed(data),
-                      fetch_list=[batch_acc])
-        accuracy_evaluator.update(value=np.array(acc), weight=len(data))
-
-    return accuracy_evaluator.eval()
-
-
-# TODO(wuyi): replace train, train_parallel, test functions with new trainer
-# API once it is ready.
-def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
-          args, train_prog, startup_prog):
-    if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
-        place = core.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup_prog)
-        exe.run(train_prog)
-        return
-
-    if args.use_fake_data:
-        raise Exception(
-            "fake data is not supported in single GPU test for now.")
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-    exe = fluid.Executor(place)
-    exe.run(startup_prog)
-
-    # Use inference_transpiler to speedup
-    if not args.use_reader_op:
-        feed_var_list = [
-            var for var in train_prog.global_block().vars.itervalues()
-            if var.is_data
-        ]
-        feeder = fluid.DataFeeder(feed_var_list, place)
-
-    iters, num_samples, start_time = 0, 0, time.time()
-    for pass_id in range(args.pass_num):
-        train_losses = []
-        if not args.use_reader_op:
-            reader_generator = train_reader()
-        batch_id = 0
-        data = None
-        while True:
-            if not args.use_reader_op:
-                data = next(reader_generator, None)
-                if data == None:
-                    break
-            if iters == args.iterations:
-                reader_generator.close()
-                break
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-
-            if args.use_reader_op:
-                try:
-                    loss = exe.run(train_prog, fetch_list=[avg_loss])
-                except fluid.core.EnforceNotMet as ex:
-                    break
-            else:
-                loss = exe.run(train_prog,
-                               feed=feeder.feed(data),
-                               fetch_list=[avg_loss])
-            iters += 1
-            batch_id += 1
-            # FIXME(wuyi): For use_reader_op, if the current
-            # pass is not the last, the last batch of this pass
-            # is also equal to args.batch_size.
-            if args.use_reader_op:
-                num_samples += args.batch_size * args.gpus
-            else:
-                num_samples += len(data)
-            train_losses.append(loss)
-            print("Pass: %d, Iter: %d, Loss: %f\n" %
-                  (pass_id, iters, np.mean(train_losses)))
-        print_train_time(start_time, time.time(), num_samples)
-        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
-        # evaluation
-        if not args.no_test and batch_acc and not args.use_reader_op:
-            if args.use_inference_transpiler:
-                t = fluid.InferenceTranspiler()
-                t.transpile(infer_prog, place)
-            pass_test_acc = test(exe, infer_prog, test_reader, feeder,
-                                 batch_acc)
-            print(", Test Accuracy: %f" % pass_test_acc)
-        print("\n")
-        # TODO(wuyi): add warmup passes to get better perf data.
-        exit(0)
+def test_parallel(exe, test_args, args, test_prog, feeder):
+    acc_evaluators = []
+    for i in xrange(len(test_args[2])):
+        acc_evaluators.append(fluid.metrics.Accuracy())
+
+    to_fetch = [v.name for v in test_args[2]]
+    if args.use_reader_op:
+        test_args[4].start()
+        while True:
+            try:
+                acc_rets = exe.run(fetch_list=to_fetch)
+                for i, e in enumerate(acc_evaluators):
+                    e.update(
+                        value=np.array(acc_rets[i]), weight=args.batch_size)
+            except fluid.core.EOFException as eof:
+                test_args[4].reset()
+                break
+    else:
+        for batch_id, data in enumerate(test_args[3]()):
+            acc_rets = exe.run(feed=feeder.feed(data), fetch_list=to_fetch)
+            for i, e in enumerate(acc_evaluators):
+                e.update(value=np.array(acc_rets[i]), weight=len(data))
+
+    return [e.eval() for e in acc_evaluators]
-# TODO(wuyi): replace train, train_parallel, test functions with new trainer
-# API once it is ready.
-def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
-                   batch_acc, args, train_prog, startup_prog, nccl_id_var,
-                   num_trainers, trainer_id):
+# NOTE: only need to benchmark using parallelexe
+def train_parallel(train_args, test_args, args, train_prog, test_prog,
+                   startup_prog, nccl_id_var, num_trainers, trainer_id):
+    over_all_start = time.time()
     place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    feeder = None
     if not args.use_reader_op:
         feed_var_list = [
             var for var in train_prog.global_block().vars.itervalues()
             if var.is_data
         ]
         feeder = fluid.DataFeeder(feed_var_list, place)

     # generate fake:
     if args.use_fake_data:
         for var in feed_var_list:
@@ -230,63 +167,110 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
     startup_exe = fluid.Executor(place)
     startup_exe.run(startup_prog)
     strategy = fluid.ExecutionStrategy()
-    strategy.num_threads = 1
+    strategy.num_threads = args.cpus
     strategy.allow_op_delay = False
+    avg_loss = train_args[0]
+
+    if args.update_method == "pserver":
+        # parameter server mode distributed training, merge
+        # gradients on local server, do not initialize
+        # ParallelExecutor with multi server all-reduce mode.
+        num_trainers = 1
+        trainer_id = 0
+
     exe = fluid.ParallelExecutor(
         True,
         avg_loss.name,
+        main_program=train_prog,
         exec_strategy=strategy,
         num_trainers=num_trainers,
         trainer_id=trainer_id)

+    if not args.no_test:
+        if args.update_method == "pserver":
+            test_scope = None
+        else:
+            # NOTE: use an empty scope to avoid test exe using NCCLID
+            test_scope = fluid.Scope()
+        test_exe = fluid.ParallelExecutor(
+            True, main_program=test_prog, share_vars_from=exe)
+
     for pass_id in range(args.pass_num):
         num_samples = 0
         iters = 0
         start_time = time.time()
         if not args.use_reader_op:
-            reader_generator = train_reader()
+            reader_generator = train_args[3]()  # train_reader
         batch_id = 0
         data = None
+        if args.use_reader_op:
+            train_args[4].start()
         while True:
             if not args.use_reader_op:
                 data = next(reader_generator, None)
                 if data == None:
                     break
+            if args.profile and batch_id == 5:
+                profiler.start_profiler("All")
+                profiler.reset_profiler()
+            elif args.profile and batch_id == 10:
+                print("profiling total time: ", time.time() - start_time)
+                profiler.stop_profiler("total", "/tmp/profile_%d_pass%d" %
+                                       (trainer_id, pass_id))
             if iters == args.iterations:
                 reader_generator.close()
                 break
-            if args.profile and pass_id == 0 and batch_id == 5:
-                profiler.start_profiler("All")
-            elif args.profile and pass_id == 0 and batch_id == 10:
-                profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
             if iters == args.skip_batch_num:
                 start_time = time.time()
                 num_samples = 0
+            fetch_list = [avg_loss.name]
+            acc_name_list = [v.name for v in train_args[2]]
+            fetch_list.extend(acc_name_list)
             if args.use_fake_data or args.use_reader_op:
                 try:
-                    loss, = exe.run([avg_loss.name])
+                    fetch_ret = exe.run(fetch_list)
+                except fluid.core.EOFException as eof:
+                    break
                 except fluid.core.EnforceNotMet as ex:
+                    traceback.print_exc()
                     break
             else:
-                loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
+                fetch_ret = exe.run(fetch_list, feed=feeder.feed(data))
             if args.use_reader_op:
                 num_samples += args.batch_size * args.gpus
             else:
                 num_samples += len(data)
             iters += 1
             if batch_id % 1 == 0:
-                print("Pass %d, batch %d, loss %s" %
-                      (pass_id, batch_id, np.array(loss)))
+                fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
+                print("Pass %d, batch %d, loss %s, accuracies: %s" %
+                      (pass_id, batch_id, fetched_data[0], fetched_data[1:]))
             batch_id += 1

         print_train_time(start_time, time.time(), num_samples)
-        if not args.no_test and batch_acc and not args.use_reader_op:
-            # we have not implement record io for test
-            # skip test when use args.use_reader_op
-            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
-                            batch_acc)
-            print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
+        if args.use_reader_op:
+            train_args[4].reset()  # reset reader handle
+        else:
+            del reader_generator
+
+        if not args.no_test and test_args[2]:
+            test_feeder = None
+            if not args.use_reader_op:
+                test_feed_var_list = [
+                    var for var in test_prog.global_block().vars.itervalues()
+                    if var.is_data
+                ]
+                test_feeder = fluid.DataFeeder(test_feed_var_list, place)
+            test_ret = test_parallel(test_exe, test_args, args, test_prog,
                                     test_feeder)
+            print("Pass: %d, Test Accuracy: %s\n" %
+                  (pass_id, [np.mean(np.array(v)) for v in test_ret]))
+
+    print("total train time: ", time.time() - over_all_start)
 def print_arguments(args):
@@ -328,44 +312,46 @@ def main():
     if args.use_cprof:
         pr = cProfile.Profile()
         pr.enable()
     model_def = __import__("models.%s" % args.model, fromlist=["models"])
-    train_args = list(model_def.get_model(args))
-    train_args.append(args)
-    # Run optimizer.minimize(avg_loss)
-    train_args[2].minimize(train_args[0])
-    if args.memory_optimize:
-        fluid.memory_optimize(fluid.default_main_program())
+
+    train_prog = fluid.Program()
+    test_prog = fluid.Program()
+    startup_prog = fluid.Program()
+
+    train_args = list(model_def.get_model(args, True, train_prog, startup_prog))
+    test_args = list(model_def.get_model(args, False, test_prog, startup_prog))
+
+    all_args = [train_args, test_args, args]
     if args.update_method == "pserver":
-        train_prog, startup_prog = dist_transpile(trainer_id, args)
+        train_prog, startup_prog = dist_transpile(trainer_id, args, train_prog,
+                                                  startup_prog)
         if not train_prog:
             raise Exception(
                 "Must configure correct environments to run dist train.")
-        train_args.extend([train_prog, startup_prog])
+        all_args.extend([train_prog, test_prog, startup_prog])
         if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
-            train_args.extend([nccl_id_var, num_trainers, trainer_id])
-            train_parallel(*train_args)
-        train(*train_args)
+            all_args.extend([nccl_id_var, num_trainers, trainer_id])
+            train_parallel(*all_args)
+        elif os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
+            # start pserver with Executor
+            server_exe = fluid.Executor(fluid.CPUPlace())
+            server_exe.run(startup_prog)
+            server_exe.run(train_prog)
         exit(0)

     # for other update methods, use default programs
-    train_args.append(fluid.default_main_program())
-    train_args.append(fluid.default_startup_program())
+    all_args.extend([train_prog, test_prog, startup_prog])

     if args.update_method == "nccl2":
-        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
-    if args.gpus == 1:
-        # NOTE: parallel executor use profiler interanlly
-        if args.use_nvprof and args.device == 'GPU':
-            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
-                train(*train_args)
-        else:
-            train(*train_args)
-    else:
-        if args.device == "CPU":
-            raise Exception("Only support GPU perf with parallel exe")
-        train_args.extend([nccl_id_var, num_trainers, trainer_id])
-        train_parallel(*train_args)
+        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(
+            trainer_id, startup_prog)
+
+    if args.device == "CPU":
+        raise Exception("Only support GPU perf with parallel exe")
+    all_args.extend([nccl_id_var, num_trainers, trainer_id])
+    train_parallel(*all_args)


 if __name__ == "__main__":
...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import math
import random
import functools
import numpy as np
from threading import Thread
import subprocess
import time
from Queue import Queue
import paddle
from PIL import Image, ImageEnhance
random.seed(0)
DATA_DIM = 224
THREAD = int(os.getenv("PREPROCESS_THREADS", "10"))
BUF_SIZE = 5120
DATA_DIR = '/mnt/ImageNet'
TRAIN_LIST = '/mnt/ImageNet/train.txt'
TEST_LIST = '/mnt/ImageNet/val.txt'
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
def resize_short(img, target_size):
percent = float(target_size) / min(img.size[0], img.size[1])
resized_width = int(round(img.size[0] * percent))
resized_height = int(round(img.size[1] * percent))
img = img.resize((resized_width, resized_height), Image.LANCZOS)
return img
def crop_image(img, target_size, center):
width, height = img.size
size = target_size
if center == True:
w_start = (width - size) / 2
h_start = (height - size) / 2
else:
w_start = random.randint(0, width - size)
h_start = random.randint(0, height - size)
w_end = w_start + size
h_end = h_start + size
img = img.crop((w_start, h_start, w_end, h_end))
return img
def random_crop(img, size, scale=[0.08, 1.0], ratio=[3. / 4., 4. / 3.]):
aspect_ratio = math.sqrt(random.uniform(*ratio))
w = 1. * aspect_ratio
h = 1. / aspect_ratio
bound = min((float(img.size[0]) / img.size[1]) / (w**2),
(float(img.size[1]) / img.size[0]) / (h**2))
scale_max = min(scale[1], bound)
scale_min = min(scale[0], bound)
target_area = img.size[0] * img.size[1] * random.uniform(scale_min,
scale_max)
target_size = math.sqrt(target_area)
w = int(target_size * w)
h = int(target_size * h)
i = random.randint(0, img.size[0] - w)
j = random.randint(0, img.size[1] - h)
img = img.crop((i, j, i + w, j + h))
img = img.resize((size, size), Image.LANCZOS)
return img
def rotate_image(img):
angle = random.randint(-10, 10)
img = img.rotate(angle)
return img
def distort_color(img):
def random_brightness(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Brightness(img).enhance(e)
def random_contrast(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Contrast(img).enhance(e)
def random_color(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Color(img).enhance(e)
ops = [random_brightness, random_contrast, random_color]
random.shuffle(ops)
img = ops[0](img)
img = ops[1](img)
img = ops[2](img)
return img
def process_image(sample, mode, color_jitter, rotate):
img_path = sample[0]
img = Image.open(img_path)
if mode == 'train':
if rotate: img = rotate_image(img)
img = random_crop(img, DATA_DIM)
else:
img = resize_short(img, target_size=256)
img = crop_image(img, target_size=DATA_DIM, center=True)
if mode == 'train':
if color_jitter:
img = distort_color(img)
if random.randint(0, 1) == 1:
img = img.transpose(Image.FLIP_LEFT_RIGHT)
if img.mode != 'RGB':
img = img.convert('RGB')
img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
img -= img_mean
img /= img_std
if mode == 'train' or mode == 'val':
return img, sample[1]
elif mode == 'test':
return [img]
class XmapEndSignal():
pass
def xmap_readers(mapper,
reader,
process_num,
buffer_size,
order=False,
print_queue_state=True):
end = XmapEndSignal()
# define a worker to read samples from reader to in_queue
def read_worker(reader, in_queue):
for i in reader():
in_queue.put(i)
in_queue.put(end)
# define a worker to read samples from reader to in_queue with order flag
def order_read_worker(reader, in_queue, file_queue):
in_order = 0
for i in reader():
in_queue.put((in_order, i))
in_order += 1
in_queue.put(end)
# define a worker to handle samples from in_queue by mapper
# and put mapped samples into out_queue
def handle_worker(in_queue, out_queue, mapper):
sample = in_queue.get()
while not isinstance(sample, XmapEndSignal):
r = mapper(sample)
out_queue.put(r)
sample = in_queue.get()
in_queue.put(end)
out_queue.put(end)
# define a worker to handle samples from in_queue by mapper
# and put mapped samples into out_queue by order
def order_handle_worker(in_queue, out_queue, mapper, out_order):
ins = in_queue.get()
while not isinstance(ins, XmapEndSignal):
order, sample = ins
r = mapper(sample)
while order != out_order[0]:
pass
out_queue.put(r)
out_order[0] += 1
ins = in_queue.get()
in_queue.put(end)
out_queue.put(end)
def xreader():
file_queue = Queue()
in_queue = Queue(buffer_size)
out_queue = Queue(buffer_size)
out_order = [0]
# start a read worker in a thread
target = order_read_worker if order else read_worker
t = Thread(target=target, args=(reader, in_queue))
t.daemon = True
t.start()
# start several handle_workers
target = order_handle_worker if order else handle_worker
args = (in_queue, out_queue, mapper, out_order) if order else (
in_queue, out_queue, mapper)
workers = []
for i in xrange(process_num):
worker = Thread(target=target, args=args)
worker.daemon = True
workers.append(worker)
for w in workers:
w.start()
sample = out_queue.get()
start_t = time.time()
while not isinstance(sample, XmapEndSignal):
yield sample
sample = out_queue.get()
if time.time() - start_t > 3:
if print_queue_state:
print("queue sizes: ", in_queue.qsize(), out_queue.qsize())
start_t = time.time()
finish = 1
while finish < process_num:
sample = out_queue.get()
if isinstance(sample, XmapEndSignal):
finish += 1
else:
yield sample
return xreader
def _reader_creator(file_list,
mode,
shuffle=False,
color_jitter=False,
rotate=False,
xmap=True):
def reader():
with open(file_list) as flist:
full_lines = [line.strip() for line in flist]
if shuffle:
random.shuffle(full_lines)
if mode == 'train':
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
per_node_lines = len(full_lines) / trainer_count
lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1)
* per_node_lines]
print(
"read images from %d, length: %d, lines length: %d, total: %d"
% (trainer_id * per_node_lines, per_node_lines, len(lines),
len(full_lines)))
else:
lines = full_lines
for line in lines:
if mode == 'train':
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "train", img_path)
yield (img_path, int(label))
elif mode == 'val':
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "val", img_path)
yield (img_path, int(label))
elif mode == 'test':
img_path = os.path.join(DATA_DIR, line)
yield [img_path]
mapper = functools.partial(
process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
def load_raw_image_uint8(sample):
img_arr = np.array(Image.open(sample[0])).astype('int64')
return img_arr, int(sample[1])
def train_raw(file_list=TRAIN_LIST, shuffle=True):
def reader():
with open(file_list) as flist:
full_lines = [line.strip() for line in flist]
if shuffle:
random.shuffle(full_lines)
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
per_node_lines = len(full_lines) / trainer_count
lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1) *
per_node_lines]
print("read images from %d, length: %d, lines length: %d, total: %d"
% (trainer_id * per_node_lines, per_node_lines, len(lines),
len(full_lines)))
for line in lines:
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "train", img_path)
yield (img_path, int(label))
return paddle.reader.xmap_readers(load_raw_image_uint8, reader, THREAD,
BUF_SIZE)
def train(file_list=TRAIN_LIST, xmap=True):
return _reader_creator(
file_list,
'train',
shuffle=True,
color_jitter=False,
rotate=False,
xmap=xmap)
def val(file_list=TEST_LIST, xmap=True):
return _reader_creator(file_list, 'val', shuffle=False, xmap=xmap)
def test(file_list=TEST_LIST):
return _reader_creator(file_list, 'test', shuffle=False)
if __name__ == "__main__":
c = 0
start_t = time.time()
for d in train()():
c += 1
if c >= 10000:
break
spent = time.time() - start_t
print("read 10000 speed: ", 10000 / spent, spent)
@@ -163,6 +163,19 @@ def gen_job():
     volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
     volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})

+    # add ceph volumes
+    volumes.append({
+        "name": "ceph-data",
+        "cephfs": {
+            "monitors": ["192.168.16.23:6789"],
+            "secretRef": {
+                "name": "ceph-secret"
+            },
+            "user": "admin",
+        }
+    })
+    volumeMounts.append({"mountPath": "/mnt/data", "name": "ceph-data"})
+
     tn["spec"]["template"]["spec"]["volumes"] = volumes
     tn_container["volumeMounts"] = volumeMounts
...
@@ -13,5 +13,6 @@
 # limitations under the License.

 __all__ = [
-    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm",
+    "resnet_with_preprocess"
 ]
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """seq2seq model for fluid."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -181,7 +182,7 @@ def lodtensor_to_ndarray(lod_tensor):
     return ndarray


-def get_model(args):
+def get_model(args, is_train, main_prog, startup_prog):
     if args.use_reader_op:
         raise Exception("machine_translation do not support reader op for now.")
     embedding_dim = 512
@@ -190,30 +191,27 @@ def get_model(args):
     dict_size = 30000
     beam_size = 3
     max_length = 250
-    avg_cost, feeding_list = seq_to_seq_net(
-        embedding_dim,
-        encoder_size,
-        decoder_size,
-        dict_size,
-        dict_size,
-        False,
-        beam_size=beam_size,
-        max_length=max_length)
-
-    # clone from default main program
-    inference_program = fluid.default_main_program().clone()
-
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-
-    train_batch_generator = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=args.batch_size * args.gpus)
-
-    test_batch_generator = paddle.batch(
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            avg_cost, feeding_list = seq_to_seq_net(
+                embedding_dim,
+                encoder_size,
+                decoder_size,
+                dict_size,
+                dict_size,
+                False,
+                beam_size=beam_size,
+                max_length=max_length)
+            if is_train:
+                optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+                optimizer.minimize(avg_cost)
+
+    batch_generator = paddle.batch(
         paddle.reader.shuffle(
-            paddle.dataset.wmt14.test(dict_size), buf_size=1000),
-        batch_size=args.batch_size)
+            paddle.dataset.wmt14.train(dict_size)
+            if is_train else paddle.dataset.wmt14.test(dict_size),
+            buf_size=1000),
+        batch_size=args.batch_size * args.gpus)

-    return avg_cost, inference_program, optimizer, train_batch_generator, \
-        test_batch_generator, None
+    return avg_cost, optimizer, [], batch_generator, None
@@ -65,61 +65,50 @@ def cnn_model(data):
     return predict


-def get_model(args):
-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1, 1, 28, 28], (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        images, label = fluid.layers.read_file(data_file)
-    else:
-        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if args.device == 'CPU' and args.cpus > 1:
-        places = fluid.layers.get_places(args.cpus)
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            predict = cnn_model(pd.read_input(images))
-            label = pd.read_input(label)
-            cost = fluid.layers.cross_entropy(input=predict, label=label)
-            avg_cost = fluid.layers.mean(x=cost)
-            # Evaluator
-            batch_acc = fluid.layers.accuracy(input=predict, label=label)
-            pd.write_output(avg_cost)
-            pd.write_output(batch_acc)
-
-        avg_cost, batch_acc = pd()
-        avg_cost = fluid.layers.mean(avg_cost)
-        batch_acc = fluid.layers.mean(batch_acc)
-    else:
-        # Train program
-        predict = cnn_model(images)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-        # Evaluator
-        batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-
-    # Optimization
-    opt = fluid.optimizer.AdamOptimizer(
-        learning_rate=0.001, beta1=0.9, beta2=0.999)
+def get_model(args, is_train, main_prog, startup_prog):
+    # NOTE: mnist is small, we don't implement data sharding yet.
+    opt = None
+    data_file_handle = None
+    filelist = [
+        os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+    ]
+    with fluid.program_guard(main_prog, startup_prog):
+        if args.use_reader_op:
+            data_file_handle = fluid.layers.open_files(
+                filenames=filelist,
+                shapes=[[-1, 1, 28, 28], (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=["float32", "int64"],
+                thread_num=1,
+                pass_num=1)
+            data_file = fluid.layers.double_buffer(
+                fluid.layers.batch(
+                    data_file_handle, batch_size=args.batch_size))
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                images, label = fluid.layers.read_file(data_file)
+            else:
+                images = fluid.layers.data(
+                    name='pixel', shape=[1, 28, 28], dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+
+            predict = cnn_model(images)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+            if is_train:
+                opt = fluid.optimizer.AdamOptimizer(
+                    learning_rate=0.001, beta1=0.9, beta2=0.999)
+                opt.minimize(avg_cost)
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)

     # Reader
-    train_reader = paddle.batch(
-        paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
-    test_reader = paddle.batch(
-        paddle.dataset.mnist.test(), batch_size=args.batch_size)
-    return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
+    if is_train:
+        reader = paddle.dataset.mnist.train()
+    else:
+        reader = paddle.dataset.mnist.test()
+    batched_reader = paddle.batch(
+        reader, batch_size=args.batch_size * args.gpus)
+    return avg_cost, opt, [batch_acc], batched_reader, data_file_handle
@@ -27,10 +27,17 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.profiler as profiler
-from recordio_converter import imagenet_train, imagenet_test
+# from recordio_converter import imagenet_train, imagenet_test
+from imagenet_reader import train, val


-def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+def conv_bn_layer(input,
+                  ch_out,
+                  filter_size,
+                  stride,
+                  padding,
+                  act='relu',
+                  is_train=True):
     conv1 = fluid.layers.conv2d(
         input=input,
         filter_size=filter_size,
@@ -39,29 +46,31 @@ def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
         padding=padding,
         act=None,
         bias_attr=False)
-    return fluid.layers.batch_norm(input=conv1, act=act)
+    return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train)


-def shortcut(input, ch_out, stride):
+def shortcut(input, ch_out, stride, is_train=True):
     ch_in = input.shape[1]  # if args.data_format == 'NCHW' else input.shape[-1]
     if ch_in != ch_out:
-        return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+        return conv_bn_layer(
+            input, ch_out, 1, stride, 0, None, is_train=is_train)
     else:
         return input


-def basicblock(input, ch_out, stride):
-    short = shortcut(input, ch_out, stride)
-    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
+def basicblock(input, ch_out, stride, is_train=True):
+    short = shortcut(input, ch_out, stride, is_train=is_train)
+    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train)
     return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')


-def bottleneck(input, ch_out, stride):
-    short = shortcut(input, ch_out * 4, stride)
-    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
-    conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
+def bottleneck(input, ch_out, stride, is_train=True):
+    short = shortcut(input, ch_out * 4, stride, is_train=is_train)
+    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train)
+    conv3 = conv_bn_layer(
+        conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train)
     return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
@@ -72,7 +81,11 @@ def layer_warp(block_func, input, ch_out, count, stride):
     return res_out


-def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
+def resnet_imagenet(input,
+                    class_dim,
+                    depth=50,
+                    data_format='NCHW',
+                    is_train=True):
     cfg = {
         18: ([2, 2, 2, 1], basicblock),
@@ -115,8 +128,9 @@ def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
     return out
-def get_model(args):
+def _model_reader_dshape_classdim(args, is_train):
     model = resnet_cifar10
+    reader = None
     if args.data_set == "cifar10":
         class_dim = 10
         if args.data_format == 'NCHW':
@@ -124,8 +138,10 @@ def get_model(args):
         else:
             dshape = [32, 32, 3]
         model = resnet_cifar10
-        train_reader = paddle.dataset.cifar.train10()
-        test_reader = paddle.dataset.cifar.test10()
+        if is_train:
+            reader = paddle.dataset.cifar.train10()
+        else:
+            reader = paddle.dataset.cifar.test10()
     elif args.data_set == "flowers":
         class_dim = 102
         if args.data_format == 'NCHW':
@@ -133,8 +149,10 @@ def get_model(args):
         else:
             dshape = [224, 224, 3]
         model = resnet_imagenet
-        train_reader = paddle.dataset.flowers.train()
-        test_reader = paddle.dataset.flowers.test()
+        if is_train:
+            reader = paddle.dataset.flowers.train()
+        else:
+            reader = paddle.dataset.flowers.test()
     elif args.data_set == "imagenet":
         class_dim = 1000
         if args.data_format == 'NCHW':
@@ -145,64 +163,89 @@ def get_model(args):
         if not args.data_path:
             raise Exception(
                 "Must specify --data_path when training with imagenet")
-        train_reader = imagenet_train(args.data_path)
-        test_reader = imagenet_test(args.data_path)
-
-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1] + dshape, (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        input, label = fluid.layers.read_file(data_file)
-    else:
-        input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if args.device == 'CPU' and args.cpus > 1:
-        places = fluid.layers.get_places(args.cpus)
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            predict = model(pd.read_input(input), class_dim)
-            label = pd.read_input(label)
-            cost = fluid.layers.cross_entropy(input=predict, label=label)
-            avg_cost = fluid.layers.mean(x=cost)
-            batch_acc = fluid.layers.accuracy(input=predict, label=label)
-            pd.write_output(avg_cost)
-            pd.write_output(batch_acc)
-
-        avg_cost, batch_acc = pd()
-        avg_cost = fluid.layers.mean(avg_cost)
-        batch_acc = fluid.layers.mean(batch_acc)
-    else:
-        predict = model(input, class_dim)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-        batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc])
-
-    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
-
-    batched_train_reader = paddle.batch(
-        train_reader if args.no_random else paddle.reader.shuffle(
-            train_reader, buf_size=5120),
-        batch_size=args.batch_size * args.gpus,
-        drop_last=True)
-    batched_test_reader = paddle.batch(
-        test_reader, batch_size=args.batch_size, drop_last=True)
-
-    return avg_cost, inference_program, optimizer, batched_train_reader,\
-        batched_test_reader, batch_acc
+        if not args.use_reader_op:
+            if is_train:
+                reader = train()
+            else:
+                reader = val()
+        else:
+            if is_train:
+                reader = train(xmap=False)
+            else:
+                reader = val(xmap=False)
+    return model, reader, dshape, class_dim
+
+
+def get_model(args, is_train, main_prog, startup_prog):
+    model, reader, dshape, class_dim = _model_reader_dshape_classdim(args,
+                                                                     is_train)
+
+    pyreader = None
+    trainer_count = int(os.getenv("PADDLE_TRAINERS"))
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                pyreader = fluid.layers.py_reader(
+                    capacity=args.batch_size * args.gpus,
+                    shapes=([-1] + dshape, (-1, 1)),
+                    dtypes=('float32', 'int64'),
+                    name="train_reader" if is_train else "test_reader",
+                    use_double_buffer=True)
+                input, label = fluid.layers.read_file(pyreader)
+            else:
+                input = fluid.layers.data(
+                    name='data', shape=dshape, dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+
+            predict = model(input, class_dim, is_train=is_train)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+
+            batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
+            batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
+
+            # configure optimize
+            optimizer = None
+            if is_train:
+                if args.use_lars:
+                    lars_decay = 1.0
+                else:
+                    lars_decay = 0.0
+
+                total_images = 1281167 / trainer_count
+                step = int(total_images / args.batch_size + 1)
+                epochs = [30, 60, 80, 90]
+                bd = [step * e for e in epochs]
+                base_lr = args.learning_rate
+                lr = []
+                lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+                optimizer = fluid.optimizer.Momentum(
+                    learning_rate=base_lr,
+                    #learning_rate=fluid.layers.piecewise_decay(
+                    #    boundaries=bd, values=lr),
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(1e-4))
+                optimizer.minimize(avg_cost)
+
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)
+
+    # config readers
+    if not args.use_reader_op:
+        batched_reader = paddle.batch(
+            reader if args.no_random else paddle.reader.shuffle(
+                reader, buf_size=5120),
+            batch_size=args.batch_size * args.gpus,
+            drop_last=True)
+    else:
+        batched_reader = None
+        pyreader.decorate_paddle_reader(
+            paddle.batch(
+                reader if args.no_random else paddle.reader.shuffle(
                    reader, buf_size=5120),
+                batch_size=args.batch_size))
+
+    return avg_cost, optimizer, [batch_acc1,
+                                 batch_acc5], batched_reader, pyreader
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import numpy as np
import time
import os
import cProfile, pstats, StringIO
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
# from recordio_converter import imagenet_train, imagenet_test
from imagenet_reader import train_raw, val
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
act='relu',
is_train=True):
conv1 = fluid.layers.conv2d(
input=input,
filter_size=filter_size,
num_filters=ch_out,
stride=stride,
padding=padding,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train)
def shortcut(input, ch_out, stride, is_train=True):
ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1]
if ch_in != ch_out:
return conv_bn_layer(
input, ch_out, 1, stride, 0, None, is_train=is_train)
else:
return input
def basicblock(input, ch_out, stride, is_train=True):
short = shortcut(input, ch_out, stride, is_train=is_train)
conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train)
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def bottleneck(input, ch_out, stride, is_train=True):
short = shortcut(input, ch_out * 4, stride, is_train=is_train)
conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train)
conv3 = conv_bn_layer(
conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train)
return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
def layer_warp(block_func, input, ch_out, count, stride):
res_out = block_func(input, ch_out, stride)
for i in range(1, count):
res_out = block_func(res_out, ch_out, 1)
return res_out
def resnet_imagenet(input,
class_dim,
depth=50,
data_format='NCHW',
is_train=True):
cfg = {
18: ([2, 2, 2, 1], basicblock),
34: ([3, 4, 6, 3], basicblock),
50: ([3, 4, 6, 3], bottleneck),
101: ([3, 4, 23, 3], bottleneck),
152: ([3, 8, 36, 3], bottleneck)
}
stages, block_func = cfg[depth]
conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
pool1 = fluid.layers.pool2d(
input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
res2 = layer_warp(block_func, res1, 128, stages[1], 2)
res3 = layer_warp(block_func, res2, 256, stages[2], 2)
res4 = layer_warp(block_func, res3, 512, stages[3], 2)
pool2 = fluid.layers.pool2d(
input=res4,
pool_size=7,
pool_type='avg',
pool_stride=1,
global_pooling=True)
out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
return out
def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
assert (depth - 2) % 6 == 0
n = (depth - 2) // 6
conv1 = conv_bn_layer(
input=input, ch_out=16, filter_size=3, stride=1, padding=1)
res1 = layer_warp(basicblock, conv1, 16, n, 1)
res2 = layer_warp(basicblock, res1, 32, n, 2)
res3 = layer_warp(basicblock, res2, 64, n, 2)
pool = fluid.layers.pool2d(
input=res3, pool_size=8, pool_type='avg', pool_stride=1)
out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
return out
def _model_reader_dshape_classdim(args, is_train):
model = resnet_cifar10
reader = None
if args.data_set == "cifar10":
class_dim = 10
if args.data_format == 'NCHW':
dshape = [3, 32, 32]
else:
dshape = [32, 32, 3]
model = resnet_cifar10
if is_train:
reader = paddle.dataset.cifar.train10()
else:
reader = paddle.dataset.cifar.test10()
elif args.data_set == "flowers":
class_dim = 102
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
if is_train:
reader = paddle.dataset.flowers.train()
else:
reader = paddle.dataset.flowers.test()
elif args.data_set == "imagenet":
class_dim = 1000
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
if not args.data_path:
raise Exception(
"Must specify --data_path when training with imagenet")
if not args.use_reader_op:
if is_train:
reader = train_raw()
else:
reader = val()
else:
if is_train:
reader = train_raw()
else:
reader = val(xmap=False)
return model, reader, dshape, class_dim
def get_model(args, is_train, main_prog, startup_prog):
model, reader, dshape, class_dim = _model_reader_dshape_classdim(args,
is_train)
pyreader = None
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
if args.use_reader_op:
pyreader = fluid.layers.py_reader(
capacity=args.batch_size * args.gpus,
shapes=([-1] + dshape, (-1, 1)),
dtypes=('uint8', 'int64'),
name="train_reader" if is_train else "test_reader",
use_double_buffer=True)
input, label = fluid.layers.read_file(pyreader)
else:
input = fluid.layers.data(
name='data', shape=dshape, dtype='uint8')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
# add imagenet preprocessors
random_crop = fluid.layers.random_crop(input, dshape)
casted = fluid.layers.cast(random_crop, 'float32')
# input is HWC
trans = fluid.layers.transpose(casted, [0, 3, 1, 2]) / 255.0
img_mean = fluid.layers.tensor.assign(
np.array([0.485, 0.456, 0.406]).astype('float32').reshape((3, 1,
1)))
img_std = fluid.layers.tensor.assign(
np.array([0.229, 0.224, 0.225]).astype('float32').reshape((3, 1,
1)))
h1 = fluid.layers.elementwise_sub(trans, img_mean, axis=1)
h2 = fluid.layers.elementwise_div(h1, img_std, axis=1)
# pre_out = (trans - img_mean) / img_std
predict = model(h2, class_dim, is_train=is_train)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
# configure optimize
optimizer = None
if is_train:
if args.use_lars:
lars_decay = 1.0
else:
lars_decay = 0.0
total_images = 1281167 / trainer_count
step = int(total_images / args.batch_size + 1)
epochs = [30, 60, 80, 90]
bd = [step * e for e in epochs]
base_lr = args.learning_rate
lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
learning_rate=base_lr,
#learning_rate=fluid.layers.piecewise_decay(
# boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# config readers
if not args.use_reader_op:
batched_reader = paddle.batch(
reader if args.no_random else paddle.reader.shuffle(
reader, buf_size=5120),
batch_size=args.batch_size * args.gpus,
drop_last=True)
else:
batched_reader = None
pyreader.decorate_paddle_reader(
paddle.batch(
# reader if args.no_random else paddle.reader.shuffle(
# reader, buf_size=5120),
reader,
batch_size=args.batch_size))
return avg_cost, optimizer, [batch_acc1,
batch_acc5], batched_reader, pyreader
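The in-graph preprocessing above (random crop, cast, HWC-to-CHW transpose, then mean/std normalization via elementwise_sub/div with axis=1) computes the same arithmetic that imagenet_reader.process_image performs on the CPU. A small NumPy sanity check of that math (shapes and the random batch are illustrative, not part of the benchmark):

# NumPy rendering of the in-graph preprocessing above (illustrative only):
# uint8 HWC crops -> float CHW in [0, 1] -> per-channel normalize.
import numpy as np

img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))

batch_hwc = np.random.randint(0, 256, (8, 224, 224, 3)).astype('uint8')
trans = batch_hwc.astype('float32').transpose((0, 3, 1, 2)) / 255.0
normalized = (trans - img_mean) / img_std  # broadcasts over N, H, W

# same result the graph computes with elementwise_sub/div(axis=1)
assert normalized.shape == (8, 3, 224, 224)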
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid as fluid
import math
import os
from imagenet_reader import train, val
__all__ = [
"SE_ResNeXt", "SE_ResNeXt50_32x4d", "SE_ResNeXt101_32x4d",
"SE_ResNeXt152_32x4d", "get_model"
]
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class SE_ResNeXt():
def __init__(self, layers=50, is_train=True):
self.params = train_parameters
self.layers = layers
self.is_train = is_train
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 6, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 101:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 23, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 152:
cardinality = 64
reduction_ratio = 16
depth = [3, 8, 36, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=3,
stride=2,
act='relu')
conv = self.conv_bn_layer(
input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
conv = self.conv_bn_layer(
input=conv,
num_filters=128,
filter_size=3,
stride=1,
act='relu')
conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=cardinality,
reduction_ratio=reduction_ratio)
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
drop = fluid.layers.dropout(x=pool, dropout_prob=0.5)
stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
out = fluid.layers.fc(input=drop,
size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv,
stdv)))
return out
def shortcut(self, input, ch_out, stride):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
filter_size = 1
return self.conv_bn_layer(input, ch_out, filter_size, stride)
else:
return input
def bottleneck_block(self, input, num_filters, stride, cardinality,
reduction_ratio):
conv0 = self.conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu')
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
groups=cardinality,
act='relu')
conv2 = self.conv_bn_layer(
input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
scale = self.squeeze_excitation(
input=conv2,
num_channels=num_filters * 2,
reduction_ratio=reduction_ratio)
short = self.shortcut(input, num_filters * 2, stride)
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) / 2,
groups=groups,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(
input=conv, act=act, is_test=not self.is_train)
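    # Squeeze-and-Excitation: global average pool ("squeeze"), a bottleneck
    # FC of width num_channels/reduction_ratio with ReLU, then a sigmoid-gated
    # FC ("excitation") whose output rescales each channel of the input.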
def squeeze_excitation(self, input, num_channels, reduction_ratio):
pool = fluid.layers.pool2d(
input=input, pool_size=0, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
squeeze = fluid.layers.fc(input=pool,
                                  size=num_channels // reduction_ratio,
act='relu',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(
-stdv, stdv)))
stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
excitation = fluid.layers.fc(input=squeeze,
size=num_channels,
act='sigmoid',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(
-stdv, stdv)))
scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
return scale
def SE_ResNeXt50_32x4d():
model = SE_ResNeXt(layers=50)
return model
def SE_ResNeXt101_32x4d():
model = SE_ResNeXt(layers=101)
return model
def SE_ResNeXt152_32x4d():
model = SE_ResNeXt(layers=152)
return model
def get_model(args, is_train, main_prog, startup_prog):
model = SE_ResNeXt(layers=50)
batched_reader = None
pyreader = None
    trainer_count = int(os.getenv("PADDLE_TRAINERS", "1"))  # default to one trainer when unset
dshape = train_parameters["input_size"]
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
if args.use_reader_op:
pyreader = fluid.layers.py_reader(
capacity=10,
shapes=([-1] + dshape, (-1, 1)),
dtypes=('float32', 'int64'),
name="train_reader" if is_train else "test_reader",
use_double_buffer=True)
input, label = fluid.layers.read_file(pyreader)
else:
input = fluid.layers.data(
name='data', shape=dshape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
out = model.net(input=input)
cost = fluid.layers.cross_entropy(input=out, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
optimizer = None
if is_train:
if args.use_lars:
lars_decay = 1.0
else:
lars_decay = 0.0
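                # Piecewise decay: milestones are given in epochs, so convert
                # them to step boundaries first; the LR then drops 10x at each
                # boundary (len(bd) + 1 values in total).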
                total_images = 1281167 // trainer_count
                step = int(total_images / args.batch_size + 1)
                epochs = [40, 80, 100]
                bd = [step * e for e in epochs]
                base_lr = args.learning_rate
                lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
                optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4),
LARS_weight_decay=lars_decay)
optimizer.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# config readers
if is_train:
reader = train()
else:
reader = val()
if not args.use_reader_op:
batched_reader = paddle.batch(
reader, batch_size=args.batch_size * args.gpus, drop_last=True)
else:
pyreader.decorate_paddle_reader(
paddle.batch(
reader, batch_size=args.batch_size))
return avg_cost, optimizer, [acc_top1, acc_top5], batched_reader, pyreader
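# --- Hypothetical usage sketch (not part of the benchmark harness) --------
# A driver would build two programs, hand them to get_model, and run the
# startup program once before training; `args.device` is an assumed field:
#
#   main_prog, startup_prog = fluid.Program(), fluid.Program()
#   loss, opt, metrics, batched_reader, pyreader = get_model(
#       args, is_train=True, main_prog=main_prog, startup_prog=startup_prog)
#   place = fluid.CUDAPlace(0) if args.device == 'GPU' else fluid.CPUPlace()
#   exe = fluid.Executor(place)
#   exe.run(startup_prog)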
...@@ -26,7 +26,6 @@ import numpy ...@@ -26,7 +26,6 @@ import numpy
import paddle import paddle
import paddle.dataset.imdb as imdb import paddle.dataset.imdb as imdb
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.batch as batch
import paddle.fluid.profiler as profiler import paddle.fluid.profiler as profiler
word_dict = imdb.word_dict() word_dict = imdb.word_dict()
...@@ -43,19 +42,7 @@ def crop_sentence(reader, crop_size): ...@@ -43,19 +42,7 @@ def crop_sentence(reader, crop_size):
return __impl__ return __impl__
def get_model(args): def lstm_net(sentence, lstm_size):
if args.use_reader_op:
raise Exception(
"stacked_dynamic_lstm do not support reader op for now.")
lstm_size = 512
emb_dim = 512
crop_size = 1500
data = fluid.layers.data(
name="words", shape=[1], lod_level=1, dtype='int64')
sentence = fluid.layers.embedding(
input=data, size=[len(word_dict), emb_dim])
sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh') sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
rnn = fluid.layers.DynamicRNN() rnn = fluid.layers.DynamicRNN()
...@@ -97,31 +84,47 @@ def get_model(args): ...@@ -97,31 +84,47 @@ def get_model(args):
last = fluid.layers.sequence_pool(rnn(), 'last') last = fluid.layers.sequence_pool(rnn(), 'last')
logit = fluid.layers.fc(input=last, size=2, act='softmax') logit = fluid.layers.fc(input=last, size=2, act='softmax')
loss = fluid.layers.cross_entropy( return logit
input=logit,
label=fluid.layers.data(
name='label', shape=[1], dtype='int64'))
loss = fluid.layers.mean(x=loss)
# add acc
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
shape=[1], dtype='int64'), total=batch_size_tensor)
inference_program = fluid.default_main_program().clone() def get_model(args, is_train, main_prog, startup_prog):
with fluid.program_guard(inference_program): if args.use_reader_op:
inference_program = fluid.io.get_inference_program( raise Exception(
            target_vars=[batch_acc, batch_size_tensor])             "stacked_dynamic_lstm does not support reader op for now.")
lstm_size = 512
adam = fluid.optimizer.Adam() emb_dim = 512
crop_size = 1500
train_reader = batch( with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
data = fluid.layers.data(
name="words", shape=[1], lod_level=1, dtype='int64')
sentence = fluid.layers.embedding(
input=data, size=[len(word_dict), emb_dim])
logit = lstm_net(sentence, lstm_size)
loss = fluid.layers.cross_entropy(
input=logit,
label=fluid.layers.data(
name='label', shape=[1], dtype='int64'))
loss = fluid.layers.mean(x=loss)
# add acc
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
shape=[1], dtype='int64'), total=batch_size_tensor)
if is_train:
adam = fluid.optimizer.Adam()
adam.minimize(loss)
if is_train:
reader = crop_sentence(imdb.train(word_dict), crop_size)
else:
reader = crop_sentence(imdb.test(word_dict), crop_size)
batched_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000), reader, buf_size=25000),
batch_size=args.batch_size * args.gpus) batch_size=args.batch_size * args.gpus)
test_reader = batch(
paddle.reader.shuffle(
crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
batch_size=args.batch_size)
return loss, inference_program, adam, train_reader, test_reader, batch_acc return loss, adam, [batch_acc], batched_reader, None
...@@ -25,7 +25,7 @@ import functools ...@@ -25,7 +25,7 @@ import functools
import os import os
def vgg16_bn_drop(input): def vgg16_bn_drop(input, is_train=True):
def conv_block(input, num_filter, groups, dropouts): def conv_block(input, num_filter, groups, dropouts):
return fluid.nets.img_conv_group( return fluid.nets.img_conv_group(
input=input, input=input,
...@@ -46,13 +46,13 @@ def vgg16_bn_drop(input): ...@@ -46,13 +46,13 @@ def vgg16_bn_drop(input):
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=512, act=None) fc1 = fluid.layers.fc(input=drop, size=512, act=None)
bn = fluid.layers.batch_norm(input=fc1, act='relu') bn = fluid.layers.batch_norm(input=fc1, act='relu', is_test=not is_train)
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=512, act=None) fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
return fc2 return fc2
def get_model(args): def get_model(args, is_train, main_prog, startup_prog):
if args.data_set == "cifar10": if args.data_set == "cifar10":
classdim = 10 classdim = 10
if args.data_format == 'NCHW': if args.data_format == 'NCHW':
...@@ -65,57 +65,56 @@ def get_model(args): ...@@ -65,57 +65,56 @@ def get_model(args):
data_shape = [3, 224, 224] data_shape = [3, 224, 224]
else: else:
data_shape = [224, 224, 3] data_shape = [224, 224, 3]
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
with fluid.program_guard(main_prog, startup_prog):
        data_file_handle = None  # stays None when the reader op is not used
        if args.use_reader_op:
data_file_handle = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1] + data_shape, (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=1,
pass_num=1)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file_handle, batch_size=args.batch_size))
with fluid.unique_name.guard():
if args.use_reader_op:
images, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(
name='data', shape=data_shape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
# Train program
net = vgg16_bn_drop(images, is_train=is_train)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
if args.use_reader_op: # Evaluator
filelist = [ batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
os.path.join(args.data_path, f) for f in os.listdir(args.data_path) batch_acc = fluid.layers.accuracy(
] input=predict, label=label, total=batch_size_tensor)
data_file = fluid.layers.open_files( # Optimization
filenames=filelist, if is_train:
shapes=[[-1] + data_shape, (-1, 1)], optimizer = fluid.optimizer.Adam(
lod_levels=[0, 0], learning_rate=args.learning_rate)
dtypes=["float32", "int64"], optimizer.minimize(avg_cost)
thread_num=args.gpus,
pass_num=args.pass_num)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file, batch_size=args.batch_size))
images, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(
name='data', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
net = vgg16_bn_drop(images)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
# inference program
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program(
target_vars=[batch_acc, batch_size_tensor])
# Optimization
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
# data reader # data reader
train_reader = paddle.batch( if is_train:
reader = paddle.dataset.cifar.train10() \
if args.data_set == 'cifar10' else paddle.dataset.flowers.train()
else:
reader = paddle.dataset.cifar.test10() \
if args.data_set == 'cifar10' else paddle.dataset.flowers.test()
batched_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
paddle.dataset.cifar.train10() reader, buf_size=5120),
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size * args.gpus) batch_size=args.batch_size * args.gpus)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=args.batch_size)
return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc return avg_cost, optimizer, [batch_acc], batched_reader, data_file_handle
...@@ -16,16 +16,6 @@ set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR}) ...@@ -16,16 +16,6 @@ set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR})
set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so) set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so)
set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so) set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so)
# TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
include_directories(${ANAKIN_INCLUDE}) include_directories(${ANAKIN_INCLUDE})
include_directories(${ANAKIN_INCLUDE}/saber/) include_directories(${ANAKIN_INCLUDE}/saber/)
include_directories(${ANAKIN_INCLUDE}/saber/core/) include_directories(${ANAKIN_INCLUDE}/saber/core/)
...@@ -48,21 +38,24 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS ...@@ -48,21 +38,24 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-reorder -Wno-reorder
-Wno-error=cpp) -Wno-error=cpp)
if(WITH_GPU)
set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=YES -DCUDNN_ROOT=${CUDNN_ROOT} -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR})
else()
set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=NO)
endif()
ExternalProject_Add( ExternalProject_Add(
extern_anakin extern_anakin
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLML_PROJECT} DEPENDS ${MKLML_PROJECT}
GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin" GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin"
GIT_TAG "9424277cf9ae180a14aff09560d3cd60a49c76d2" GIT_TAG "3c8554f4978628183566ab7dd6c1e7e66493c7cd"
PREFIX ${ANAKIN_SOURCE_DIR} PREFIX ${ANAKIN_SOURCE_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DUSE_GPU_PLACE=YES CMAKE_ARGS ${CMAKE_ARGS_PREFIX}
-DUSE_X86_PLACE=YES -DUSE_X86_PLACE=YES
-DBUILD_WITH_UNIT_TEST=NO -DBUILD_WITH_UNIT_TEST=NO
-DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
-DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
-DCUDNN_ROOT=${CUDNN_ROOT}
-DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR}
-DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER} -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
${EXTERNAL_OPTIONAL_ARGS} ${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
......
...@@ -145,12 +145,12 @@ copy(memory_lib ...@@ -145,12 +145,12 @@ copy(memory_lib
set(inference_deps paddle_fluid_shared paddle_fluid) set(inference_deps paddle_fluid_shared paddle_fluid)
set(module "inference/api") set(module "inference/api")
if (WITH_ANAKIN AND WITH_GPU) if (WITH_ANAKIN AND WITH_MKL)
copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS SRCS
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
${ANAKIN_INSTALL_DIR} # anakin release ${ANAKIN_INSTALL_DIR} # anakin release
DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin) DSTS ${dst_dir}/inference/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
list(APPEND inference_deps anakin_inference_lib) list(APPEND inference_deps anakin_inference_lib)
endif() endif()
......
...@@ -822,6 +822,14 @@ pad ...@@ -822,6 +822,14 @@ pad
.. autofunction:: paddle.fluid.layers.pad .. autofunction:: paddle.fluid.layers.pad
:noindex: :noindex:
.. _api_fluid_layers_pad_constant_like:
pad_constant_like
-----------------
.. autofunction:: paddle.fluid.layers.pad_constant_like
:noindex:
.. _api_fluid_layers_label_smooth: .. _api_fluid_layers_label_smooth:
label_smooth label_smooth
...@@ -1145,6 +1153,14 @@ sigmoid ...@@ -1145,6 +1153,14 @@ sigmoid
.. autofunction:: paddle.fluid.layers.sigmoid .. autofunction:: paddle.fluid.layers.sigmoid
:noindex: :noindex:
.. _api_fluid_layers_hsigmoid:
hsigmoid
--------
.. autofunction:: paddle.fluid.layers.hsigmoid
:noindex:
.. _api_fluid_layers_logsigmoid: .. _api_fluid_layers_logsigmoid:
logsigmoid logsigmoid
......
...@@ -104,6 +104,7 @@ visualDL --logdir=scratch_log --port=8080 ...@@ -104,6 +104,7 @@ visualDL --logdir=scratch_log --port=8080
# Visit http://127.0.0.1:8080 # Visit http://127.0.0.1:8080
``` ```
If you see `TypeError: __init__() got an unexpected keyword argument 'file'`, your protobuf is older than 3.5; running `pip install --upgrade protobuf` fixes it.
If you still run into installation problems inside a virtual environment, try the following methods. If you still run into installation problems inside a virtual environment, try the following methods.
......
...@@ -4,13 +4,12 @@ Paddle Inference API ...@@ -4,13 +4,12 @@ Paddle Inference API
To make inference deployment simpler and easier, Fluid provides a set of high-level APIs To make inference deployment simpler and easier, Fluid provides a set of high-level APIs
that hide the different underlying optimized implementations. that hide the different underlying optimized implementations.
`The inference library code <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference>`__ `The inference library code <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`_
includes includes
- the header ``paddle_inference_api.h``, which defines all the interfaces - the header ``paddle_inference_api.h``, which defines all the interfaces
- the library ``libpaddle_fluid.so`` or ``libpaddle_fluid.a`` - the library ``libpaddle_fluid.so`` or ``libpaddle_fluid.a``
- the library ``libpaddle_inference_api.so`` or
  ``libpaddle_inference_api.a``
For compilation and dependencies, see :ref:`install_or_build_cpp_inference_lib`. For compilation and dependencies, see :ref:`install_or_build_cpp_inference_lib`.
...@@ -97,8 +96,7 @@ engine ...@@ -97,8 +96,7 @@ engine
CHECK(predictor->Run(slots, &outputs)); CHECK(predictor->Run(slots, &outputs));
  // fetch outputs ...   // fetch outputs ...
When compiling, link both ``libpaddle_fluid.a/.so`` and When compiling, linking ``libpaddle_fluid.a/.so`` is enough.
``libpaddle_inference_api.a/.so``.
Detailed code reference Detailed code reference
----------------------- -----------------------
......
...@@ -2,42 +2,47 @@ ...@@ -2,42 +2,47 @@
## Automatic Differentiation ## Automatic Differentiation
A key challenge in the field of deep learning is to automatically derive the backward pass from the forward pass described algorithmically by researchers. Such a derivation, or a transformation of the forward pass program, has been long studied before the recent prosperity of deep learning in the field known as [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf). A key challenge in deep learning is to automatically derive the backward pass given the forward pass as a program, which has been long studied in the field of [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf), or autodiff, before the prosperity of deep learning.
## The Tape ## Program Transformation vs. Backtracking
Given the forward pass program (usually in Python in practices), there are two strategies to derive the backward pass: Given the forward pass program, there are two strategies to derive the backward pass:
1. from the forward pass program itself, or 1. by transforming the forward pass program without executing it, or
1. from the execution trace of the forward pass program, which is often known as the *tape*. 1. by backtracking the execution process of the forward pass program.
This article surveys systems that follow the latter strategy. This article is about the latter strategy.
## Dynamic Network ## The Tape and Dynamic Networks
When we train a deep learning model, the tape changes every iteration as the input data change, so we have to re-derive the backward pass every iteration. This is known as *dynamic network*. We refer to the trace of the execution of the forward pass program as a *tape* [[1]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf). When we train a deep learning model, the tape changes every iteration as the input data change, so we'd have to re-derive the backward pass each time. This is time-consuming, but it naturally handles forward programs that contain control flow like if-else and for/while, whose execution traces may change across iterations. Such changes are known as *dynamic networks* in the field of deep learning.
Deep learning systems that utilize the idea of dynamic network gained their popularities in recent years. This article surveys two representative systems: [PyTorch](https://pytorch.org/) and [DyNet](https://dynet.readthedocs.io/en/latest/). ## Typical Systems
## An Overview Deep learning systems that utilize the idea of dynamic networks have gained popularity in recent years. This article surveys the following typical systems:
Both frameworks record a ‘tape’ of the computation and interpreting (or run-time compiling) a transformation of the tape played back in reverse. This tape is a different kind of entity than the original program.[[link]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf) - [DyNet](https://dynet.readthedocs.io/en/latest/)
- [PyTorch](https://pytorch.org/)
- Chainer
- Autograd from HIPS
Consider the following code feedforward model. Before diving into these systems, let us pose an example forward pass program:
```python ```python
x = Variable(randn(20, 1)) x = Variable(randn(20, 1))
label = Variable(randint(1)) label = Variable(randint(1))
W_1, W_2 = Variable(randn(20, 20)), Variable(randn(10, 20)) W_1, W_2 = Variable(randn(20, 20)), Variable(randn(10, 20))
h = matmul(W_1, x) h = matmul(W_1, x)
pred = matmul(W_2, x) pred = matmul(W_2, h)
loss = softmax(pred, label) loss = softmax(pred, label)
loss.backward() loss.backward()
``` ```
### 1) Dynet uses List to encode the Tape ## The Representation of Tapes
During the forward execution, a list of operators, in this case `matmul`, `matmul` and `softmax`, are recorded in the tape, along with the necessary information needed to do the backward such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`. ### DyNet: the Tape as a List
DyNet uses a linear data structure, a list, to represent the tape. During the execution of the above example, it is a list of operators: `matmul`, `matmul`, and `softmax`. The list also includes information needed to do the backward pass, such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`.
<details> <details>
<summary></summary> <summary></summary>
...@@ -69,9 +74,9 @@ digraph g { ...@@ -69,9 +74,9 @@ digraph g {
![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20}) ![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20})
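To make the list idea concrete, here is a minimal, self-contained sketch (toy helper names and NumPy for the math; this is not DyNet's actual API): every forward op appends a backward closure to a global list, and `backward()` replays the list in reverse.

```python
import numpy as np

# A toy list-based tape: each forward op appends a backward closure;
# backward() replays the list in reverse, accumulating gradients in a
# dict keyed by array identity.
tape, grads = [], {}

def _acc(arr, g):
    grads[id(arr)] = grads.get(id(arr), np.zeros_like(arr)) + g

def matmul(W, x):
    out = W @ x
    def bwd():
        g_out = grads[id(out)]
        _acc(W, g_out @ x.T)   # dL/dW = g_out * x^T
        _acc(x, W.T @ g_out)   # dL/dx = W^T * g_out
    tape.append((out, bwd))
    return out

def backward(loss):
    grads[id(loss)] = np.ones_like(loss)   # seed dL/dL = 1
    for out, bwd in reversed(tape):        # play the tape backwards
        if id(out) in grads:
            bwd()

W_1, W_2 = np.random.randn(20, 20), np.random.randn(10, 20)
x = np.random.randn(20, 1)
h = matmul(W_1, x)        # tape: [matmul]
pred = matmul(W_2, h)     # tape: [matmul, matmul]
backward(pred)            # grads[id(W_1)], grads[id(x)], ... are now filled
```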
### 2) Pytorch uses Node Graph to encode the Tape ### PyTorch: the Tape as a Graph
The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a Function records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad op is performed by the sorted order. The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a `Function` records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad ops run in the sorted order. Please be aware that a `Function` might have more than one `prev_func`.
<details> <details>
<summary></summary> <summary></summary>
...@@ -132,27 +137,22 @@ digraph g { ...@@ -132,27 +137,22 @@ digraph g {
![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20}) ![Alt 
text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20})
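A similarly minimal sketch of the node-graph idea (toy classes, not PyTorch's actual API): `Variable`s remember their `creator`, `Function`s remember their `prev_funcs`, and `backward()` topologically sorts the functions before replaying them.

```python
# A toy node graph: Variables remember their creator Function; Functions
# remember their prev_funcs; backward() topologically sorts the Functions
# and runs the grad ops in reverse order.
class Function:
    def __init__(self, name, prev_funcs):
        self.name, self.prev_funcs = name, prev_funcs

class Variable:
    def __init__(self, creator=None):
        self.creator = creator

    def backward(self):
        order, seen = [], set()

        def visit(fn):                      # DFS over prev_func edges
            if fn is None or fn in seen:
                return
            seen.add(fn)
            for prev in fn.prev_funcs:
                visit(prev)
            order.append(fn)                # post-order = topological order

        visit(self.creator)
        for fn in reversed(order):          # grad ops run last-to-first
            print("running grad op of", fn.name)

def matmul(a, b):
    prev = [v.creator for v in (a, b) if v.creator is not None]
    return Variable(creator=Function("matmul", prev))

x, W_1, W_2 = Variable(), Variable(), Variable()
h = matmul(W_1, x)       # h.creator is the first matmul
pred = matmul(W_2, h)    # pred.creator.prev_funcs == [h.creator]
pred.backward()          # prints the two matmul grad ops in reverse order
```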
Chainer and Autograd uses the similar techniques to record the forward pass. For details please refer to the appendix. Chainer and Autograd use similar techniques to record the forward pass. For details, please refer to the appendix.
## Design choices
### 1) Dynet's List vs Pytorch's Node Graph ## Comparison: List vs. Graph
What's good about List: The list of DyNet can be considered the result of a topological sort of PyTorch's graph. Put differently, the graph is the raw representation of the tape, which gives us the chance to *prune* the parts of the graph that are irrelevant to the backward pass before the topological sort [[2]](https://openreview.net/pdf?id=BJJsrmfCZ). Consider the following example: PyTorch only does the backward pass on `SmallNet`, while DyNet does it on both `SmallNet` and `BigNet`:
1. It avoids a topological sort. One only needs to traverse the list of operators in reverse and calling the corresponding backward operator.
1. It promises effient data parallelism implementations. One could count the time of usage of a certain variable during the construction list. Then in the play back, one knows the calculation of a variable has completed. This enables communication and computation overlapping.
What's good about Node Graph:
1. More flexibility. PyTorch users can mix and match independent graphs however they like, in whatever threads they like (without explicit synchronization). An added benefit of structuring graphs this way is that when a portion of the graph becomes dead, it is automatically freed. [[2]](https://openreview.net/pdf?id=BJJsrmfCZ) Consider the following example, Pytorch only does backward on SmallNet while Dynet does both BigNet and SmallNet.
```python ```python
result = BigNet(data) result = BigNet(data)
loss = SmallNet(data) loss = SmallNet(data)
loss.backward() loss.backward()
``` ```
### 2) Dynet's Lazy evaluation vs Pytorch's Immediate evaluation ## Lazy vs. Immediate Evaluation
Another difference between DyNet and PyTorch is that DyNet lazily evaluates the forward pass, whereas PyTorch executes it immediately. Consider the following example:
Dynet builds the list in a symbolic matter. Consider the following example
```python ```python
for epoch in range(num_epochs): for epoch in range(num_epochs):
for in_words, out_label in training_data: for in_words, out_label in training_data:
...@@ -164,16 +164,17 @@ for epoch in range(num_epochs): ...@@ -164,16 +164,17 @@ for epoch in range(num_epochs):
loss_val = loss_sym.value() loss_val = loss_sym.value()
loss_sym.backward() loss_sym.backward()
``` ```
The computation of `lookup`, `concat`, `matmul` and `softmax` didn't happen until the call of `loss_sym.value()`. This deferred execution is useful because it makes some graph-level optimizations, e.g. kernel fusion, possible. The computation of `lookup`, `concat`, `matmul` and `softmax` didn't happen until the call of `loss_sym.value()`. This deferred execution is useful because it makes some graph-level optimizations, e.g. kernel fusion, possible.
Pytorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`. PyTorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`.
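The contrast can be boiled down to a few lines (a toy `Sym` class, not DyNet's API): building the expression only records it, and nothing runs until `.value()` is called.

```python
# A toy symbolic node: constructing Sym objects only records the expression;
# nothing is computed until .value() is called, mirroring DyNet's deferred
# evaluation. PyTorch would instead compute each op eagerly.
class Sym:
    def __init__(self, fn, args=()):
        self.fn, self.args = fn, args

    def value(self):
        return self.fn(*[a.value() if isinstance(a, Sym) else a
                         for a in self.args])

def const(v):
    return Sym(lambda: v)

def add(a, b):
    return Sym(lambda x, y: x + y, (a, b))

loss_sym = add(add(const(1), const(2)), const(3))  # nothing computed yet
print(loss_sym.value())                            # 6, computed on demand
```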
## What can fluid learn from them? ## Fluid: Learning the Lessons
Please refer to `paddle/contrib/dynamic/`. Please refer to `paddle/contrib/dynamic/`.
# Appendix ## Appendix
### Overview ### Overview
......
...@@ -43,6 +43,7 @@ paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', ...@@ -43,6 +43,7 @@ paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list',
paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None)) paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None))
paddle.fluid.Trainer.save_inference_model ArgSpec(args=['self', 'param_path', 'feeded_var_names', 'target_var_indexes'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None) paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.test ArgSpec(args=['self', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=None) paddle.fluid.Trainer.test ArgSpec(args=['self', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=None)
...@@ -65,7 +66,7 @@ paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'pla ...@@ -65,7 +66,7 @@ paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'pla
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0)) paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0, None))
paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None
...@@ -312,7 +313,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw ...@@ -312,7 +313,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
...@@ -376,7 +377,7 @@ paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'l ...@@ -376,7 +377,7 @@ paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'l
paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5)) paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5))
paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0)) paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0, False))
paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95)) paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95))
paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
......
...@@ -326,7 +326,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -326,7 +326,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
ir::Graph &result = *graph; ir::Graph &result = *graph;
for (auto &node : nodes) { for (auto &node : nodes) {
if (node->NodeType() == ir::Node::Type::kVariable && node->Var()) { if (node->IsVar() && node->Var()) {
all_vars_.emplace(node->Name(), node->Var()); all_vars_.emplace(node->Name(), node->Var());
} }
} }
...@@ -583,18 +583,6 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp( ...@@ -583,18 +583,6 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
} }
} }
bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const {
bool is_pg_once =
grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0;
if (is_pg_once) {
// Insert NCCL AllReduce Op
og_has_been_broadcast->insert(og);
}
return is_pg_once;
}
int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph, int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph,
ir::Node *node) const { ir::Node *node) const {
if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
...@@ -688,20 +676,6 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, ...@@ -688,20 +676,6 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
return var; return var;
} }
// Find the first occurence of `prev_op_name` and make current `op` depend
// on it.
void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op,
const std::string &prev_op_name) const {
for (auto &prev_op : result->Get<GraphOps>(kGraphOps)) {
if (prev_op->Name() == prev_op_name) {
auto *dep_var = new DummyVarHandle(result->CreateControlDepVar());
prev_op->AddOutput(dep_var);
result->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
op->AddInput(dep_var);
}
}
}
void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
ir::Node *node) const { ir::Node *node) const {
int op_dev_id = -1; int op_dev_id = -1;
......
...@@ -69,9 +69,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -69,9 +69,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
std::vector<std::string> FindDistTrainRecvVars( std::vector<std::string> FindDistTrainRecvVars(
const std::vector<ir::Node *> &nodes) const; const std::vector<ir::Node *> &nodes) const;
void ConnectOp(ir::Graph *result, OpHandleBase *op,
const std::string &prev_op_name) const;
void CreateComputationalOps(ir::Graph *result, ir::Node *node, void CreateComputationalOps(ir::Graph *result, ir::Node *node,
size_t num_places) const; size_t num_places) const;
...@@ -83,10 +80,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -83,10 +80,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
void CreateComputationalOp(ir::Graph *result, ir::Node *node, void CreateComputationalOp(ir::Graph *result, ir::Node *node,
int dev_id) const; int dev_id) const;
bool IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const;
int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const; int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const;
void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
......
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n") file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n")
file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
function(pass_library TARGET)
# Usage: pass_library(TARGET DEST); a pass whose DEST is "base" or "inference" is appended to paddle_inference_pass.h
function(pass_library TARGET DEST)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS) set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass) cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass ${op_library_DEPS})
  file(APPEND ${pass_file} "USE_PASS(${TARGET});\n") # Add more DEST values here (e.g. train, dist) to collect the USE_PASS declarations into a file automatically.
set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE) if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
message(STATUS "add pass ${TARGET} ${DEST}")
file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE)
endif()
endfunction() endfunction()
cc_library(node SRCS node.cc DEPS proto_desc) cc_library(node SRCS node.cc DEPS proto_desc)
...@@ -18,13 +25,15 @@ cc_library(pass SRCS pass.cc DEPS graph node graph_helper) ...@@ -18,13 +25,15 @@ cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
cc_library(graph_traits SRCS graph_traits.cc DEPS graph) cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits) cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
pass_library(graph_to_program_pass) pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass) pass_library(graph_viz_pass base)
pass_library(fc_fuse_pass) pass_library(fc_fuse_pass inference)
pass_library(attention_lstm_fuse_pass) pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass) pass_library(infer_clean_graph_pass inference)
pass_library(fc_lstm_fuse_pass) pass_library(fc_lstm_fuse_pass inference)
pass_library(seq_concat_fc_fuse_pass) pass_library(fc_gru_fuse_pass inference)
pass_library(seq_concat_fc_fuse_pass inference)
set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
......
...@@ -13,13 +13,10 @@ ...@@ -13,13 +13,10 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h" #include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
#include <string> #include <string>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/api/helper.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/lod_tensor.h"
namespace paddle {
namespace framework {
namespace ir {
static void BuildPattern(PDPattern* pattern, const std::string& name_scope,
bool with_fc_bias) {
PDNode* x = pattern->NewNode(name_scope, "x")
->assert_is_op_input("mul")
->assert_var_not_persistable();
auto* fc_out = patterns::FC(pattern, name_scope, x, with_fc_bias);
fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse.
patterns::GRU(pattern, name_scope, fc_out);
VLOG(3) << "fc_gru pattern \n" << pattern->DotString();
}
static int BuildFusion(Graph* graph, const std::string& name_scope,
Scope* scope, bool with_fc_bias) {
GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern();
BuildPattern(pattern, name_scope, with_fc_bias);
// Create New OpDesc
  auto gru_creator = [&](int gru, int x, int weight_x, int weight_h, int bias,
int hidden, int fc_bias) {
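    // The arguments are pattern-node ids; GET_NODE(x) resolves id `x` to the
    // matched ir::Node*, bound to the local name `x_n`.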
#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x);
GET_NODE(x);
GET_NODE(weight_x);
GET_NODE(weight_h);
GET_NODE(bias);
GET_NODE(hidden);
GET_NODE(gru);
OpDesc op_desc;
op_desc.SetType("fusion_gru");
#define NEW_NAME(x) name_scope + "/at." #x ".new"
#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__##_n->Name()});
SET_IN(X, x);
SET_IN(WeightX, weight_x);
SET_IN(WeightH, weight_h);
if (with_fc_bias) {
op_desc.SetInput("Bias", {NEW_NAME(bias) + bias_n->Name()});
} else {
SET_IN(Bias, bias);
}
#undef SET_IN
op_desc.SetInput("H0", {});
op_desc.SetOutput("Hidden", {hidden_n->Name()});
op_desc.SetAttr("is_reverse", gru_n->Op()->GetAttr("is_reverse"));
    // TODO(TJ): This should be an option for inference
op_desc.SetAttr("use_seq", true);
#define SET_INTERMEDIATE_OUT(key) op_desc.SetOutput(#key, {NEW_NAME(key)})
    SET_INTERMEDIATE_OUT(ReorderedH0);
    SET_INTERMEDIATE_OUT(XX);
    SET_INTERMEDIATE_OUT(BatchedInput);
    SET_INTERMEDIATE_OUT(BatchedOut);
#undef SET_INTERMEDIATE_OUT
auto* op = graph->CreateOpNode(&op_desc);
PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
auto* scope = graph->Get<Scope*>(kParamScopeAttr);
PADDLE_ENFORCE(scope);
if (with_fc_bias) {
// Fusion GRU bias = fcbias + grubias
auto* fusion_bias_var = scope->Var(NEW_NAME(bias) + bias_n->Name());
auto* out_bias_tensor =
fusion_bias_var->GetMutable<framework::LoDTensor>();
PADDLE_ENFORCE(fusion_bias_var);
GET_NODE(fc_bias);
PADDLE_ENFORCE(fc_bias_n);
auto* gru_bias_var = scope->FindVar(bias_n->Name());
auto* fc_bias_var = scope->FindVar(fc_bias_n->Name());
PADDLE_ENFORCE(gru_bias_var);
PADDLE_ENFORCE(fc_bias_var);
      const auto& gru_bias_tensor = gru_bias_var->Get<framework::LoDTensor>();
      const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
      // new bias = fc bias + gru bias
      out_bias_tensor->Resize(gru_bias_tensor.dims());
      auto* data = out_bias_tensor->mutable_data<float>(platform::CPUPlace());
      for (int i = 0; i < out_bias_tensor->numel(); i++) {
        data[i] =
            fc_bias_tensor.data<float>()[i] + gru_bias_tensor.data<float>()[i];
      }
}
#undef GET_NODE
#define NEW_INTERMEDIATE_OUT(key) \
  scope->Var(NEW_NAME(key))->GetMutable<framework::LoDTensor>()
    NEW_INTERMEDIATE_OUT(ReorderedH0);
    NEW_INTERMEDIATE_OUT(XX);
    NEW_INTERMEDIATE_OUT(BatchedInput);
    NEW_INTERMEDIATE_OUT(BatchedOut);
#undef NEW_NAME
#undef NEW_INTERMEDIATE_OUT
IR_NODE_LINK_TO(x_n, op);
IR_NODE_LINK_TO(weight_x_n, op);
IR_NODE_LINK_TO(weight_h_n, op);
    IR_NODE_LINK_TO(bias_n, op);  // should actually link to the merged bias when with_fc_bias is set
IR_NODE_LINK_TO(op, hidden_n);
// h0?
return op;
};
int fusion_count{0};
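  // Called once per matched subgraph: GET_NODE resolves every captured node,
  // gru_creator emits the fused fusion_gru op, and the now-dead intermediate
  // nodes are removed from the graph.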
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
#define GET_NODE(name__) \
std::string name__##key = name_scope + "/" + #name__; \
auto* name__##n = pattern->RetrieveNode(name__##key); \
PADDLE_ENFORCE(name__##n); \
PADDLE_ENFORCE(subgraph.count(name__##n)); \
Node* name__##_n = subgraph.at(name__##n); \
int name__ __attribute__((unused)) = name__##_n->id();
GET_NODE(x);
GET_NODE(w); // fc weight
GET_NODE(mul);
GET_NODE(fc_out);
GET_NODE(Weight);
GET_NODE(gru);
GET_NODE(Bias);
GET_NODE(Hidden);
// nodes need be removed
GET_NODE(BatchGate);
GET_NODE(BatchResetHiddenPrev);
GET_NODE(BatchHidden);
if (with_fc_bias) {
GET_NODE(mul_out);
GET_NODE(fc_bias);
GET_NODE(elementwise_add);
      gru_creator(gru, x, w, Weight, Bias, Hidden, fc_bias);
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes(
{mul_n, gru_n, elementwise_add_n, fc_bias_n, fc_out_n, mul_out_n,
BatchGate_n, BatchResetHiddenPrev_n, BatchHidden_n});
GraphSafeRemoveNodes(graph, marked_nodes);
} else {
      gru_creator(gru, x, w, Weight, Bias, Hidden, -1);
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes(
{mul_n, gru_n, BatchGate_n, BatchResetHiddenPrev_n, BatchHidden_n});
GraphSafeRemoveNodes(graph, marked_nodes);
}
#undef GET_NODE
++fusion_count;
};
gpd(graph, handler);
return fusion_count;
}
std::unique_ptr<ir::Graph> MulGRUFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
false /*with_fc_bias*/);
AddStatis(fusion_count);
return graph;
}
std::unique_ptr<ir::Graph> FCGRUFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
true /*with_fc_bias*/);
AddStatis(fusion_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass);
REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
// Both FCGRUFusePass and MulGRUFusePass fuse to the same fusion_gru op;
// FCGRUFusePass additionally folds the FC bias into the fused op.
class FCGRUFusePass : public FusePassBase {
public:
virtual ~FCGRUFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"fc_gru_fuse"};
};
// Just FC without bias
class MulGRUFusePass : public FusePassBase {
public:
virtual ~MulGRUFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"fc_nobias_gru_fuse"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -20,12 +20,13 @@ namespace paddle { ...@@ -20,12 +20,13 @@ namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
std::string GenNodeName(const std::string& prefix, const std::string& name) { static std::string GenNodeName(const std::string& prefix,
const std::string& name) {
return prefix + "/" + name; return prefix + "/" + name;
} }
void BuildPattern(PDPattern* pattern, const std::string& name_scope, static void BuildPattern(PDPattern* pattern, const std::string& name_scope,
bool with_fc_bias) { bool with_fc_bias) {
PDNode* x = pattern->NewNode(name_scope, "x") PDNode* x = pattern->NewNode(name_scope, "x")
->assert_is_op_input("mul") ->assert_is_op_input("mul")
->assert_var_not_persistable(); ->assert_var_not_persistable();
...@@ -35,8 +36,8 @@ void BuildPattern(PDPattern* pattern, const std::string& name_scope, ...@@ -35,8 +36,8 @@ void BuildPattern(PDPattern* pattern, const std::string& name_scope,
// LOG(INFO) << "\n" << pattern->DotString(); // LOG(INFO) << "\n" << pattern->DotString();
} }
int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, static int BuildFusion(Graph* graph, const std::string& name_scope,
bool with_fc_bias) { Scope* scope, bool with_fc_bias) {
GraphPatternDetector gpd; GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern(); auto* pattern = gpd.mutable_pattern();
...@@ -87,15 +88,24 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, ...@@ -87,15 +88,24 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
} }
op_desc.SetInput("Bias", {new_bias_var}); op_desc.SetInput("Bias", {new_bias_var});
} }
#undef GET_NODE #undef GET_NODE
// Create temp variables.
scope->Var(name_scope + "/BatchedInput.new")
->GetMutable<framework::LoDTensor>();
scope->Var(name_scope + "/BatchCellPreAct.new")
->GetMutable<framework::LoDTensor>();
scope->Var(name_scope + "/BatchedGate.new")
->GetMutable<framework::LoDTensor>();
op_desc.SetInput("H0", {}); op_desc.SetInput("H0", {});
op_desc.SetInput("C0", {}); op_desc.SetInput("C0", {});
op_desc.SetOutput("Hidden", {hidden_n->Name()}); op_desc.SetOutput("Hidden", {hidden_n->Name()});
op_desc.SetOutput("Cell", {cell_n->Name()}); op_desc.SetOutput("Cell", {cell_n->Name()});
op_desc.SetOutput("XX", {xx_n->Name()}); op_desc.SetOutput("XX", {xx_n->Name()});
op_desc.SetOutput("BatchedInput", {"blstm_0.tmp_2"}); op_desc.SetOutput("BatchedGate", {name_scope + "/BatchedGate.new"});
op_desc.SetOutput("BatchCellPreAct", {name_scope + "/BatchCellPreAct.new"});
op_desc.SetOutput("BatchedInput", {name_scope + "/BatchedInput.new"});
op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse")); op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse"));
op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes")); op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes"));
// TODO(TJ): get from attr // TODO(TJ): get from attr
...@@ -131,8 +141,8 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, ...@@ -131,8 +141,8 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
int fusion_count{0}; int fusion_count{0};
auto fc_no_bias_handler = [&]( auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { Graph* g) {
#define GET_NODE(name__) \ #define GET_NODE(name__) \
std::string name__##key = name_scope + "/" + #name__; \ std::string name__##key = name_scope + "/" + #name__; \
auto* name__##n = pattern->RetrieveNode(name__##key); \ auto* name__##n = pattern->RetrieveNode(name__##key); \
...@@ -153,21 +163,24 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, ...@@ -153,21 +163,24 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
if (with_fc_bias) { if (with_fc_bias) {
GET_NODE(fc_bias); GET_NODE(fc_bias);
GET_NODE(elementwise_add);
lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, fc_bias); lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, fc_bias);
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes(
{mul_n, lstm_n, elementwise_add_n});
GraphSafeRemoveNodes(graph, marked_nodes);
} else { } else {
lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, -1); lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, -1);
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes({mul_n, lstm_n});
GraphSafeRemoveNodes(graph, marked_nodes);
} }
#undef GET_NODE #undef GET_NODE
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes({mul_n, lstm_n});
GraphSafeRemoveNodes(graph, marked_nodes);
++fusion_count; ++fusion_count;
}; };
gpd(graph, fc_no_bias_handler); gpd(graph, handler);
return fusion_count; return fusion_count;
} }
......
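// Side note on the "Create temp variables" hunk above (an illustrative
// sketch, not part of the commit): the fused op writes into fresh scope
// variables, which must exist before the kernel first runs. Names below are
// hypothetical.
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"

static void PrepareFusionTemps(paddle::framework::Scope* scope) {
  // Materializing each variable as a LoDTensor is enough; the fused kernel
  // resizes them on its first run.
  scope->Var("fc_lstm_fuse/BatchedInput.new")
      ->GetMutable<paddle::framework::LoDTensor>();
  scope->Var("fc_lstm_fuse/BatchedGate.new")
      ->GetMutable<paddle::framework::LoDTensor>();
  scope->Var("fc_lstm_fuse/BatchCellPreAct.new")
      ->GetMutable<paddle::framework::LoDTensor>();
}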
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
......
...@@ -73,7 +73,6 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) { ...@@ -73,7 +73,6 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) {
void GraphPatternDetector::operator()(Graph* graph, void GraphPatternDetector::operator()(Graph* graph,
GraphPatternDetector::handle_t handler) { GraphPatternDetector::handle_t handler) {
if (!MarkPDNodesInGraph(*graph)) { if (!MarkPDNodesInGraph(*graph)) {
LOG(INFO) << "Mark failed";
return; return;
} }
...@@ -86,7 +85,7 @@ void GraphPatternDetector::operator()(Graph* graph, ...@@ -86,7 +85,7 @@ void GraphPatternDetector::operator()(Graph* graph,
LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern"; LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern";
int id = 0; int id = 0;
for (auto& g : subgraphs) { for (auto& g : subgraphs) {
LOG(INFO) << "optimizing #" << id++ << " subgraph"; VLOG(3) << "optimizing #" << id++ << " subgraph";
handler(g, graph); handler(g, graph);
} }
} }
...@@ -520,76 +519,96 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) { ...@@ -520,76 +519,96 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) {
PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope, PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope,
PDNode* x, bool with_bias) { PDNode* x, bool with_bias) {
// Create Operators // mul op
PDNode* elementwise_add_op{nullptr};
auto* mul_op = pattern->NewNode(name_scope, "mul")->assert_is_op("mul"); auto* mul_op = pattern->NewNode(name_scope, "mul")->assert_is_op("mul");
if (with_bias) {
elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add")
->assert_is_op("elementwise_add");
}
// Create variables
// w
auto* mul_weight_var = pattern->NewNode(name_scope, "w") auto* mul_weight_var = pattern->NewNode(name_scope, "w")
->AsInput() ->AsInput()
->assert_is_persistable_var() ->assert_is_persistable_var()
->assert_is_op_nth_input("mul", "Y", 0); ->assert_is_op_input("mul", "Y");
PDNode* mul_out_var{nullptr};
PDNode* fc_out{nullptr};
if (with_bias) { if (with_bias) {
PDNode* elementwise_add_op{nullptr};
PDNode *mul_out_var{nullptr}, *bias{nullptr};
elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add")
->assert_is_op("elementwise_add");
// intermediate variable, will be removed in the IR after fuse. // intermediate variable, will be removed in the IR after fuse.
mul_out_var = pattern->NewNode(name_scope, "mul_out") mul_out_var = pattern->NewNode(name_scope, "mul_out")
->AsIntermediate() ->AsIntermediate()
->assert_is_only_output_of_op("mul") ->assert_is_only_output_of_op("mul")
->assert_is_op_input("elementwise_add"); ->assert_is_op_input("elementwise_add");
}
PDNode *bias{nullptr}, *fc_out{nullptr};
if (with_bias) {
// bias // bias
bias = pattern->NewNode(name_scope, "fc_bias") bias = pattern->NewNode(name_scope, "fc_bias")
->assert_is_op_input("elementwise_add") ->AsInput()
->AsInput(); ->assert_is_op_input("elementwise_add");
// output // output
fc_out = pattern->NewNode(name_scope, "fc_out") fc_out = pattern->NewNode(name_scope, "fc_out")
->AsOutput() ->AsOutput()
->assert_is_op_output("elementwise_add"); ->assert_is_op_output("elementwise_add");
mul_op->LinksFrom({x, mul_weight_var}).LinksTo({mul_out_var});
elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out});
} else { } else {
fc_out = pattern->NewNode(name_scope, "fc_out") fc_out = pattern->NewNode(name_scope, "fc_out")
->AsOutput() ->AsOutput()
->assert_is_op_output("mul"); ->assert_is_op_output("mul");
}
if (with_bias) {
mul_op->LinksFrom({mul_weight_var, x}).LinksTo({mul_out_var});
elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out});
} else {
mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out}); mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out});
} }
return fc_out; return fc_out;
} }
#define NEW_NODE(op__, arg__, io__) \
auto* arg__ = pattern->NewNode(name_scope, #arg__) \
->assert_is_op_##io__(#op__, #arg__);
PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope,
PDNode* x) { PDNode* x) {
x->assert_is_op_input("lstm", "Input"); x->assert_is_op_input("lstm", "Input");
auto* lstm_op = pattern->NewNode(name_scope, "lstm")->assert_is_op("lstm"); auto* lstm_op = pattern->NewNode(name_scope, "lstm")->assert_is_op("lstm");
#define NEW_NODE(arg__, io__) \
auto* arg__ = pattern->NewNode(name_scope, #arg__) \
->assert_is_op_##io__("lstm", #arg__);
// Currently, the H0 and C0 are optional // Currently, the H0 and C0 are optional
// TODO(Superjomn) upgrade the fuse framework to support optional. // TODO(Superjomn) upgrade the fuse framework to support optional.
// NEW_NODE(H0, input); // NEW_NODE(H0, input);
// NEW_NODE(C0, input); // NEW_NODE(C0, input);
NEW_NODE(Weight, input); NEW_NODE(lstm, Weight, input);
NEW_NODE(Bias, input); NEW_NODE(lstm, Bias, input);
NEW_NODE(Hidden, output); NEW_NODE(lstm, Hidden, output);
NEW_NODE(Cell, output); NEW_NODE(lstm, Cell, output);
NEW_NODE(BatchGate, output); NEW_NODE(lstm, BatchGate, output);
NEW_NODE(BatchCellPreAct, output); NEW_NODE(lstm, BatchCellPreAct, output);
lstm_op->LinksFrom({x, Weight, Bias}); lstm_op->LinksFrom({x, Weight, Bias});
lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct}); lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct});
return Hidden; return Hidden;
} }
PDNode* patterns::GRU(PDPattern* pattern, const std::string& name_scope,
PDNode* x) {
x->assert_is_op_input("gru", "Input");
auto* gru_op = pattern->NewNode(name_scope, "gru")->assert_is_op("gru");
NEW_NODE(gru, Weight, input);
// TODO(Superjomn): upgrade the fuse framework to support optional.
// H0 and bias are optional
NEW_NODE(gru, Bias, input); // also optional
// NEW_NODE(H0, input);
NEW_NODE(gru, Hidden, output);
// below are intermediate
NEW_NODE(gru, BatchGate, output);
NEW_NODE(gru, BatchResetHiddenPrev, output);
NEW_NODE(gru, BatchHidden, output);
BatchGate->AsIntermediate();
BatchResetHiddenPrev->AsIntermediate();
BatchHidden->AsIntermediate();
gru_op->LinksFrom({x, Weight, Bias});
gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden});
return Hidden;
}
#undef NEW_NODE
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
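// Usage sketch (illustrative, scope name hypothetical): the pattern helpers
// compose the same way the fuse passes' BuildPattern functions do. Assuming a
// caller-provided Graph*, an FC + GRU chain can be described and matched so:
static void MatchFCGRU(paddle::framework::ir::Graph* g) {
  using paddle::framework::ir::GraphPatternDetector;
  using paddle::framework::ir::PDNode;
  GraphPatternDetector gpd;
  auto* pattern = gpd.mutable_pattern();
  PDNode* x = pattern->NewNode("fc_gru", "x")
                  ->assert_is_op_input("mul")
                  ->assert_var_not_persistable();
  PDNode* fc_out =
      paddle::framework::ir::patterns::FC(pattern, "fc_gru", x, true);
  fc_out->AsIntermediate()->assert_is_op_input("gru", "Input");
  paddle::framework::ir::patterns::GRU(pattern, "fc_gru", fc_out);
  gpd(g, [&](const GraphPatternDetector::subgraph_t& subgraph,
             paddle::framework::ir::Graph* graph) {
    // Matched nodes are fetched by key, e.g.
    // pattern->RetrieveNode("fc_gru/Weight").
  });
}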
...@@ -19,6 +19,9 @@ ...@@ -19,6 +19,9 @@
#endif #endif
#include <numeric> #include <numeric>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/inference/analysis/dot.h"
...@@ -295,6 +298,8 @@ PDNode* FC(PDPattern* pattern, const std::string& name_scope, PDNode* x, ...@@ -295,6 +298,8 @@ PDNode* FC(PDPattern* pattern, const std::string& name_scope, PDNode* x,
PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x); PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x);
PDNode* GRU(PDPattern* pattern, const std::string& name_scope, PDNode* x);
} // namespace patterns } // namespace patterns
#define IR_NODE_LINK_TO(a, b) \ #define IR_NODE_LINK_TO(a, b) \
......
...@@ -50,20 +50,37 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl( ...@@ -50,20 +50,37 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
Dot dot; Dot dot;
std::vector<Dot::Attr> op_attrs({Dot::Attr("style", "filled"), const std::vector<Dot::Attr> op_attrs({
Dot::Attr("shape", "box"), Dot::Attr("style", "rounded,filled,bold"), //
Dot::Attr("fillcolor", "red")}); Dot::Attr("shape", "box"), //
std::vector<Dot::Attr> var_attrs({Dot::Attr("style", "filled,rounded"), Dot::Attr("color", "#303A3A"), //
// Dot::Attr("shape", "diamond"), Dot::Attr("fontcolor", "#ffffff"), //
Dot::Attr("fillcolor", "yellow")}); Dot::Attr("width", "1.3"), //
Dot::Attr("height", "0.84"), //
std::vector<Dot::Attr> marked_op_attrs({Dot::Attr("style", "filled"), Dot::Attr("fontname", "Arial"), //
Dot::Attr("shape", "box"), });
Dot::Attr("fillcolor", "lightgray")}); const std::vector<Dot::Attr> arg_attrs({
std::vector<Dot::Attr> marked_var_attrs( Dot::Attr("shape", "box"), //
{Dot::Attr("style", "filled,rounded"), Dot::Attr("style", "rounded,filled,bold"), //
// Dot::Attr("shape", "diamond"), Dot::Attr("fontname", "Arial"), //
Dot::Attr("fillcolor", "lightgray")}); Dot::Attr("fillcolor", "#999999"), //
Dot::Attr("color", "#dddddd"), //
});
const std::vector<Dot::Attr> param_attrs({
Dot::Attr("shape", "box"), //
Dot::Attr("style", "rounded,filled,bold"), //
Dot::Attr("fontname", "Arial"), //
Dot::Attr("color", "#148b97"), //
Dot::Attr("fontcolor", "#ffffff"), //
});
const std::vector<Dot::Attr> marked_op_attrs(
{Dot::Attr("style", "rounded,filled,bold"), Dot::Attr("shape", "box"),
Dot::Attr("fillcolor", "yellow")});
const std::vector<Dot::Attr> marked_var_attrs(
{Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"),
Dot::Attr("fillcolor", "yellow")});
auto marked_nodes = ConsumeMarkedNodes(graph.get()); auto marked_nodes = ConsumeMarkedNodes(graph.get());
// Create nodes // Create nodes
...@@ -74,9 +91,17 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl( ...@@ -74,9 +91,17 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
marked_nodes.count(n) ? marked_op_attrs : op_attrs; marked_nodes.count(n) ? marked_op_attrs : op_attrs;
dot.AddNode(node_id, attr, node_id); dot.AddNode(node_id, attr, node_id);
} else if (n->IsVar()) { } else if (n->IsVar()) {
decltype(op_attrs) attr = decltype(op_attrs)* attr;
marked_nodes.count(n) ? marked_var_attrs : var_attrs; if (marked_nodes.count(n)) {
dot.AddNode(node_id, attr, node_id); attr = &marked_var_attrs;
} else if (const_cast<Node*>(n)->Var() &&
const_cast<Node*>(n)->Var()->Persistable()) {
attr = &param_attrs;
} else {
attr = &arg_attrs;
}
dot.AddNode(node_id, *attr, node_id);
} }
node2dot[n] = node_id; node2dot[n] = node_id;
} }
......
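// For reference, a minimal sketch of the Dot helper used above, which emits
// plain Graphviz text (node ids hypothetical; attr vectors as defined above;
// Build() assumed per the inference::analysis::Dot API):
Dot dot;
dot.AddNode("fc_0", op_attrs, "fc_0");
dot.AddNode("fc_0.out", arg_attrs, "fc_0.out");
dot.AddEdge("fc_0", "fc_0.out", {});
std::string dot_text = dot.Build();
// Render with: dot -Tpng -o graph.png graph.dot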
...@@ -6,6 +6,7 @@ cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits ...@@ -6,6 +6,7 @@ cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits
analyzer.cc analyzer.cc
helper.cc helper.cc
# passes # passes
analysis_pass.cc
fluid_to_data_flow_graph_pass.cc fluid_to_data_flow_graph_pass.cc
data_flow_graph_to_fluid_pass.cc data_flow_graph_to_fluid_pass.cc
dfg_graphviz_draw_pass.cc dfg_graphviz_draw_pass.cc
...@@ -58,7 +59,7 @@ endif() ...@@ -58,7 +59,7 @@ endif()
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
--infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt) --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)
inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
...@@ -74,25 +75,42 @@ inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) ...@@ -74,25 +75,42 @@ inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz") set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz") set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE) set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE)
if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING) if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz") inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz")
inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz") inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz")
endif() endif()
inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
--infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt) --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz") set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz") set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE) set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE)
if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING) if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz") inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz")
inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz") inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz")
endif() endif()
inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
ARGS --infer_model=${LAC_INSTALL_DIR}/model ARGS --infer_model=${LAC_INSTALL_DIR}/model
--infer_data=${LAC_INSTALL_DIR}/data.txt) --infer_data=${LAC_INSTALL_DIR}/data.txt)
set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz")
set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification" CACHE PATH "Text Classification model and data root." FORCE)
if (NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text-classification-Senta.tar.gz")
inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_DATA_URL} "text_classification_data.txt.tar.gz")
endif()
inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
--infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt
--topn=1 # Just run top 1 batch.
)
...@@ -12,4 +12,4 @@ ...@@ -12,4 +12,4 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/analysis_pass.h"
...@@ -28,10 +28,10 @@ namespace paddle { ...@@ -28,10 +28,10 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
class Pass { class AnalysisPass {
public: public:
Pass() = default; AnalysisPass() = default;
virtual ~Pass() = default; virtual ~AnalysisPass() = default;
// Mutable Pass. // Mutable Pass.
virtual bool Initialize(Argument *argument) { return false; } virtual bool Initialize(Argument *argument) { return false; }
// Readonly Pass. // Readonly Pass.
...@@ -42,23 +42,16 @@ class Pass { ...@@ -42,23 +42,16 @@ class Pass {
virtual bool Finalize() { return false; } virtual bool Finalize() { return false; }
// Get a Pass appropriate to print the Node this pass operates on. // Get a Pass appropriate to print the Node this pass operates on.
virtual Pass *CreatePrinterPass(std::ostream &os, virtual AnalysisPass *CreatePrinterPass(std::ostream &os,
const std::string &banner) const { const std::string &banner) const {
return nullptr; return nullptr;
} }
// Create a debugger Pass that draws the DFG by graphviz toolkit. // Create a debugger Pass that draws the DFG by graphviz toolkit.
virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; } virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; }
virtual void Run() { LOG(FATAL) << "not valid"; }
// Run on a single Node.
virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
// Run on a single Function.
virtual void Run(Function *x) { LOG(FATAL) << "not valid"; }
// Run on a single FunctionBlock.
virtual void Run(FunctionBlock *x) { LOG(FATAL) << "not valid"; }
// Run on a single DataFlowGraph. // Run on a single DataFlowGraph.
virtual void Run(DataFlowGraph *x) { LOG(FATAL) << "not valid"; } virtual void Run(DataFlowGraph *x) = 0;
// Human-readable short representation. // Human-readable short representation.
virtual std::string repr() const = 0; virtual std::string repr() const = 0;
...@@ -66,29 +59,8 @@ class Pass { ...@@ -66,29 +59,8 @@ class Pass {
virtual std::string description() const { return "No DOC"; } virtual std::string description() const { return "No DOC"; }
}; };
// NodePass process on any Node types.
class NodePass : public Pass {
public:
virtual void Run(Node *node) = 0;
};
// NodePass process on any Function node types.
class FunctionPass : public Pass {
public:
virtual void Run(Function *node) = 0;
};
// NodePass process on any FunctionBlock node types.
class FunctionBlockPass : public Pass {
public:
virtual void Run(FunctionBlock *node) = 0;
};
// GraphPass processes on any GraphType. // GraphPass processes on any GraphType.
class DataFlowGraphPass : public Pass { class DataFlowGraphPass : public AnalysisPass {};
public:
virtual void Run(DataFlowGraph *graph) = 0;
};
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
......
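// A minimal pass written against the slimmed-down AnalysisPass hierarchy (an
// illustrative sketch; the counting body mirrors the node iteration used by
// the testers later in this commit):
class FunctionCounterPass final : public DataFlowGraphPass {
 public:
  bool Initialize(Argument *argument) override { return true; }
  void Run(DataFlowGraph *graph) override {
    int num_functions = 0;
    for (auto &node : graph->nodes.nodes()) {
      if (node->IsFunction()) ++num_functions;
    }
    LOG(INFO) << "graph holds " << num_functions << " function nodes";
  }
  bool Finalize() override { return true; }
  std::string repr() const override { return "function-counter"; }
  std::string description() const override {
    return "Counts function nodes in a DataFlowGraph.";
  }
};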
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include <string> #include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
...@@ -41,27 +43,23 @@ class DfgPassManagerImpl final : public DfgPassManager { ...@@ -41,27 +43,23 @@ class DfgPassManagerImpl final : public DfgPassManager {
public: public:
DfgPassManagerImpl() { DfgPassManagerImpl() {
// TODO(Superjomn) set the key with pass reprs. // TODO(Superjomn) set the key with pass reprs.
LOG(INFO) if (!FLAGS_IA_enable_ir) {
<< "-----------------------------------------------------------------";
if (FLAGS_IA_enable_ir) {
AddPass("fluid-to-ir-pass", new FluidToIrPass);
} else {
AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
} else {
AddPass("fluid-to-ir-pass", new FluidToIrPass);
} }
TryAddTensorRtPass(); TryAddTensorRtPass();
AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
if (!FLAGS_IA_output_storage_path.empty()) { if (!FLAGS_IA_output_storage_path.empty()) {
AddPass("model-store-pass", new ModelStorePass); AddPass("model-store-pass", new ModelStorePass);
} }
LOG(INFO)
<< "-----------------------------------------------------------------";
} }
std::string repr() const override { return "dfg-pass-manager"; } std::string repr() const override { return "dfg-pass-manager"; }
std::string description() const override { return "DFG pass manager."; } std::string description() const override { return "DFG pass manager."; }
private: private:
void AddPass(const std::string& name, Pass* pass) { void AddPass(const std::string& name, AnalysisPass* pass) {
VLOG(3) << "Adding pass " << name; VLOG(3) << "Adding pass " << name;
Register(name, pass); Register(name, pass);
AddGraphvizDebugerPass(pass); AddGraphvizDebugerPass(pass);
...@@ -90,7 +88,7 @@ class DfgPassManagerImpl final : public DfgPassManager { ...@@ -90,7 +88,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
} }
// Add the graphviz debugger pass if the parent pass has one. // Add the graphviz debugger pass if the parent pass has one.
void AddGraphvizDebugerPass(Pass* pass) { void AddGraphvizDebugerPass(AnalysisPass* pass) {
auto* debuger_pass = pass->CreateGraphvizDebugerPass(); auto* debuger_pass = pass->CreateGraphvizDebugerPass();
if (debuger_pass) { if (debuger_pass) {
Register(debuger_pass->repr(), debuger_pass); Register(debuger_pass->repr(), debuger_pass);
...@@ -101,19 +99,15 @@ class DfgPassManagerImpl final : public DfgPassManager { ...@@ -101,19 +99,15 @@ class DfgPassManagerImpl final : public DfgPassManager {
Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); } Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
void Analyzer::Run(Argument* argument) { void Analyzer::Run(Argument* argument) {
// Ugly support fluid-to-ir-pass std::vector<std::string> passes;
argument->Set(kFluidToIrPassesAttr, for (auto& pass : all_ir_passes_) {
new std::vector<std::string>({ if (!disabled_ir_passes_.count(pass)) {
// Manual update the passes here. passes.push_back(pass);
"graph_viz_pass", // passes.push_back("graph_viz_pass"); // add graphviz for debug.
"infer_clean_graph_pass", "graph_viz_pass", // }
"attention_lstm_fuse_pass", "graph_viz_pass", // }
"fc_lstm_fuse_pass", "graph_viz_pass", // passes.push_back("graph_viz_pass");
"mul_lstm_fuse_pass", "graph_viz_pass", // argument->Set(kFluidToIrPassesAttr, new std::vector<std::string>(passes));
"seq_concat_fc_fuse_pass", "graph_viz_pass", //
"fc_fuse_pass", "graph_viz_pass" //
}));
for (auto& x : data_) { for (auto& x : data_) {
PADDLE_ENFORCE(x->Initialize(argument)); PADDLE_ENFORCE(x->Initialize(argument));
...@@ -122,6 +116,11 @@ void Analyzer::Run(Argument* argument) { ...@@ -122,6 +116,11 @@ void Analyzer::Run(Argument* argument) {
} }
} }
Analyzer& Analyzer::DisableIrPasses(const std::vector<std::string>& passes) {
disabled_ir_passes_.insert(passes.begin(), passes.end());
return *this;
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
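// Usage sketch for the new DisableIrPasses API (the Argument setup is
// elided; `argument` stands for one prepared by the caller):
Analyzer analyzer;
analyzer.DisableIrPasses({"mul_gru_fuse_pass", "mul_lstm_fuse_pass"});
analyzer.Run(&argument);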
...@@ -36,16 +36,12 @@ limitations under the License. */ ...@@ -36,16 +36,12 @@ limitations under the License. */
*/ */
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include "paddle/fluid/inference/analysis/pass.h" #include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/flags.h"
#include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/pass_manager.h"
// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
// flag if not available.
DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
DECLARE_string(IA_graphviz_log_root);
DECLARE_string(IA_output_storage_path);
DECLARE_bool(IA_enable_ir);
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
...@@ -57,7 +53,28 @@ class Analyzer : public OrderedRegistry<PassManager> { ...@@ -57,7 +53,28 @@ class Analyzer : public OrderedRegistry<PassManager> {
void Run(Argument* argument); void Run(Argument* argument);
Analyzer& DisableIrPasses(const std::vector<std::string>& passes);
DISABLE_COPY_AND_ASSIGN(Analyzer); DISABLE_COPY_AND_ASSIGN(Analyzer);
private:
  // All available IR passes.
  // The larger fusions come first so that small operators are preferentially
  // merged into larger fused ops; applying the smaller fusions afterwards
  // cannot break the patterns of the larger ones.
const std::vector<std::string> all_ir_passes_{{
// Manual update the passes here.
"infer_clean_graph_pass", //
"attention_lstm_fuse_pass", //
"fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", //
"fc_gru_fuse_pass", //
"mul_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", //
"fc_fuse_pass", //
}};
std::unordered_set<std::string> disabled_ir_passes_;
}; };
} // namespace analysis } // namespace analysis
......
...@@ -11,13 +11,14 @@ ...@@ -11,13 +11,14 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path for LAC"); DEFINE_string(infer_model, "", "model path for LAC");
...@@ -102,6 +103,7 @@ struct DataRecord { ...@@ -102,6 +103,7 @@ struct DataRecord {
return data; return data;
} }
}; };
void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data, void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) { int batch_size) {
auto one_batch = data->NextBatch(); auto one_batch = data->NextBatch();
...@@ -114,12 +116,7 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -114,12 +116,7 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
PADDLE_ENFORCE_EQ(batch_size, static_cast<int>(one_batch.lod.size() - 1)); PADDLE_ENFORCE_EQ(batch_size, static_cast<int>(one_batch.lod.size() - 1));
input_slots->assign({input_tensor}); input_slots->assign({input_tensor});
} }
static void PrintTime(const double latency, const int bs, const int repeat) {
LOG(INFO) << "===========profile result===========";
LOG(INFO) << "batch_size: " << bs << ", repeat: " << repeat
<< ", avg latency: " << latency / repeat << "ms";
LOG(INFO) << "=====================================";
}
void BenchAllData(const std::string &model_path, const std::string &data_file, void BenchAllData(const std::string &model_path, const std::string &data_file,
const int batch_size, const int repeat) { const int batch_size, const int repeat) {
NativeConfig config; NativeConfig config;
...@@ -145,19 +142,18 @@ void BenchAllData(const std::string &model_path, const std::string &data_file, ...@@ -145,19 +142,18 @@ void BenchAllData(const std::string &model_path, const std::string &data_file,
sum += timer.toc(); sum += timer.toc();
} }
} }
PrintTime(sum, batch_size, repeat); PrintTime(batch_size, repeat, 1, 0, sum / repeat);
} }
const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43, 25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39, 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23}; 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
void TestLACPrediction(const std::string &model_path, void TestLACPrediction(const std::string &model_path,
const std::string &data_file, const int batch_size, const std::string &data_file, const int batch_size,
const int repeat, bool test_all_data) { const int repeat, bool test_all_data,
if (test_all_data) { bool use_analysis = false) {
BenchAllData(model_path, data_file, batch_size, repeat);
return;
}
NativeConfig config; NativeConfig config;
config.model_dir = model_path; config.model_dir = model_path;
config.use_gpu = false; config.use_gpu = false;
...@@ -166,17 +162,47 @@ void TestLACPrediction(const std::string &model_path, ...@@ -166,17 +162,47 @@ void TestLACPrediction(const std::string &model_path,
std::vector<PaddleTensor> input_slots, outputs_slots; std::vector<PaddleTensor> input_slots, outputs_slots;
DataRecord data(data_file, batch_size); DataRecord data(data_file, batch_size);
GetOneBatch(&input_slots, &data, batch_size); GetOneBatch(&input_slots, &data, batch_size);
auto predictor = std::unique_ptr<PaddlePredictor> predictor;
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); if (use_analysis) {
AnalysisConfig cfg;
cfg.model_dir = model_path;
cfg.use_gpu = false;
cfg.device = 0;
cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
} else {
predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
}
for (int i = 0; i < FLAGS_burning; i++) { for (int i = 0; i < FLAGS_burning; i++) {
predictor->Run(input_slots, &outputs_slots); predictor->Run(input_slots, &outputs_slots);
} }
Timer timer; Timer timer;
if (test_all_data) {
double sum = 0;
LOG(INFO) << "Total number of samples: " << data.datasets.size();
for (int i = 0; i < repeat; i++) {
for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
GetOneBatch(&input_slots, &data, batch_size);
timer.tic();
predictor->Run(input_slots, &outputs_slots);
sum += timer.toc();
}
}
PrintTime(batch_size, repeat, 1, 0, sum / repeat);
LOG(INFO) << "Average latency of each sample: "
<< sum / repeat / data.datasets.size() << " ms";
return;
}
timer.tic(); timer.tic();
for (int i = 0; i < repeat; i++) { for (int i = 0; i < repeat; i++) {
predictor->Run(input_slots, &outputs_slots); predictor->Run(input_slots, &outputs_slots);
} }
PrintTime(timer.toc(), batch_size, repeat); PrintTime(batch_size, repeat, 1, 0, timer.toc() / repeat);
// check result
EXPECT_EQ(outputs_slots.size(), 1UL); EXPECT_EQ(outputs_slots.size(), 1UL);
auto &out = outputs_slots[0]; auto &out = outputs_slots[0];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
...@@ -188,12 +214,60 @@ void TestLACPrediction(const std::string &model_path, ...@@ -188,12 +214,60 @@ void TestLACPrediction(const std::string &model_path,
for (size_t i = 0; i < batch1_size; ++i) { for (size_t i = 0; i < batch1_size; ++i) {
EXPECT_EQ(pdata[i], lac_ref_data[i]); EXPECT_EQ(pdata[i], lac_ref_data[i]);
} }
if (use_analysis) {
    // Run the native predictor once as the reference for comparison.
auto ref_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run(input_slots, &ref_outputs_slots);
EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
auto &ref_out = ref_outputs_slots[0];
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_EQ(size, ref_size);
int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
for (size_t i = 0; i < size; ++i) {
EXPECT_EQ(pdata_ref[i], pdata[i]);
}
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
auto &fuse_statis = analysis_predictor->analysis_argument()
.Get<std::unordered_map<std::string, int>>(
framework::ir::kFuseStatisAttr);
for (auto &item : fuse_statis) {
LOG(INFO) << "fused " << item.first << " " << item.second;
}
int num_ops = 0;
for (auto &node :
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
if (node->IsFunction()) {
++num_ops;
}
}
LOG(INFO) << "has num ops: " << num_ops;
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4);
EXPECT_EQ(num_ops, 11);
}
} }
TEST(Analyzer_LAC, native) { TEST(Analyzer_LAC, native) {
LOG(INFO) << "LAC with native"; LOG(INFO) << "LAC with native";
TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
FLAGS_repeat, FLAGS_test_all_data); FLAGS_repeat, FLAGS_test_all_data);
} }
TEST(Analyzer_LAC, analysis) {
LOG(INFO) << "LAC with analysis";
TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
FLAGS_repeat, FLAGS_test_all_data, true);
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
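// The ad-hoc profile logging above was replaced by the shared PrintTime
// helper from api/helper.h; as used throughout these tests its argument
// order is (batch_size, repeat, num_threads, thread_id, latency_ms), e.g.:
PrintTime(/*batch_size=*/1, /*repeat=*/100, /*num_threads=*/1, /*tid=*/0,
          /*latency=*/12.5);  // values illustrative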
...@@ -13,18 +13,19 @@ ...@@ -13,18 +13,19 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path"); DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data path"); DEFINE_string(infer_data, "", "data path");
DEFINE_int32(batch_size, 10, "batch size."); DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times."); DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -35,6 +36,7 @@ struct DataRecord { ...@@ -35,6 +36,7 @@ struct DataRecord {
std::vector<size_t> lod; // two inputs have the same lod info. std::vector<size_t> lod; // two inputs have the same lod info.
size_t batch_iter{0}; size_t batch_iter{0};
size_t batch_size{1}; size_t batch_size{1};
size_t num_samples; // total number of samples
DataRecord() = default; DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1) explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) { : batch_size(batch_size) {
...@@ -81,6 +83,7 @@ struct DataRecord { ...@@ -81,6 +83,7 @@ struct DataRecord {
word_data_all.push_back(std::move(word_data)); word_data_all.push_back(std::move(word_data));
mention_data_all.push_back(std::move(mention_data)); mention_data_all.push_back(std::move(mention_data));
} }
num_samples = num_lines;
} }
}; };
...@@ -109,7 +112,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -109,7 +112,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26, const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
48, 39, 38, 16, 25}; 48, 39, 38, 16, 25};
void TestChineseNERPrediction() { void TestChineseNERPrediction(bool use_analysis) {
NativeConfig config; NativeConfig config;
config.prog_file = FLAGS_infer_model + "/__model__"; config.prog_file = FLAGS_infer_model + "/__model__";
config.param_file = FLAGS_infer_model + "/param"; config.param_file = FLAGS_infer_model + "/param";
...@@ -117,24 +120,53 @@ void TestChineseNERPrediction() { ...@@ -117,24 +120,53 @@ void TestChineseNERPrediction() {
config.device = 0; config.device = 0;
config.specify_input_name = true; config.specify_input_name = true;
auto predictor = std::vector<PaddleTensor> input_slots, outputs;
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); std::unique_ptr<PaddlePredictor> predictor;
std::vector<PaddleTensor> input_slots; Timer timer;
DataRecord data(FLAGS_infer_data, FLAGS_batch_size); if (use_analysis) {
AnalysisConfig cfg;
cfg.prog_file = FLAGS_infer_model + "/__model__";
cfg.param_file = FLAGS_infer_model + "/param";
cfg.use_gpu = false;
cfg.device = 0;
cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
} else {
predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
}
if (FLAGS_test_all_data) {
LOG(INFO) << "test all data";
double sum = 0;
size_t num_samples;
for (int i = 0; i < FLAGS_repeat; i++) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
num_samples = data.num_samples;
for (size_t bid = 0; bid < num_samples; ++bid) {
PrepareInputs(&input_slots, &data, FLAGS_batch_size);
timer.tic();
predictor->Run(input_slots, &outputs);
sum += timer.toc();
}
}
LOG(INFO) << "total number of samples: " << num_samples;
PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
LOG(INFO) << "average latency of each sample: "
<< sum / FLAGS_repeat / num_samples;
return;
}
// Prepare inputs. // Prepare inputs.
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
PrepareInputs(&input_slots, &data, FLAGS_batch_size); PrepareInputs(&input_slots, &data, FLAGS_batch_size);
std::vector<PaddleTensor> outputs;
Timer timer;
timer.tic(); timer.tic();
for (int i = 0; i < FLAGS_repeat; i++) { for (int i = 0; i < FLAGS_repeat; i++) {
predictor->Run(input_slots, &outputs); predictor->Run(input_slots, &outputs);
} }
LOG(INFO) << "===========profile result==========="; PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, timer.toc() / FLAGS_repeat);
LOG(INFO) << "batch_size: " << FLAGS_batch_size
<< ", repeat: " << FLAGS_repeat
<< ", latency: " << timer.toc() / FLAGS_repeat << "ms";
LOG(INFO) << "=====================================";
PADDLE_ENFORCE(outputs.size(), 1UL); PADDLE_ENFORCE(outputs.size(), 1UL);
auto &out = outputs[0]; auto &out = outputs[0];
...@@ -145,10 +177,51 @@ void TestChineseNERPrediction() { ...@@ -145,10 +177,51 @@ void TestChineseNERPrediction() {
for (size_t i = 0; i < std::min(11UL, size); i++) { for (size_t i = 0; i < std::min(11UL, size); i++) {
PADDLE_ENFORCE(result[i], chinese_ner_result_data[i]); PADDLE_ENFORCE(result[i], chinese_ner_result_data[i]);
} }
if (use_analysis) {
    // Run the native predictor once as the reference for comparison.
auto ref_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run(input_slots, &ref_outputs_slots);
EXPECT_EQ(ref_outputs_slots.size(), outputs.size());
auto &ref_out = ref_outputs_slots[0];
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_EQ(size, ref_size);
int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
for (size_t i = 0; i < size; ++i) {
EXPECT_EQ(pdata_ref[i], result[i]);
}
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
auto &fuse_statis = analysis_predictor->analysis_argument()
.Get<std::unordered_map<std::string, int>>(
framework::ir::kFuseStatisAttr);
for (auto &item : fuse_statis) {
LOG(INFO) << "fused " << item.first << " " << item.second;
}
int num_ops = 0;
for (auto &node :
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
if (node->IsFunction()) {
++num_ops;
}
}
LOG(INFO) << "has num ops: " << num_ops;
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2);
EXPECT_EQ(num_ops, 14);
}
} }
// Directly infer with the original model. TEST(Analyzer_Chinese_ner, native) { TestChineseNERPrediction(false); }
TEST(Analyzer, Chinese_ner) { TestChineseNERPrediction(); }
TEST(Analyzer_Chinese_ner, analysis) { TestChineseNERPrediction(true); }
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
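// The fuse-statistics check shared by the LAC and NER testers, in brief:
// each fuse pass records its replacement count via AddStatis, and the map is
// read back from the analysis argument under framework::ir::kFuseStatisAttr:
auto *analysis_predictor =
    dynamic_cast<AnalysisPredictor *>(predictor.get());
const auto &fuse_statis =
    analysis_predictor->analysis_argument()
        .Get<std::unordered_map<std::string, int>>(
            framework::ir::kFuseStatisAttr);
for (const auto &item : fuse_statis) {
  LOG(INFO) << "fused " << item.first << " x" << item.second;
}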
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <google/protobuf/text_format.h> #include <google/protobuf/text_format.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
...@@ -24,12 +25,12 @@ ...@@ -24,12 +25,12 @@
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN"); DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN"); DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
DEFINE_int32(batch_size, 10, "batch size."); DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times."); DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -220,39 +221,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -220,39 +221,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
} }
std::string DescribeTensor(const PaddleTensor &tensor) {
std::stringstream os;
os << "Tensor [" << tensor.name << "]\n";
os << " - type: ";
switch (tensor.dtype) {
case PaddleDType::FLOAT32:
os << "float32";
break;
case PaddleDType::INT64:
os << "int64";
break;
default:
os << "unset";
}
os << '\n';
os << " - shape: " << to_string(tensor.shape) << '\n';
os << " - lod: ";
for (auto &l : tensor.lod) {
os << to_string(l) << "; ";
}
os << "\n";
os << " - data: ";
int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
[](int a, int b) { return a * b; });
for (int i = 0; i < dim; i++) {
os << static_cast<float *>(tensor.data.data())[i] << " ";
}
os << '\n';
return os.str();
}
} // namespace } // namespace
const float ditu_rnn_target_data[] = { const float ditu_rnn_target_data[] = {
...@@ -266,55 +234,93 @@ const float ditu_rnn_target_data[] = { ...@@ -266,55 +234,93 @@ const float ditu_rnn_target_data[] = {
10.7286, 12.0595, 10.6672, 0, 0, 0, 0, 0, 10.7286, 12.0595, 10.6672, 0, 0, 0, 0, 0,
93.5771, 3.84641, 0, 0, 0, 0, 0, 0, 93.5771, 3.84641, 0, 0, 0, 0, 0, 0,
169.426, 0, 0, 0, 0, 0, 0, 0}; 169.426, 0, 0, 0, 0, 0, 0, 0};
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &base_outputs) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &base_out = base_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
1, [](int a, int b) { return a * b; });
PADDLE_ENFORCE_EQ(size, size1);
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
float *base_data = static_cast<float *>(base_out.data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(data[i], base_data[i], 1e-3);
}
}
}
// Test with a really complicate model. // Test with a really complicate model.
void TestDituRNNPrediction(const std::string &model_path, void TestDituRNNPrediction(bool use_analysis, bool activate_ir,
const std::string &data_path, int batch_size, int num_threads) {
bool use_analysis, bool activate_ir, AnalysisConfig config;
int num_times = 1) {
NativeConfig config;
config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__"; config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__";
config.param_file = FLAGS_infer_ditu_rnn_model + "/param"; config.param_file = FLAGS_infer_ditu_rnn_model + "/param";
config.use_gpu = false; config.use_gpu = false;
config.device = 0; config.device = 0;
config.specify_input_name = true; config.specify_input_name = true;
config.enable_ir_optim = activate_ir;
PADDLE_ENFORCE(config.ir_mode ==
AnalysisConfig::IrPassMode::kExclude); // default
config.ir_passes.clear(); // Do not exclude any pass.
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
auto base_predictor = auto base_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
auto predictor = auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(config); CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots; std::vector<PaddleTensor> input_slots;
DataRecord data(data_path, batch_size); DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
// Prepare inputs. // Prepare inputs.
PrepareInputs(&input_slots, &data, batch_size); PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs, base_outputs; std::vector<PaddleTensor> outputs, base_outputs;
base_predictor->Run(input_slots, &base_outputs); base_predictor->Run(input_slots, &base_outputs);
Timer timer; if (num_threads == 1) {
timer.tic(); // Prepare inputs.
for (int i = 0; i < num_times; i++) { Timer timer;
predictor->Run(input_slots, &outputs); timer.tic();
} for (int i = 0; i < num_times; i++) {
LOG(INFO) << "===========profile result==========="; predictor->Run(input_slots, &outputs);
LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times }
<< ", latency: " << timer.toc() / num_times << "ms"; PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
LOG(INFO) << "====================================="; CompareResult(outputs, base_outputs);
} else {
PADDLE_ENFORCE_GT(outputs.size(), 0); std::vector<std::thread> threads;
PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); std::vector<std::unique_ptr<PaddlePredictor>> predictors;
for (size_t i = 0; i < outputs.size(); i++) { // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
auto &out = outputs[i]; // because AttentionLSTM's hard code nodeid will be damanged.
auto &base_out = base_outputs[i]; for (int tid = 0; tid < num_threads; ++tid) {
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, predictors.emplace_back(
[](int a, int b) { return a * b; }); CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), config));
1, [](int a, int b) { return a * b; }); }
PADDLE_ENFORCE_EQ(size, size1); for (int tid = 0; tid < num_threads; ++tid) {
PADDLE_ENFORCE_GT(size, 0); threads.emplace_back([&, tid]() {
float *data = static_cast<float *>(out.data.data()); // Each thread should have local input_slots and outputs.
float *base_data = static_cast<float *>(base_out.data.data()); std::vector<PaddleTensor> input_slots;
for (size_t j = 0; j < size; j++) { DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
EXPECT_NEAR(data[j], base_data[j], 1e-3); PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs;
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictors[tid]->Run(input_slots, &outputs);
}
PrintTime(batch_size, num_times, num_threads, tid,
timer.toc() / num_times);
CompareResult(outputs, base_outputs);
});
}
for (int i = 0; i < num_threads; ++i) {
threads[i].join();
} }
} }
...@@ -345,25 +351,26 @@ void TestDituRNNPrediction(const std::string &model_path, ...@@ -345,25 +351,26 @@ void TestDituRNNPrediction(const std::string &model_path,
} }
} }
// Directly infer with the original model. // Inference with analysis and IR, easy for profiling independently.
TEST(Analyzer, DituRNN_without_analysis) { TEST(Analyzer, DituRNN) {
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, TestDituRNNPrediction(true, true, FLAGS_num_threads);
FLAGS_batch_size, false, false, FLAGS_repeat);
} }
// Inference with the original model with the analysis turned on, the analysis // Other unit tests of DituRNN, exercising different combinations of
// module will transform the program to a data flow graph. // use_analysis, activate_ir, and thread counts.
TEST(Analyzer, DituRNN_with_analysis) { TEST(Analyzer, DituRNN_tests) {
LOG(INFO) << "ditu rnn with analysis"; int num_threads[2] = {1, 4};
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, for (auto i : num_threads) {
FLAGS_batch_size, true, false, FLAGS_repeat); // Directly infer with the original model.
} TestDituRNNPrediction(false, false, i);
// Inference with the original model with the analysis turned on, the
// Inference with analysis and IR. The IR module will fuse some large kernels. // analysis
TEST(Analyzer, DituRNN_with_analysis_with_IR) { // module will transform the program to a data flow graph.
LOG(INFO) << "ditu rnn with analysis and IR fuse"; TestDituRNNPrediction(true, false, i);
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, // Inference with analysis and IR. The IR module will fuse some large
FLAGS_batch_size, true, true, FLAGS_repeat); // kernels.
TestDituRNNPrediction(true, true, i);
}
} }
} // namespace analysis } // namespace analysis
......
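// The multi-threaded scheme above in brief: predictors are created serially
// up front (per the TODO, the analysis phase itself is not yet thread-safe),
// and each thread owns its own inputs and outputs:
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
for (int tid = 0; tid < num_threads; ++tid) {
  predictors.emplace_back(
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config));  // `config` as built in TestDituRNNPrediction
}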
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <gtest/gtest.h>
#include <fstream>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/timer.h"
DEFINE_string(infer_model, "", "Directory of the inference model.");
DEFINE_string(infer_data, "", "Path of the dataset.");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "How many times to repeat run.");
DEFINE_int32(topn, -1, "Run top n batches of data to save time");
namespace paddle {
namespace inference {
struct DataReader {
explicit DataReader(const std::string &path)
: file(new std::ifstream(path)) {}
bool NextBatch(PaddleTensor *tensor, int batch_size) {
PADDLE_ENFORCE_EQ(batch_size, 1);
std::string line;
tensor->lod.clear();
tensor->lod.emplace_back(std::vector<size_t>({0}));
std::vector<int64_t> data;
for (int i = 0; i < batch_size; i++) {
if (!std::getline(*file, line)) return false;
inference::split_to_int64(line, ' ', &data);
}
tensor->lod.front().push_back(data.size());
tensor->data.Resize(data.size() * sizeof(int64_t));
memcpy(tensor->data.data(), data.data(), data.size() * sizeof(int64_t));
tensor->shape.clear();
tensor->shape.push_back(data.size());
tensor->shape.push_back(1);
return true;
}
std::unique_ptr<std::ifstream> file;
};
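For context, NextBatch consumes one text line per record: space-separated int64 ids that become a tensor of shape {N, 1} with LoD {0, N}. A small stand-alone sketch of that record format (plain C++ in place of the PaddleTensor plumbing; "sample.txt" is a scratch file made up for the example):

#include <cstdint>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
  std::ofstream("sample.txt") << "101 7 42 9\n";  // one 4-token record
  std::ifstream file("sample.txt");
  std::string line;
  std::getline(file, line);
  std::istringstream ss(line);
  std::vector<int64_t> data;
  for (int64_t v; ss >> v;) data.push_back(v);
  // Mirrors DataReader: shape = {N, 1}, lod = {0, N}.
  std::cout << "shape: {" << data.size() << ", 1}, lod: {0, " << data.size()
            << "}\n";
}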
void Main(int batch_size) {
// shape --
// Create Predictor --
AnalysisConfig config;
config.model_dir = FLAGS_infer_model;
config.use_gpu = false;
config.enable_ir_optim = true;
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots(1);
// one batch starts
// data --
auto &input = input_slots[0];
input.dtype = PaddleDType::INT64;
inference::Timer timer;
double sum = 0;
std::vector<PaddleTensor> output_slots;
int num_batches = 0;
for (int t = 0; t < FLAGS_repeat; t++) {
DataReader reader(FLAGS_infer_data);
while (reader.NextBatch(&input, FLAGS_batch_size)) {
      if (FLAGS_topn > 0 && num_batches >= FLAGS_topn) break;  // stop after top-n batches
timer.tic();
CHECK(predictor->Run(input_slots, &output_slots));
sum += timer.toc();
++num_batches;
}
}
PrintTime(batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
// Get output
LOG(INFO) << "get outputs " << output_slots.size();
for (auto &output : output_slots) {
LOG(INFO) << "output.shape: " << to_string(output.shape);
    // The output is expected to carry no LoD.
CHECK_EQ(output.lod.size(), 0UL);
LOG(INFO) << "output.dtype: " << output.dtype;
std::stringstream ss;
for (int i = 0; i < 5; i++) {
ss << static_cast<float *>(output.data.data())[i] << " ";
}
LOG(INFO) << "output.data summary: " << ss.str();
// one batch ends
}
}
TEST(text_classification, basic) { Main(FLAGS_batch_size); }
} // namespace inference
} // namespace paddle
...@@ -263,7 +263,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass { ...@@ -263,7 +263,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
}; };
} // namespace } // namespace
Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { AnalysisPass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
FLAGS_IA_graphviz_log_root, FLAGS_IA_graphviz_log_root,
"data_flow_graph_to_fluid_graphviz_debugger")); "data_flow_graph_to_fluid_graphviz_debugger"));
......
...@@ -21,8 +21,8 @@ ...@@ -21,8 +21,8 @@
#include <string> #include <string>
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/pass.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -42,7 +42,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass { ...@@ -42,7 +42,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
return "Transform a DFG to a Fluid ProgramDesc"; return "Transform a DFG to a Fluid ProgramDesc";
} }
Pass *CreateGraphvizDebugerPass() const override; AnalysisPass *CreateGraphvizDebugerPass() const override;
protected: protected:
// Add a Fluid Op into the ProgramDesc. // Add a Fluid Op into the ProgramDesc.
......
...@@ -21,8 +21,8 @@ limitations under the License. */ ...@@ -21,8 +21,8 @@ limitations under the License. */
#include <fstream> #include <fstream>
#include <string> #include <string>
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/inference/analysis/dot.h"
#include "paddle/fluid/inference/analysis/pass.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
// flag if not available.
DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
DECLARE_string(IA_graphviz_log_root);
DECLARE_string(IA_output_storage_path);
DECLARE_bool(IA_enable_ir);
...@@ -66,7 +66,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass { ...@@ -66,7 +66,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
}; };
} }
Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const { AnalysisPass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger")); FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger"));
} }
......
...@@ -22,8 +22,8 @@ ...@@ -22,8 +22,8 @@
#include <string> #include <string>
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/pass.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -46,7 +46,7 @@ class FluidToDataFlowGraphPass final : public DataFlowGraphPass { ...@@ -46,7 +46,7 @@ class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
return "transform a fluid ProgramDesc to a data flow graph."; return "transform a fluid ProgramDesc to a data flow graph.";
} }
Pass *CreateGraphvizDebugerPass() const override; AnalysisPass *CreateGraphvizDebugerPass() const override;
private: private:
framework::proto::ProgramDesc const *desc_; framework::proto::ProgramDesc const *desc_;
......
...@@ -14,14 +14,17 @@ ...@@ -14,14 +14,17 @@
#pragma once #pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/flags.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include "paddle/fluid/inference/analysis/pass.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
using namespace framework;
static const char kFluidToIrPassesAttr[] = "__fluid_to_ir_passes__"; static const char kFluidToIrPassesAttr[] = "__fluid_to_ir_passes__";
...@@ -47,7 +50,8 @@ class FluidToIrPass final : public DataFlowGraphPass { ...@@ -47,7 +50,8 @@ class FluidToIrPass final : public DataFlowGraphPass {
ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path); ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
// Load program. // Load program.
auto program = LoadProgramDesc(*argument->fluid_model_program_path); auto program = LoadProgramDesc(*argument->fluid_model_program_path);
argument->origin_program_desc.reset(new proto::ProgramDesc(program)); argument->origin_program_desc.reset(
new framework::proto::ProgramDesc(program));
// Create main data flow graph. // Create main data flow graph.
if (!argument->main_dfg) { if (!argument->main_dfg) {
argument->main_dfg.reset(new DataFlowGraph); argument->main_dfg.reset(new DataFlowGraph);
...@@ -77,27 +81,30 @@ class FluidToIrPass final : public DataFlowGraphPass { ...@@ -77,27 +81,30 @@ class FluidToIrPass final : public DataFlowGraphPass {
IRPassManager ir_passes(argument_->Get<ProgramDesc>("ir_program_desc"), IRPassManager ir_passes(argument_->Get<ProgramDesc>("ir_program_desc"),
nullptr); nullptr);
// Pass the scope from analysis to IR if needed. // Pass the scope from analysis to IR if needed.
if (argument_->Has(ir::kParamScopeAttr)) { if (argument_->Has(framework::ir::kParamScopeAttr)) {
      // Here the address is passed; note that IR doesn't own the scope, so // Here the address is passed; note that IR doesn't own the scope, so
// the real scope in analysis should live during the IR phase. // the real scope in analysis should live during the IR phase.
ir_passes.graph().Set( ir_passes.graph().Set(
ir::kParamScopeAttr, framework::ir::kParamScopeAttr,
new Scope *(&argument_->Get<Scope>(ir::kParamScopeAttr))); new framework::Scope *(&argument_->Get<framework::Scope>(
framework::ir::kParamScopeAttr)));
} }
    if (FLAGS_IA_enable_ir) {
      const auto &ir_passes_to_apply =
          argument_->Get<std::vector<std::string>>(kFluidToIrPassesAttr);
      ir_passes.Apply(ir_passes_to_apply);
    }
PADDLE_ENFORCE(argument_->main_dfg.get()); PADDLE_ENFORCE(argument_->main_dfg.get());
argument_->main_dfg->Build(ir_passes.graph()); argument_->main_dfg->Build(ir_passes.graph());
// inherit the arguments from ir. // inherit the arguments from ir.
if (ir_passes.graph().Has(ir::kFuseStatisAttr)) { if (ir_passes.graph().Has(framework::ir::kFuseStatisAttr)) {
argument_->Set( argument_->Set(
ir::kFuseStatisAttr, framework::ir::kFuseStatisAttr,
new std::unordered_map<std::string, int>( new std::unordered_map<std::string, int>(
ir_passes.graph().Get<std::unordered_map<std::string, int>>( ir_passes.graph().Get<std::unordered_map<std::string, int>>(
ir::kFuseStatisAttr))); framework::ir::kFuseStatisAttr)));
} }
} }
...@@ -109,7 +116,7 @@ class FluidToIrPass final : public DataFlowGraphPass { ...@@ -109,7 +116,7 @@ class FluidToIrPass final : public DataFlowGraphPass {
private: private:
// Load parameters from a single file or from a directory. // Load parameters from a single file or from a directory.
bool LoadParams(Scope *scope, const std::string &dir, bool LoadParams(framework::Scope *scope, const std::string &dir,
const std::string &prog_file, const std::string &param_file); const std::string &prog_file, const std::string &param_file);
private: private:
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
#pragma once #pragma once
#include <string> #include <string>
#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/analysis_pass.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
......
...@@ -40,17 +40,6 @@ void DfgPassManager::RunAll() { ...@@ -40,17 +40,6 @@ void DfgPassManager::RunAll() {
} }
} }
void NodePassManager::RunAll() {
PADDLE_ENFORCE(argument_);
PADDLE_ENFORCE(argument_->main_dfg.get());
auto trait = GraphTraits<DataFlowGraph>(*argument_->main_dfg).nodes_in_DFS();
for (auto& node : trait) {
for (auto& pass : data_) {
pass->Run(&node);
}
}
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -33,7 +33,7 @@ limitations under the License. */ ...@@ -33,7 +33,7 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/analysis_pass.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -43,7 +43,7 @@ namespace analysis { ...@@ -43,7 +43,7 @@ namespace analysis {
* PassManager is the base class for all pass managers, a pass manager has * PassManager is the base class for all pass managers, a pass manager has
* several Pass-es registered, and execute them in the linear order. * several Pass-es registered, and execute them in the linear order.
*/ */
class PassManager : public OrderedRegistry<Pass> { class PassManager : public OrderedRegistry<AnalysisPass> {
public: public:
PassManager() = default; PassManager() = default;
// Call all the passes' Initialize methods. The desc and data_flow_graph are // Call all the passes' Initialize methods. The desc and data_flow_graph are
...@@ -89,18 +89,6 @@ class DfgPassManager : public PassManager { ...@@ -89,18 +89,6 @@ class DfgPassManager : public PassManager {
virtual ~DfgPassManager() = default; virtual ~DfgPassManager() = default;
}; };
/*
* A pass manager that process a Node each time.
*/
class NodePassManager : public PassManager {
public:
NodePassManager() = default;
void RunAll() override;
virtual ~NodePassManager() = default;
};
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -34,28 +34,6 @@ class TestDfgPassManager final : public DfgPassManager { ...@@ -34,28 +34,6 @@ class TestDfgPassManager final : public DfgPassManager {
std::string description() const override { return "test doc"; } std::string description() const override { return "test doc"; }
}; };
class TestNodePassManager final : public NodePassManager {
public:
virtual ~TestNodePassManager() = default;
std::string repr() const override { return "test-node-pass-manager"; }
std::string description() const override { return "test doc"; }
};
class TestNodePass final : public NodePass {
public:
virtual ~TestNodePass() = default;
bool Initialize(Argument* argument) override { return true; }
void Run(Node* node) override {
LOG(INFO) << "- Processing node " << node->repr();
}
std::string repr() const override { return "test-node"; }
std::string description() const override { return "some doc"; }
};
TEST(PassManager, DFG_pass_manager) { TEST(PassManager, DFG_pass_manager) {
TestDfgPassManager manager; TestDfgPassManager manager;
DFG_GraphvizDrawPass::Config config("./", "dfg.dot"); DFG_GraphvizDrawPass::Config config("./", "dfg.dot");
...@@ -71,19 +49,6 @@ TEST(PassManager, DFG_pass_manager) { ...@@ -71,19 +49,6 @@ TEST(PassManager, DFG_pass_manager) {
manager.RunAll(); manager.RunAll();
} }
TEST(PassManager, Node_pass_manager) {
Argument argument(FLAGS_inference_model_dir);
// Pre-process: initialize the DFG with the ProgramDesc first.
FluidToDataFlowGraphPass pass0;
pass0.Initialize(&argument);
pass0.Run(argument.main_dfg.get());
TestNodePassManager manager;
manager.Register("test-node-pass", new TestNodePass);
ASSERT_TRUE(manager.Initialize(&argument));
manager.RunAll();
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -68,7 +68,7 @@ class DfgDebuggerPass : public DFG_GraphvizDrawPass { ...@@ -68,7 +68,7 @@ class DfgDebuggerPass : public DFG_GraphvizDrawPass {
} }
}; };
Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const { AnalysisPass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root, DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root,
"tensorrt_marked_node"); "tensorrt_marked_node");
return new DfgDebuggerPass(config); return new DfgDebuggerPass(config);
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
#pragma once #pragma once
#include <string> #include <string>
#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/subgraph_splitter.h" #include "paddle/fluid/inference/analysis/subgraph_splitter.h"
namespace paddle { namespace paddle {
...@@ -48,7 +48,7 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass { ...@@ -48,7 +48,7 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass {
return "tensorrt sub-graph mark pass"; return "tensorrt sub-graph mark pass";
} }
Pass* CreateGraphvizDebugerPass() const override; AnalysisPass* CreateGraphvizDebugerPass() const override;
bool Finalize() override; bool Finalize() override;
private: private:
......
...@@ -15,8 +15,8 @@ limitations under the License. */ ...@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/node.h" #include "paddle/fluid/inference/analysis/node.h"
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/subgraph_splitter.h" #include "paddle/fluid/inference/analysis/subgraph_splitter.h"
namespace paddle { namespace paddle {
......
...@@ -44,8 +44,7 @@ function(inference_api_test TARGET_NAME) ...@@ -44,8 +44,7 @@ function(inference_api_test TARGET_NAME)
endfunction(inference_api_test) endfunction(inference_api_test)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api) cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis)
cc_test(test_paddle_inference_api cc_test(test_paddle_inference_api
SRCS api_tester.cc SRCS api_tester.cc
DEPS paddle_inference_api) DEPS paddle_inference_api)
...@@ -61,7 +60,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine ...@@ -61,7 +60,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec) inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
endif() endif()
if (WITH_ANAKIN AND WITH_GPU) # only needed in CI if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
# compile the libinference_anakin_api.a and anakin.so. # compile the libinference_anakin_api.a and anakin.so.
cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml) cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml)
cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
...@@ -71,12 +70,24 @@ if (WITH_ANAKIN AND WITH_GPU) # only needed in CI ...@@ -71,12 +70,24 @@ if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
anakin_target(inference_anakin_api) anakin_target(inference_anakin_api)
anakin_target(inference_anakin_api_shared) anakin_target(inference_anakin_api_shared)
if (WITH_TESTING) if (WITH_TESTING)
  # TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
  set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
  set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
  set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
  execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
  execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
  execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
  if(WITH_GPU)
    set(anakin_test_extra_deps dynload_cuda)
    set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
    execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
    cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc
            ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
            DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
  endif()
  cc_test(api_anakin_engine_rnn_tester SRCS api_anakin_engine_rnn_tester.cc
          ARGS --model=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
               --datapath=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn_data.txt
          DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
endif(WITH_TESTING) endif(WITH_TESTING)
endif() endif()
...@@ -14,24 +14,40 @@ ...@@ -14,24 +14,40 @@
#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/analysis_predictor.h"
#include <memory> #include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(profile);
namespace paddle { namespace paddle {
bool AnalysisPredictor::Init( bool AnalysisPredictor::Init(
const std::shared_ptr<framework::Scope>& parent_scope) { const std::shared_ptr<framework::Scope>& parent_scope) {
VLOG(3) << "Predictor::init()"; VLOG(3) << "Predictor::init()";
#if !defined(_WIN32)
if (FLAGS_profile) {
LOG(WARNING) << "Profiler is actived, might affect the performance";
LOG(INFO) << "You can turn off by set gflags '-profile false'";
auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
: platform::ProfilerState::kCPU;
platform::EnableProfiler(tracking_device);
}
#endif
if (config_.use_gpu) { if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device); place_ = paddle::platform::CUDAPlace(config_.device);
LOG(WARNING) << "ir optimize only supports CPU currently";
config_.enable_ir_optim = false;
} else { } else {
place_ = paddle::platform::CPUPlace(); place_ = paddle::platform::CPUPlace();
} }
PADDLE_ENFORCE(!parent_scope);
if (parent_scope) { if (parent_scope) {
scope_ = parent_scope; scope_ = parent_scope;
sub_scope_ = &(parent_scope->NewScope()); sub_scope_ = &(parent_scope->NewScope());
...@@ -73,7 +89,7 @@ bool AnalysisPredictor::Init( ...@@ -73,7 +89,7 @@ bool AnalysisPredictor::Init(
void AnalysisPredictor::OptimizeInferenceProgram() { void AnalysisPredictor::OptimizeInferenceProgram() {
LOG(INFO) << "optimize begin"; LOG(INFO) << "optimize begin";
FLAGS_IA_enable_ir = true; FLAGS_IA_enable_ir = config_.enable_ir_optim;
FLAGS_IA_enable_tensorrt_subgraph_engine = false; FLAGS_IA_enable_tensorrt_subgraph_engine = false;
FLAGS_IA_output_storage_path = ""; // Don't output the model. FLAGS_IA_output_storage_path = ""; // Don't output the model.
// Analyze inference_program // Analyze inference_program
...@@ -90,24 +106,26 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -90,24 +106,26 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
} }
  argument_.origin_program_desc.reset(
      new ProgramDesc(*inference_program_->Proto()));
  PADDLE_ENFORCE(config_.ir_mode == AnalysisConfig::IrPassMode::kExclude,
                 "Only kExclude is supported yet.");
  Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_);

  CHECK(argument_.transformed_program_desc);
  VLOG(5) << "to prepare executor";
  inference_program_.reset(
      new framework::ProgramDesc(*argument_.transformed_program_desc));
  if (argument_.Has(framework::ir::kParamScopeAttr)) {
    // Update scope.
    scope_.reset(
        argument_.Release<framework::Scope>(framework::ir::kParamScopeAttr));
  }
  LOG(INFO) << "== optimize end ==";
}
template <> template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
NativeConfig, PaddleEngineKind::kAnalysis>(const NativeConfig& config) { AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config) {
VLOG(3) << "create NativePredictor"; VLOG(3) << "create AnalysisConfig";
if (config.use_gpu) { if (config.use_gpu) {
// 1. GPU memory // 1. GPU memory
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
...@@ -28,7 +30,7 @@ using framework::proto::ProgramDesc; ...@@ -28,7 +30,7 @@ using framework::proto::ProgramDesc;
*/ */
class AnalysisPredictor : public NativePaddlePredictor { class AnalysisPredictor : public NativePaddlePredictor {
public: public:
explicit AnalysisPredictor(const NativeConfig& config) explicit AnalysisPredictor(const AnalysisConfig& config)
: NativePaddlePredictor(config), config_(config) {} : NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope); bool Init(const std::shared_ptr<framework::Scope>& parent_scope);
...@@ -44,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor { ...@@ -44,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor {
Argument& analysis_argument() { return argument_; } Argument& analysis_argument() { return argument_; }
private: private:
NativeConfig config_; AnalysisConfig config_;
Argument argument_; Argument argument_;
}; };
......
...@@ -193,7 +193,9 @@ PaddleInferenceAnakinPredictor<Target>::Clone() { ...@@ -193,7 +193,9 @@ PaddleInferenceAnakinPredictor<Target>::Clone() {
return std::move(cls); return std::move(cls);
} }
#ifdef PADDLE_WITH_CUDA
template class PaddleInferenceAnakinPredictor<anakin::NV>; template class PaddleInferenceAnakinPredictor<anakin::NV>;
#endif
template class PaddleInferenceAnakinPredictor<anakin::X86>; template class PaddleInferenceAnakinPredictor<anakin::X86>;
// A factory to help create different predictors. // A factory to help create different predictors.
...@@ -202,10 +204,15 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -202,10 +204,15 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) { AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) {
VLOG(3) << "Anakin Predictor create."; VLOG(3) << "Anakin Predictor create.";
if (config.target_type == AnakinConfig::NVGPU) { if (config.target_type == AnakinConfig::NVGPU) {
#ifdef PADDLE_WITH_CUDA
VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ]."; VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ].";
std::unique_ptr<PaddlePredictor> x( std::unique_ptr<PaddlePredictor> x(
new PaddleInferenceAnakinPredictor<anakin::NV>(config)); new PaddleInferenceAnakinPredictor<anakin::NV>(config));
return x; return x;
#else
LOG(ERROR) << "AnakinConfig::NVGPU could not used in ONLY-CPU environment";
return nullptr;
#endif
} else if (config.target_type == AnakinConfig::X86) { } else if (config.target_type == AnakinConfig::X86) {
VLOG(3) << "Anakin Predictor create on [ Intel X86 ]."; VLOG(3) << "Anakin Predictor create on [ Intel X86 ].";
std::unique_ptr<PaddlePredictor> x( std::unique_ptr<PaddlePredictor> x(
......
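Given the guard above, a CPU-only build can still obtain an Anakin predictor by requesting the X86 target. A hedged sketch follows; only target_type and the factory specialization are taken from this patch, everything else is assumed:

#include <memory>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

std::unique_ptr<paddle::PaddlePredictor> MakeCpuAnakinPredictor() {
  paddle::AnakinConfig config;
  // NVGPU would return nullptr unless the build has PADDLE_WITH_CUDA.
  config.target_type = paddle::AnakinConfig::X86;
  return paddle::CreatePaddlePredictor<paddle::AnakinConfig,
                                       paddle::PaddleEngineKind::kAnakin>(
      config);
}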
...@@ -20,71 +20,16 @@ limitations under the License. */ ...@@ -20,71 +20,16 @@ limitations under the License. */
#include <iostream> #include <iostream>
#include <thread> // NOLINT #include <thread> // NOLINT
#include <vector> #include <vector>
#include "framework/core/net/net.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/timer.h"
#include "utils/logger/logger.h"
DEFINE_string(model, "", "Directory of the inference model."); DEFINE_string(model, "", "Directory of the inference model.");
DEFINE_string(datapath, "", "Path of the dataset."); DEFINE_string(datapath, "", "Path of the dataset.");
DEFINE_int32(batch_size, 1, "batch size."); DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times."); DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
// Timer for timer
class Timer {
public:
double start;
double startu;
void tic() {
struct timeval tp;
gettimeofday(&tp, NULL);
start = tp.tv_sec;
startu = tp.tv_usec;
}
double toc() {
struct timeval tp;
gettimeofday(&tp, NULL);
double used_time_ms =
(tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
return used_time_ms;
}
};
std::vector<std::string> string_split(std::string in_str,
std::string delimiter) {
std::vector<std::string> seq;
int found = in_str.find(delimiter);
int pre_found = -1;
while (found != std::string::npos) {
if (pre_found == -1) {
seq.push_back(in_str.substr(0, found));
} else {
seq.push_back(in_str.substr(pre_found + delimiter.length(),
found - delimiter.length() - pre_found));
}
pre_found = found;
found = in_str.find(delimiter, pre_found + delimiter.length());
}
seq.push_back(
in_str.substr(pre_found + 1, in_str.length() - (pre_found + 1)));
return seq;
}
std::vector<std::string> string_split(
std::string in_str, std::vector<std::string>& delimiter) { // NOLINT
std::vector<std::string> in;
std::vector<std::string> out;
out.push_back(in_str);
for (auto del : delimiter) {
in = out;
out.clear();
for (auto s : in) {
auto out_s = string_split(s, del);
for (auto o : out_s) {
out.push_back(o);
}
}
}
return out;
}
class Data { class Data {
public: public:
Data(std::string file_name, int batch_size) Data(std::string file_name, int batch_size)
...@@ -120,36 +65,24 @@ void Data::get_batch_data( ...@@ -120,36 +65,24 @@ void Data::get_batch_data(
week_fea.clear(); week_fea.clear();
time_fea.clear(); time_fea.clear();
    while (_file.getline(buf, 10000)) {
      std::vector<std::string> data_vec;
      paddle::inference::split(buf, ':', &data_vec);

      std::vector<std::string> seq;
      paddle::inference::split(data_vec[0], '|', &seq);

      for (auto link : seq) {
        std::vector<float> vec;
        paddle::inference::split_to_float(link, ',', &vec);
        fea.push_back(vec);
      }

      std::vector<float> vec_w;
      paddle::inference::split_to_float(data_vec[2], ',', &vec_w);
      week_fea.push_back(vec_w);

      std::vector<float> vec_t;
      paddle::inference::split_to_float(data_vec[1], ',', &vec_t);
      time_fea.push_back(vec_t);

      cum += seq.size();
...@@ -275,14 +208,13 @@ void single_test() { ...@@ -275,14 +208,13 @@ void single_test() {
inputs.push_back(tensor_2); inputs.push_back(tensor_2);
inputs.push_back(tensor_0); inputs.push_back(tensor_0);
  paddle::inference::Timer timer;
  timer.tic();

  for (int i = 0; i < FLAGS_repeat; i++) predictor->Run(inputs, &outputs);

  paddle::inference::PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0,
                               timer.toc() / FLAGS_repeat);
  LOG(INFO) << "sequence_length = " << seq_offset[seq_offset.size() - 1];
float* data_o = static_cast<float*>(outputs[0].data.data()); float* data_o = static_cast<float*>(outputs[0].data.data());
VLOG(3) << "outputs[0].data.length() = " << outputs[0].data.length(); VLOG(3) << "outputs[0].data.length() = " << outputs[0].data.length();
......
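The rewritten get_batch_data above leans on the shared helpers instead of the removed local string_split. A sketch of those helpers on a made-up record, assuming helper.h exposes split/split_to_float with the signatures used above:

#include <iostream>
#include <string>
#include <vector>

#include "paddle/fluid/inference/api/helper.h"

int main() {
  // Assumed layout from get_batch_data: "links:time:week", links are
  // '|'-separated, every field is ','-separated floats.
  std::string record = "0.1,0.2|0.3,0.4:1,0:0,1";
  std::vector<std::string> fields;
  paddle::inference::split(record, ':', &fields);
  std::vector<std::string> links;
  paddle::inference::split(fields[0], '|', &links);
  std::vector<float> first_link;
  paddle::inference::split_to_float(links[0], ',', &first_link);
  std::cout << links.size() << " links, first value " << first_link[0] << "\n";
}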
...@@ -176,7 +176,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -176,7 +176,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
framework::Scope *scope) { framework::Scope *scope) {
VLOG(3) << "Predictor::set_feed"; VLOG(3) << "Predictor::set_feed";
if (inputs.size() != feeds_.size()) { if (inputs.size() != feeds_.size()) {
LOG(ERROR) << "wrong feed input size."; LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
<< inputs.size();
return false; return false;
} }
for (size_t i = 0; i < inputs.size(); ++i) { for (size_t i = 0; i < inputs.size(); ++i) {
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <glog/logging.h>
#include <sys/time.h> #include <sys/time.h>
#include <algorithm> #include <algorithm>
#include <numeric> #include <numeric>
...@@ -88,5 +89,45 @@ static void TensorAssignData(PaddleTensor *tensor, ...@@ -88,5 +89,45 @@ static void TensorAssignData(PaddleTensor *tensor,
} }
} }
std::string DescribeTensor(const PaddleTensor &tensor) {
std::stringstream os;
os << "Tensor [" << tensor.name << "]\n";
os << " - type: ";
switch (tensor.dtype) {
case PaddleDType::FLOAT32:
os << "float32";
break;
case PaddleDType::INT64:
os << "int64";
break;
default:
os << "unset";
}
os << '\n';
os << " - shape: " << to_string(tensor.shape) << '\n';
os << " - lod: ";
for (auto &l : tensor.lod) {
os << to_string(l) << "; ";
}
os << "\n";
os << " - data: ";
int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
[](int a, int b) { return a * b; });
for (int i = 0; i < dim; i++) {
os << static_cast<float *>(tensor.data.data())[i] << " ";
}
os << '\n';
return os.str();
}
void PrintTime(int batch_size, int repeat, int num_threads, int tid,
double latency) {
LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
<< ", threads: " << num_threads << ", thread id: " << tid
<< ", latency: " << latency << "ms ======";
}
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
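The two helpers added above give the benchmarks one consistent reporting path. A minimal usage sketch; the surrounding inference loop is assumed, not part of the patch:

#include "paddle/fluid/inference/api/helper.h"

void ReportLatency() {
  paddle::inference::Timer timer;
  timer.tic();
  // ... run the 10 repeated inference calls here ...
  paddle::inference::PrintTime(/*batch_size=*/1, /*repeat=*/10,
                               /*num_threads=*/1, /*tid=*/0,
                               timer.toc() / 10);
}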
...@@ -150,6 +150,21 @@ struct TensorRTConfig : public NativeConfig { ...@@ -150,6 +150,21 @@ struct TensorRTConfig : public NativeConfig {
int workspace_size{1 << 30}; int workspace_size{1 << 30};
}; };
// NOTE WIP, not stable yet.
struct AnalysisConfig : public NativeConfig {
//
enum class IrPassMode {
kSystem, // Use system default passes, not customize.
kInclude, // Specify the passes in `ir_passes`.
kExclude // Specify the disabled passes in `ir_passes`.
};
bool enable_ir_optim = true;
IrPassMode ir_mode{IrPassMode::kExclude};
  // The attention-LSTM fuse pass works only on some specific models, so it is
  // disabled by default.
std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
};
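Since AnalysisConfig inherits from NativeConfig, a caller fills the usual model fields and then opts into IR optimization. A sketch grounded in the fields shown in this patch; the model path is a placeholder:

#include <memory>
#include <string>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

std::unique_ptr<paddle::PaddlePredictor> MakeAnalysisPredictor(
    const std::string &model_dir) {
  paddle::AnalysisConfig config;
  config.model_dir = model_dir;   // inherited from NativeConfig
  config.use_gpu = false;         // IR optimization is CPU-only for now
  config.enable_ir_optim = true;
  // kExclude: run every pass except those listed in ir_passes.
  config.ir_mode = paddle::AnalysisConfig::IrPassMode::kExclude;
  return paddle::CreatePaddlePredictor<paddle::AnalysisConfig,
                                       paddle::PaddleEngineKind::kAnalysis>(
      config);
}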
// A factory to help create different predictors. // A factory to help create different predictors.
// //
// FOR EXTENSION DEVELOPER: // FOR EXTENSION DEVELOPER:
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/auc_op.h" #include "paddle/fluid/operators/auc_op.h"
#include <string>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -36,15 +35,12 @@ class AucOp : public framework::OperatorWithKernel { ...@@ -36,15 +35,12 @@ class AucOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(predict_height, label_height, PADDLE_ENFORCE_EQ(predict_height, label_height,
"Out and Label should have same height."); "Out and Label should have same height.");
int num_thres = ctx->Attrs().Get<int>("num_thresholds"); int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
ctx->SetOutputDim("AUC", {1}); ctx->SetOutputDim("AUC", {1});
ctx->SetOutputDim("TPOut", {num_thres}); ctx->SetOutputDim("BatchAUC", {1});
ctx->SetOutputDim("TNOut", {num_thres}); ctx->SetOutputDim("StatPosOut", {num_pred_buckets});
ctx->SetOutputDim("FPOut", {num_thres}); ctx->SetOutputDim("StatNegOut", {num_pred_buckets});
ctx->SetOutputDim("FNOut", {num_thres});
ctx->ShareLoD("Predict", /*->*/ "AUC");
} }
protected: protected:
...@@ -66,25 +62,24 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -66,25 +62,24 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Label", AddInput("Label",
"A 2D int tensor indicating the label of the training data. " "A 2D int tensor indicating the label of the training data. "
"shape: [batch_size, 1]"); "shape: [batch_size, 1]");
AddInput("TP", "True-Positive value.");
AddInput("FP", "False-Positive value.");
AddInput("TN", "True-Negative value.");
AddInput("FN", "False-Negative value.");
// TODO(typhoonzero): support weight input // TODO(typhoonzero): support weight input
AddInput("StatPos", "Statistic value when label = 1");
AddInput("StatNeg", "Statistic value when label = 0");
AddOutput("AUC", AddOutput("AUC",
"A scalar representing the " "A scalar representing the "
"current area-under-the-curve."); "current area-under-the-curve.");
AddOutput("TPOut", "True-Positive value."); AddOutput("BatchAUC", "The AUC for current batch");
AddOutput("FPOut", "False-Positive value."); AddOutput("StatPosOut", "Statistic value when label = 1");
AddOutput("TNOut", "True-Negative value."); AddOutput("StatNegOut", "Statistic value when label = 0");
AddOutput("FNOut", "False-Negative value.");
AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.") AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
.SetDefault("ROC"); .SetDefault("ROC");
AddAttr<int>("num_thresholds", AddAttr<int>("num_thresholds",
"The number of thresholds to use when discretizing the" "The number of thresholds to use when discretizing the"
" roc curve.") " roc curve.")
.SetDefault(200); .SetDefault((2 << 12) - 1);
AddComment(R"DOC( AddComment(R"DOC(
Area Under The Curve (AUC) Operator. Area Under The Curve (AUC) Operator.
......
...@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
...@@ -23,106 +23,85 @@ namespace operators { ...@@ -23,106 +23,85 @@ namespace operators {
using Tensor = framework::Tensor;

template <typename DeviceContext, typename T>
class AucKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    auto *predict = ctx.Input<Tensor>("Predict");
    auto *label = ctx.Input<Tensor>("Label");

    std::string curve = ctx.Attr<std::string>("curve");
    int num_thresholds = ctx.Attr<int>("num_thresholds");
    int num_pred_buckets = num_thresholds + 1;

    // Only use output var for now, make sure it's persistable and
    // not cleaned up for each batch.
    auto *auc = ctx.Output<Tensor>("AUC");
    auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
    auto *stat_neg = ctx.Output<Tensor>("StatNegOut");

    auto *stat_pos_data = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
    auto *stat_neg_data = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
    calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds,
            auc);

    auto *batch_auc = ctx.Output<Tensor>("BatchAUC");
    std::vector<int64_t> stat_pos_batch(num_pred_buckets, 0);
    std::vector<int64_t> stat_neg_batch(num_pred_buckets, 0);
    calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(),
            num_thresholds, batch_auc);
  }

 private:
  inline static double trapezoidArea(double X1, double X2, double Y1,
                                     double Y2) {
    return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
  }

  inline static void calcAuc(const framework::ExecutionContext &ctx,
                             const framework::Tensor *label,
                             const framework::Tensor *predict,
                             int64_t *stat_pos, int64_t *stat_neg,
                             int num_thresholds,
                             framework::Tensor *auc_tensor) {
    size_t batch_size = predict->dims()[0];
    size_t inference_width = predict->dims()[1];
    const T *inference_data = predict->data<T>();
    const auto *label_data = label->data<int64_t>();

    auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());

    for (size_t i = 0; i < batch_size; i++) {
      uint32_t binIdx = static_cast<uint32_t>(
          inference_data[i * inference_width + 1] * num_thresholds);
      if (label_data[i]) {
        stat_pos[binIdx] += 1;
      } else {
        stat_neg[binIdx] += 1;
      }
    }

    *auc = 0.0f;

    double totPos = 0.0;
    double totNeg = 0.0;
    double totPosPrev = 0.0;
    double totNegPrev = 0.0;

    int idx = num_thresholds;

    while (idx >= 0) {
      totPosPrev = totPos;
      totNegPrev = totNeg;
      totPos += stat_pos[idx];
      totNeg += stat_neg[idx];
      *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);

      --idx;
    }

    if (totPos > 0.0 && totNeg > 0.0) {
      *auc = *auc / totPos / totNeg;
    }
  }
};
......
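The new kernel replaces per-threshold TP/FP/TN/FN counting with a histogram: each prediction lands in one of num_thresholds + 1 buckets, and the ROC area is accumulated with trapezoids while sweeping the buckets from the highest score down. A standalone sketch of that accumulation with toy counts (not data from the patch):

#include <cstdint>
#include <iostream>
#include <vector>

double BucketAuc(const std::vector<int64_t> &stat_pos,
                 const std::vector<int64_t> &stat_neg) {
  double auc = 0.0, tot_pos = 0.0, tot_neg = 0.0;
  for (int idx = static_cast<int>(stat_pos.size()) - 1; idx >= 0; --idx) {
    double pos_prev = tot_pos, neg_prev = tot_neg;
    tot_pos += stat_pos[idx];
    tot_neg += stat_neg[idx];
    // Same as trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev) above.
    auc += (tot_neg - neg_prev) * (tot_pos + pos_prev) / 2.0;
  }
  return (tot_pos > 0.0 && tot_neg > 0.0) ? auc / tot_pos / tot_neg : 0.0;
}

int main() {
  // 4 buckets; positives lean toward the high-score buckets.
  std::cout << BucketAuc({0, 1, 1, 2}, {2, 1, 1, 0}) << "\n";  // prints 0.875
}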
...@@ -119,7 +119,8 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> { ...@@ -119,7 +119,8 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& last_scale, const framework::Tensor& last_scale,
const framework::Tensor& iter, const int window_size, const framework::Tensor& iter, const int window_size,
framework::Tensor* scales_arr, framework::Tensor* out_scale) { framework::Tensor* scales_arr, framework::Tensor* out_scale) {
auto& gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace()); const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
T* scale_arr = scales_arr->mutable_data<T>(gpu_place); T* scale_arr = scales_arr->mutable_data<T>(gpu_place);
T* out_scale_data = out_scale->mutable_data<T>(gpu_place); T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
......
...@@ -157,6 +157,116 @@ class FlattenGradOp : public framework::OperatorBase { ...@@ -157,6 +157,116 @@ class FlattenGradOp : public framework::OperatorBase {
} }
}; };
// FIXME(zcd): flatten2 adds an intermediate output(XShape) based on flatten,
// the XShape is used to carry the shape and lod of X which will be used in
// flatten_grad, in this way, the framework can reuse the memory of X
// immediately the flatten2_op is finished.
// Considering compatibility issues, we could not fix flatten2_op
class Flatten2OpInferShape : public FlattenOpInferShape {
public:
void operator()(framework::InferShapeContext *ctx) const override {
FlattenOpInferShape::operator()(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output (XShape) of Flatten op should not be null.");
const auto &in_dims = ctx->GetInputDim("X");
std::vector<int64_t> xshape_dims(in_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < in_dims.size(); ++i) {
xshape_dims[i + 1] = in_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", "XShape");
}
};
class Flatten2Op : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto &axis = Attr<int>("axis");
auto in_dims =
scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
const auto &out_dims = FlattenOpInferShape::GetOutputShape(axis, in_dims);
framework::AttributeMap attrs;
attrs["shape"] = out_dims;
attrs["inplace"] = false;
// Invoke Reshape Op
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
{{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
reshape_op->Run(scope, place);
}
};
class Flatten2OpMaker : public FlattenOpMaker {
public:
void Make() override {
FlattenOpMaker::Make();
AddOutput("XShape",
"XShape is just used to store the shape and lod of X, which will "
"be used in FlattenGradOp.")
.AsIntermediate();
}
};
class Flatten2GradOpMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("flatten2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Flatten2GradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("XShape"),
"Input(XShape) shouldn't be null.");
PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
auto xshape_dims = context->GetInputDim("XShape");
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
context->SetOutputDim(framework::GradVarName("X"), x_dims);
context->ShareLoD("XShape", framework::GradVarName("X"));
}
};
class Flatten2GradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto dx_name = Output(framework::GradVarName("X"));
auto dout_name = Input(framework::GradVarName("Out"));
auto xshape_name = Input("XShape");
auto xshape_dims =
scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(x_dims);
attrs["inplace"] = false;
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {dout_name}}, {"Shape", {}}},
{{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
reshape_op->Run(scope, place);
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -167,3 +277,8 @@ REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker, ...@@ -167,3 +277,8 @@ REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker,
ops::FlattenOpInferShape, ops::FlattenOpInferShape,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape); REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape);
REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker,
ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker);
REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp,
ops::Flatten2GradInferShape);
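The FIXME above describes the trick these two operators rely on: flatten2 emits XShape = {0, X.dims...} purely to carry X's shape and LoD into flatten2_grad, which recovers the original shape by slicing off the leading 0. A toy sketch with plain vectors in place of DDim:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> x_dims = {4, 3, 8};
  // Forward: prepend a dummy 0, mirroring Flatten2OpInferShape.
  std::vector<int64_t> xshape(x_dims.size() + 1, 0);
  for (size_t i = 0; i < x_dims.size(); ++i) xshape[i + 1] = x_dims[i];
  // Backward: drop the dummy, mirroring Flatten2GradInferShape's slice_ddim.
  std::vector<int64_t> recovered(xshape.begin() + 1, xshape.end());
  std::cout << recovered[0] << "x" << recovered[1] << "x" << recovered[2]
            << "\n";  // 4x3x8
}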
...@@ -30,14 +30,7 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -30,14 +30,7 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
"Input(WeightX) of GRU should not be null."); "Input(WeightX) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasInput("WeightH"), PADDLE_ENFORCE(ctx->HasInput("WeightH"),
"Input(WeightH) of GRU should not be null."); "Input(WeightH) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null."); PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
"Output(ReorderedH0) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
"Output(BatchedInput) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
"Output(BatchedOut) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"), PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(Hidden) of GRU should not be null."); "Output(Hidden) of GRU should not be null.");
...@@ -80,15 +73,20 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -80,15 +73,20 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
} }
framework::DDim out_dims({x_dims[0], frame_size}); framework::DDim out_dims({x_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims); ctx->SetOutputDim("Hidden", out_dims);
ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedOut", out_dims);
ctx->ShareLoD("X", "Hidden"); ctx->ShareLoD("X", "Hidden");
int xx_width; int xx_width;
if (ctx->Attrs().Get<bool>("use_seq")) { if (ctx->Attrs().Get<bool>("use_seq")) {
xx_width = wx_dims[1]; xx_width = wx_dims[1];
} else { } else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
"Output(ReorderedH0) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
"Output(BatchedInput) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
"Output(BatchedOut) of GRU should not be null.");
ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedOut", out_dims);
} }
ctx->SetOutputDim("XX", {x_dims[0], xx_width}); ctx->SetOutputDim("XX", {x_dims[0], xx_width});
ctx->ShareLoD("X", "XX"); ctx->ShareLoD("X", "XX");
......
...@@ -38,16 +38,6 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -38,16 +38,6 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
"Output(Hidden) of LSTM should not be null."); "Output(Hidden) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Cell"), PADDLE_ENFORCE(ctx->HasOutput("Cell"),
"Output(Cell) of LSTM should not be null."); "Output(Cell) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
"Output(BatchedInput) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
"Output(BatchedHidden) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
"Output(BatchedCell) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
"Output(ReorderedH0) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
"Output(ReorderedC0) of LSTM should not be null.");
  auto x_dims = ctx->GetInputDim("X");
  PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
@@ -88,28 +78,36 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
  PADDLE_ENFORCE_EQ(b_dims[0], 1,
                    "The first dimension of Input(Bias) should be 1.");
- PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_peepholes"),
-                "Do not support peephole yet.");
- PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
-                   "The second dimension of Input(Bias) should be "
-                   "4 * %d if disable peepholes connection",
-                   frame_size);
  PADDLE_ENFORCE_EQ(
      b_dims[1], (ctx->Attrs().Get<bool>("use_peepholes") ? 7 : 4) * frame_size,
      "The second dimension of Input(Bias) should be "
      "7 * %d if enable peepholes connection or"
      "4 * %d if disable peepholes",
      frame_size, frame_size);
  framework::DDim out_dims({x_dims[0], frame_size});
  ctx->SetOutputDim("Hidden", out_dims);
  ctx->SetOutputDim("Cell", out_dims);
- ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
- ctx->SetOutputDim("BatchedHidden", out_dims);
- ctx->SetOutputDim("BatchedCell", out_dims);
ctx->ShareLoD("X", "Hidden"); ctx->ShareLoD("X", "Hidden");
ctx->ShareLoD("X", "Cell"); ctx->ShareLoD("X", "Cell");
int xx_width; int xx_width;
if (ctx->Attrs().Get<bool>("use_seq")) { if (ctx->Attrs().Get<bool>("use_seq")) {
xx_width = wx_dims[1]; xx_width = wx_dims[1];
} else { } else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
"Output(BatchedInput) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
"Output(BatchedHidden) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
"Output(BatchedCell) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
"Output(ReorderedH0) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
"Output(ReorderedC0) of LSTM should not be null.");
ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedHidden", out_dims);
ctx->SetOutputDim("BatchedCell", out_dims);
  }
  ctx->SetOutputDim("XX", {x_dims[0], xx_width});
  ctx->ShareLoD("X", "XX");
@@ -242,7 +240,8 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
  auto* xx = ctx.Output<LoDTensor>("XX");              \
  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");  \
  auto* cell_out = ctx.Output<LoDTensor>("Cell");      \
- bool is_reverse = ctx.Attr<bool>("is_reverse");
  bool is_reverse = ctx.Attr<bool>("is_reverse");      \
bool use_peepholes = ctx.Attr<bool>("use_peepholes");
#define INIT_BASE_SIZES              \
  auto x_dims = x->dims(); /* T x M*/ \
@@ -253,99 +252,165 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
  const int D3 = D * 3;              \
  const int D4 = wh_dims[1];
#define INIT_BASE_INPUT_DATAS \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
/* diagonal weight*/ \
const T* wc_data = bias->data<T>() + D4; \
/* for peephole only*/ \
Tensor checked_cell; \
T* checked_cell_data = nullptr; \
auto place = ctx.GetPlace(); \
if (use_peepholes) { \
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \
checked_cell_data = checked_cell.mutable_data<T>({2, D}, place); \
}
/// Compute LSTM
#define GEMM_WH_ADDON(bs, prev, out) \
blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
wh_data, D4, static_cast<T>(1), out, D4)
// gates: W_ch, W_ih, W_fh, W_oh
#define GET_Ct(ct_1, gates, ct) \
/* C_t = C_t-1 * fgated + cand_gated * igated*/ \
act_cand(D, gates, gates); \
blas.VMUL(D, gates, gates + D, gates + D); \
blas.VMUL(D, ct_1, gates + D2, gates + D2); \
blas.VADD(D, gates + D, gates + D2, ct)
#define GET_Ht(ct, gates, ht) \
/* H_t = act_cell(C_t) * ogated */ \
act_cell(D, ct, gates + D2); \
blas.VMUL(D, gates + D2, gates + D3, ht)
#define GET_Ct_NOH0C0(gates, ct) \
/* C_t = igated * cgated*/ \
act_gate(D, gates + D, gates + D); \
act_cand(D, gates, gates); \
blas.VMUL(D, gates, gates + D, ct)
#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \
GET_Ct_NOH0C0(gates, ct); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \
GET_Ct_NOH0C0(gates, ct); \
/* get outgated, put W_oc * C_t on igated */ \
blas.VMUL(D, wc_data + D2, ct, gates + D); \
blas.VADD(D, gates + D, gates + D3, gates + D3); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt(gates, ct_1, ct, ht) \
act_gate(D3, gates + D, gates + D); \
GET_Ct(ct_1, gates, ct); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht) \
/* get fgated and igated*/ \
blas.VMUL(D, wc_data, ct_1, checked_cell_data); \
blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \
blas.VADD(D2, checked_cell_data, gates + D, gates + D); \
act_gate(D2, gates + D, gates + D); \
GET_Ct(ct_1, gates, ct); \
/* get ogated*/ \
blas.VMUL(D, wc_data + D2, ct, gates + D); \
blas.VADD(D, gates + D, gates + D3, gates + D3); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
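The macros above compute one fused LSTM step with the gate buffer laid out as
[candidate | input | forget | output], each of width D, and with the optional
peephole connections reading the diagonal weights wc. The following NumPy
sketch shows the same step, assuming the default sigmoid gate and tanh
cell/candidate activations; the helper name lstm_step is illustrative, not
part of the operator:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(gates, c_prev, wc=None):
    # Gate layout mirrors the macros: gates, gates+D, gates+D2, gates+D3.
    D = c_prev.shape[-1]
    c_hat, i, f, o = (gates[k * D:(k + 1) * D] for k in range(4))
    if wc is not None:  # peephole: C_{t-1} feeds the input and forget gates
        i = i + wc[:D] * c_prev
        f = f + wc[D:2 * D] * c_prev
    i, f = sigmoid(i), sigmoid(f)
    c = f * c_prev + i * np.tanh(c_hat)      # GET_Ct
    if wc is not None:  # peephole: C_t feeds the output gate
        o = o + wc[2 * D:] * c
    h = sigmoid(o) * np.tanh(c)              # GET_Ht
    return c, h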
void SeqCompute(const framework::ExecutionContext& ctx) const { void SeqCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = paddle::platform::CPUDeviceContext; using DeviceContext = paddle::platform::CPUDeviceContext;
INIT_BASE_INPUT_OUTPUT INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES INIT_BASE_SIZES
INIT_VEC_FUNC INIT_VEC_FUNC
INIT_BASE_INPUT_DATAS
auto x_lod = x->lod(); auto x_lod = x->lod();
const int total_T = x_dims[0]; const int total_T = x_dims[0];
const int N = x_lod[0].size() - 1; // batch size const int N = x_lod[0].size() - 1;
const T* x_data = x->data<T>();
const T* h0_data = h0 ? h0->data<T>() : nullptr; const T* h0_data = h0 ? h0->data<T>() : nullptr;
const T* c0_data = c0 ? c0->data<T>() : nullptr; const T* c0_data = c0 ? c0->data<T>() : nullptr;
const T* wx_data = wx->data<T>(); T* xx_data = xx->mutable_data<T>(place);
const T* wh_data = wh->data<T>(); T* h_out_data = hidden_out->mutable_data<T>(place);
T* xx_data = xx->mutable_data<T>(ctx.GetPlace()); T* c_out_data = cell_out->mutable_data<T>(place);
T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = math::GetBlas<DeviceContext, T>(ctx);
math::FCCompute<DeviceContext, T>(blas, total_T, D4, M, x_data, wx_data, math::FCCompute<DeviceContext, T>(blas, total_T, D4, M, x_data, wx_data,
xx_data, bias->data<T>()); xx_data, bias->data<T>());
int xx_offset = D4; int xx_offset = D4;
int gate_offset = D; int gate_offset = D;
if (is_reverse) { if (is_reverse) {
const int offset = (total_T - 1) * D; const int offset = (total_T - 1) * D;
xx_data = xx_data + offset * 4; xx_data = xx_data + offset * 4;
hidden_out_data = hidden_out_data + offset; h_out_data = h_out_data + offset;
cell_out_data = cell_out_data + offset; c_out_data = c_out_data + offset;
xx_offset = -D4; xx_offset = -D4;
gate_offset = -D; gate_offset = -D;
} }
auto move_step = [&]() { #define MOVE_ONE_STEP \
xx_data = xx_data + xx_offset; prev_h_data = h_out_data; \
hidden_out_data = hidden_out_data + gate_offset; prev_c_data = c_out_data; \
cell_out_data = cell_out_data + gate_offset; xx_data = xx_data + xx_offset; \
}; h_out_data = h_out_data + gate_offset; \
c_out_data = c_out_data + gate_offset
for (int i = 0; i < N; ++i) {
int bid = is_reverse ? N - 1 - i : i; #define PROCESS_H0C0_DEFINES \
int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; int bid = is_reverse ? N - 1 - i : i; \
const T* prev_c_data = nullptr; int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; \
const T* prev_h_data = nullptr; const T* prev_c_data = nullptr; \
int tstart = 0; const T* prev_h_data = nullptr; \
if (h0_data) { int tstart = 0
prev_h_data = h0_data + bid * D;
prev_c_data = c0_data + bid * D; #define PROCESS_H0C0_PEEPHOLE \
} else { PROCESS_H0C0_DEFINES; \
// W_ch, W_ih, W_fh, W_oh if (h0_data) { \
act_gate(D3, xx_data + D, xx_data + D); prev_h_data = h0_data + bid * D; \
act_cand(D, xx_data, xx_data); prev_c_data = c0_data + bid * D; \
// cell out= input*tilde } else { \
blas.VMUL(D, xx_data, xx_data + D, cell_out_data); COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \
// hidden out= act_state(cellout) * outgate MOVE_ONE_STEP; \
act_cell(D, cell_out_data, xx_data + D2); tstart = 1; \
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data); }
// prev
prev_h_data = hidden_out_data;
prev_c_data = cell_out_data;
tstart = 1;
move_step();
}
for (int step = tstart; step < seq_len; ++step) {
blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1),
prev_h_data, D, wh_data, D4, static_cast<T>(1), xx_data, D4);
// W_ch, W_ih, W_fh, W_oh
act_gate(D3, xx_data + D, xx_data + D);
act_cand(D, xx_data, xx_data);
// a = forget * prev_cell
blas.VMUL(D, xx_data + D2, prev_c_data, xx_data + D2);
// b = input * tilde
blas.VMUL(D, xx_data, xx_data + D, xx_data + D);
// cell out= a+b
blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data);
// hidden out= act_state(cellout) * outgate
act_cell(D, cell_out_data, xx_data + D2);
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
// prev #define PROCESS_H0C0 \
prev_h_data = hidden_out_data; PROCESS_H0C0_DEFINES; \
prev_c_data = cell_out_data; if (h0_data) { \
prev_h_data = h0_data + bid * D; \
prev_c_data = c0_data + bid * D; \
} else { \
COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \
MOVE_ONE_STEP; \
tstart = 1; \
}
move_step(); if (use_peepholes) {
for (int i = 0; i < N; ++i) {
PROCESS_H0C0_PEEPHOLE
for (int step = tstart; step < seq_len; ++step) {
GEMM_WH_ADDON(1, prev_h_data, xx_data);
COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data);
MOVE_ONE_STEP;
}
}
} else {
for (int i = 0; i < N; ++i) {
PROCESS_H0C0
for (int step = tstart; step < seq_len; ++step) {
GEMM_WH_ADDON(1, prev_h_data, xx_data);
COMPUTE_CtHt(xx_data, prev_c_data, c_out_data, h_out_data);
MOVE_ONE_STEP;
}
      }
    }
#undef PROCESS_H0C0_DEFINES
#undef PROCESS_H0C0_PEEPHOLE
#undef PROCESS_H0C0
#undef MOVE_ONE_STEP
  }
  void BatchCompute(const framework::ExecutionContext& ctx) const {
@@ -357,17 +422,13 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
    }
    INIT_BASE_SIZES
    INIT_VEC_FUNC
INIT_BASE_INPUT_DATAS
auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0"); auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0"); auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0");
auto* batched_input = ctx.Output<LoDTensor>("BatchedInput"); auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
auto* batched_c_out = ctx.Output<LoDTensor>("BatchedCell"); auto* batched_c_out = ctx.Output<LoDTensor>("BatchedCell");
auto* batched_h_out = ctx.Output<LoDTensor>("BatchedHidden"); auto* batched_h_out = ctx.Output<LoDTensor>("BatchedHidden");
const T* x_data = x->data<T>();
const T* wx_data = wx->data<T>();
const T* wh_data = wh->data<T>();
auto place = ctx.GetPlace();
T* xx_data = xx->mutable_data<T>(place); T* xx_data = xx->mutable_data<T>(place);
T* batched_input_data = batched_input->mutable_data<T>(place); T* batched_input_data = batched_input->mutable_data<T>(place);
T* batched_c_out_data = batched_c_out->mutable_data<T>(place); T* batched_c_out_data = batched_c_out->mutable_data<T>(place);
@@ -419,17 +480,14 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
      T* cur_in_data = batched_input_data;
      T* cur_h_out_data = batched_h_out_data;
      T* cur_c_out_data = batched_c_out_data;
-     // W_ch, W_ih, W_fh, W_oh
      for (int i = 0; i < max_bs; ++i) {
-       act_gate(D3, cur_in_data + D, cur_in_data + D);
-       act_cand(D, cur_in_data, cur_in_data);
-       // cell out= input*tilde
-       blas.VMUL(D, cur_in_data, cur_in_data + D, cur_c_out_data);
-       // hidden out= act_state(cellout) * outgate
-       act_cell(D, cur_c_out_data, cur_in_data + D2);
-       blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data);
-       // add offset
        GET_Ct_NOH0C0(cur_in_data, cur_c_out_data);
        if (use_peepholes) {
          blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D);
          blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3);
        }
        act_gate(D, cur_in_data + D3, cur_in_data + D3);
        GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data);
        cur_in_data += D4;
        cur_c_out_data += D;
        cur_h_out_data += D;
@@ -438,49 +496,60 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
      prev_h_data = batched_h_out_data;
      prev_c_data = batched_c_out_data;
    }
-   // Then start from next
    const auto& batch_starts = batched_lod[0];
    const int max_seq_len = batch_starts.size() - 1;
    const int offset = tstart * max_bs * D;
    batched_input_data = batched_input_data + offset * 4;
    batched_h_out_data = batched_h_out_data + offset;
    batched_c_out_data = batched_c_out_data + offset;
-   for (int step = tstart; step < max_seq_len; ++step) {
-     const int cur_bs = batch_starts[step + 1] - batch_starts[step];
-     blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D4, D, static_cast<T>(1),
-               prev_h_data, D, wh_data, D4, static_cast<T>(1),
-               batched_input_data, D4);
-     T* cur_in_data = batched_input_data;
-     T* cur_prev_c_data = prev_c_data;
-     T* cur_c_out_data = batched_c_out_data;
-     T* cur_h_out_data = batched_h_out_data;
-     for (int i = 0; i < cur_bs; ++i) {
-       // W_ch, W_ih, W_fh, W_oh
-       act_gate(D3, cur_in_data + D, cur_in_data + D);
-       act_cand(D, cur_in_data, cur_in_data);
-       // a = forget * prev_cell
-       blas.VMUL(D, cur_in_data + D2, cur_prev_c_data, cur_in_data + D2);
-       // b = input * tilde
-       blas.VMUL(D, cur_in_data, cur_in_data + D, cur_in_data + D);
-       // cell out= a+b
-       blas.VADD(D, cur_in_data + D, cur_in_data + D2, cur_c_out_data);
-       // hidden out= act_state(cellout) * outgate
-       act_cell(D, cur_c_out_data, cur_in_data + D2);
-       blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data);
-       cur_in_data += D4;
-       cur_prev_c_data += D;
-       cur_c_out_data += D;
-       cur_h_out_data += D;
-     }
-     prev_c_data = batched_c_out_data;
-     prev_h_data = batched_h_out_data;
-     batched_c_out_data = cur_c_out_data;
-     batched_h_out_data = cur_h_out_data;
-     batched_input_data = cur_in_data;
-   }

#define DEFINE_CUR                        \
  T* cur_in_data = batched_input_data;    \
  T* cur_prev_c_data = prev_c_data;       \
  T* cur_c_out_data = batched_c_out_data; \
  T* cur_h_out_data = batched_h_out_data

#define MOVE_ONE_BATCH  \
  cur_in_data += D4;    \
  cur_prev_c_data += D; \
  cur_c_out_data += D;  \
  cur_h_out_data += D

#define MOVE_ONE_STEP                  \
  prev_c_data = batched_c_out_data;    \
  prev_h_data = batched_h_out_data;    \
  batched_c_out_data = cur_c_out_data; \
  batched_h_out_data = cur_h_out_data; \
  batched_input_data = cur_in_data

    if (use_peepholes) {
      for (int step = tstart; step < max_seq_len; ++step) {
        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
        DEFINE_CUR;
        for (int i = 0; i < cur_bs; ++i) {
          COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data,
                                cur_h_out_data);
          MOVE_ONE_BATCH;
        }
        MOVE_ONE_STEP;
      }
    } else {
      for (int step = tstart; step < max_seq_len; ++step) {
        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
        DEFINE_CUR;
        for (int i = 0; i < cur_bs; ++i) {
          COMPUTE_CtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
                       cur_h_out_data);
          MOVE_ONE_BATCH;
        }
        MOVE_ONE_STEP;
      }
    }
#undef MOVE_ONE_STEP
#undef MOVE_ONE_BATCH
#undef DEFINE_CUR
    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
    batched_h_out->set_lod(batched_lod);
@@ -496,6 +565,16 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
      BatchCompute(ctx);
    }
  }
#undef COMPUTE_CtHt_PEEPHOLE
#undef COMPUTE_CtHt
#undef GET_Ct_NOH0C0
#undef COMPUTE_CtHt_NOH0C0
#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0
#undef GET_Ht
#undef GET_Ct
#undef GEMM_WH_ADDON
#undef INIT_BASE_INPUT_DATAS
#undef INIT_BASE_SIZES
#undef INIT_BASE_INPUT_OUTPUT
#undef INIT_VEC_FUNC
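BatchCompute above assumes the input has been reordered into time-step-major
batches (Batch2LoDTensorFunctor converts back to sequence order at the end), so
step t issues one GEMM over the cur_bs sequences that still have a token at t
instead of N tiny GEMMs. A sketch of how the batch_starts offsets fall out of
the sequence lengths, under that assumption (function name illustrative):

def batch_starts_from_lens(lens):
    lens = sorted(lens, reverse=True)
    starts = [0]
    for t in range(lens[0]):
        alive = sum(1 for l in lens if l > t)  # cur_bs at time step t
        starts.append(starts[-1] + alive)
    return starts

print(batch_starts_from_lens([3, 1, 2]))  # [0, 3, 5, 6] -> cur_bs = 3, 2, 1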
......
@@ -67,27 +67,27 @@ template <typename T, int BlockDim>
__global__ void LayerNormForward(const T *x, const T *scale, const T *bias,
                                 T *y, T *mean, T *var, float epsilon,
                                 int feature_size) {
- using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
  using BlockReduce = cub::BlockReduce<PairForLayerNorm<double>, BlockDim>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  int beg_idx = blockIdx.x * feature_size + threadIdx.x;
  int end_idx = (blockIdx.x + 1) * feature_size;
  // Step 1: Reduce to calculate mean and var
- T mean_val = static_cast<T>(0);
- T var_val = static_cast<T>(0);
  double mean_val = 0;
  double var_val = 0;
  for (int i = beg_idx; i < end_idx; i += BlockDim) {
    T tmp = x[i];
    mean_val += tmp;
    var_val += (tmp * tmp);
  }
  auto pair = BlockReduce(temp_storage)
-                 .Reduce(PairForLayerNorm<T>(mean_val, var_val),
-                         PairForLayerNormAddFunctor<T>());
                  .Reduce(PairForLayerNorm<double>(mean_val, var_val),
                          PairForLayerNormAddFunctor<double>());
  if (threadIdx.x == 0) {
    auto tmp = pair.first_ / feature_size;
-   mean[blockIdx.x] = tmp;
-   var[blockIdx.x] = pair.second_ / feature_size - tmp * tmp;
    mean[blockIdx.x] = static_cast<T>(tmp);
    var[blockIdx.x] = static_cast<T>(pair.second_ / feature_size - tmp * tmp);
  }
  __syncthreads();
  mean_val = mean[blockIdx.x];
...
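The switch from T to double in the layer_norm reduction above matters because
the kernel computes the variance as E[x^2] - E[x]^2, which cancels
catastrophically in float32 when the mean is large relative to the spread. A
small NumPy illustration of the effect (the values are illustrative):

import numpy as np

x = (1000.0 + 0.01 * np.random.randn(4096)).astype(np.float32)

def moments(x, dtype):
    xd = x.astype(dtype)
    m = xd.mean()
    return m, (xd * xd).mean() - m * m  # same E[x^2] - E[x]^2 form as the kernel

print(moments(x, np.float32)[1])  # often negative or far from 1e-4
print(moments(x, np.float64)[1])  # close to the true variance of ~1e-4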
@@ -57,7 +57,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
        memset(output + i * row_width, 0, row_width * sizeof(T));
      } else {
        PADDLE_ENFORCE_LT(ids[i], row_number);
-       PADDLE_ENFORCE_GE(ids[i], 0);
        PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i);
        memcpy(output + i * row_width, table + ids[i] * row_width,
               row_width * sizeof(T));
      }
...
@@ -246,6 +246,88 @@ class ReshapeGradKernel {
  }
};
// FIXME(zcd): reshape2 adds an intermediate output (XShape) based on reshape.
// XShape carries the shape and LoD of X, which reshape2_grad later consumes;
// this lets the framework reuse the memory of X as soon as the reshape2 op
// finishes. For compatibility reasons we cannot change reshape_op itself.
class Reshape2Op : public ReshapeOp {
public:
Reshape2Op(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: ReshapeOp(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
ReshapeOp::InferShape(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) of ReshapeOp should not be null.");
const auto &x_dims = ctx->GetInputDim("X");
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape");
}
};
class Reshape2OpMaker : public ReshapeOpMaker {
public:
void Make() override {
ReshapeOpMaker::Make();
AddOutput("XShape",
"XShape is just used to store the shape and lod of X, which will "
"be used in FlattenGradOp.")
.AsIntermediate();
}
};
class Reshape2GradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("reshape2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Reshape2GradOp : public framework::OperatorWithKernel {
public:
Reshape2GradOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
auto xshape_dims = ctx->GetInputDim("XShape");
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("XShape", framework::GradVarName("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
->type()),
ctx.device_context());
}
};
}  // namespace operators
}  // namespace paddle
namespace ops = paddle::operators;
@@ -261,6 +343,17 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
                               ops::ReshapeGradKernel, int64_t,
                               ops::ReshapeGradKernel);
REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker,
ops::Reshape2GradMaker);
REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp);
REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
ops::ReshapeKernel, int, ops::ReshapeKernel,
int64_t, ops::ReshapeKernel);
REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
double, ops::ReshapeGradKernel, int,
ops::ReshapeGradKernel, int64_t,
ops::ReshapeGradKernel);
#ifdef PADDLE_WITH_CUDA
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
                                ops::ReshapeKernel, int, ops::ReshapeKernel,
@@ -269,4 +362,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
                                double, ops::ReshapeGradKernel, int,
                                ops::ReshapeGradKernel, int64_t,
                                ops::ReshapeGradKernel);
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
ops::ReshapeKernel, int, ops::ReshapeKernel,
int64_t, ops::ReshapeKernel);
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
double, ops::ReshapeGradKernel, int,
ops::ReshapeGradKernel, int64_t,
ops::ReshapeGradKernel);
#endif
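The XShape trick used by reshape2 (and by squeeze2, transpose2, and unsqueeze2
below) can be summarized in a few lines: the forward op emits a zero-headed
copy of X's dims, and the grad op recovers dX's shape from it without keeping
X alive. A NumPy sketch under those assumptions (the function names are
illustrative, not the framework's API):

import numpy as np

def reshape2_forward(x, shape):
    xshape_dims = (0,) + x.shape  # only the dims matter, never the data
    return x.reshape(shape), xshape_dims

def reshape2_grad(dout, xshape_dims):
    return dout.reshape(xshape_dims[1:])  # slice off the leading 0, as above

x = np.arange(6).reshape(2, 3)
out, xshape = reshape2_forward(x, (3, 2))
dx = reshape2_grad(np.ones_like(out), xshape)
assert dx.shape == x.shape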
@@ -36,9 +36,13 @@ class RmspropOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                   "Output(param_out) of RmspropOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
-                  "Output(Momentum_out) of RmspropOp should not be null.");
                   "Output(MomentOut) of RmspropOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"),
                   "Output(MeanSquareOut) of RmspropOp should not be null.");
if (ctx->Attrs().Get<bool>("centered")) {
PADDLE_ENFORCE(ctx->HasOutput("MeanGradOut"),
"Output(MeanGradOut) of RmspropOp should not be null.");
}
auto param_dim = ctx->GetInputDim("Param"); auto param_dim = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
...@@ -58,6 +62,9 @@ class RmspropOp : public framework::OperatorWithKernel { ...@@ -58,6 +62,9 @@ class RmspropOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("ParamOut", param_dim);
ctx->SetOutputDim("MomentOut", param_dim); ctx->SetOutputDim("MomentOut", param_dim);
ctx->SetOutputDim("MeanSquareOut", param_dim); ctx->SetOutputDim("MeanSquareOut", param_dim);
if (ctx->Attrs().Get<bool>("centered")) {
ctx->SetOutputDim("MeanGradOut", param_dim);
}
  }
};
@@ -70,6 +77,10 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("MeanSquare",
             "(Tensor, default Tensor<float>)"
             " The mean square value that gets updated.");
AddInput("MeanGrad",
"(Tensor, default Tensor<float>)"
" The moving average of gradient")
.AsDispensable();
AddInput("LearningRate", AddInput("LearningRate",
"(Tensor, default Tensor<float>) " "(Tensor, default Tensor<float>) "
"The learning rate should be a tensor of size 1."); "The learning rate should be a tensor of size 1.");
...@@ -82,6 +93,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -82,6 +93,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("ParamOut", "(Tensor) Output updated parameter value."); AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
AddOutput("MomentOut", "(Tensor) Output updated moment."); AddOutput("MomentOut", "(Tensor) Output updated moment.");
AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value."); AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value.");
AddOutput("MeanGradOut",
"(Tensor) Output moving average of gradient updated value.");
AddAttr<float>("epsilon", AddAttr<float>("epsilon",
"(float, default 1e-10) Constant " "(float, default 1e-10) Constant "
...@@ -93,6 +106,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -93,6 +106,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(0.9f); .SetDefault(0.9f);
AddAttr<float>("momentum", "(float, default 0.0) Constant value.") AddAttr<float>("momentum", "(float, default 0.0) Constant value.")
.SetDefault(0.0f); .SetDefault(0.0f);
AddAttr<bool>("centered", "(bool, default false) use centered rmsprop.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Rmsprop Optimizer. Rmsprop Optimizer.
...@@ -103,6 +118,14 @@ MomentOut = momentum * Moment + ...@@ -103,6 +118,14 @@ MomentOut = momentum * Moment +
ParamOut = Param - MomentOut ParamOut = Param - MomentOut
$$ $$
if centered is true:
mean_grad = decay * mean_square{t-1} + (1-decay) * gradient
mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
mom = momentum * mom{t-1} + learning_rate * g_t /
sqrt(mean_square - mean_grad**2 + epsilon)
param -= mom
The original slides that proposed Rmsprop: Slide 29 of
http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
...
@@ -41,6 +41,7 @@ class RmspropOpKernel : public framework::OpKernel<T> {
    float epsilon = ctx.Attr<float>("epsilon");
    float rho = ctx.Attr<float>("decay");
    float momentum = ctx.Attr<float>("momentum");
bool centered = ctx.Attr<bool>("centered");
auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param")); auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare")); auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare"));
...@@ -53,12 +54,24 @@ class RmspropOpKernel : public framework::OpKernel<T> { ...@@ -53,12 +54,24 @@ class RmspropOpKernel : public framework::OpKernel<T> {
auto ms_out = EigenVector<T>::Flatten(*mean_square_out); auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
Eigen::DSizes<int, 1> grad_dsize(grad->numel()); Eigen::DSizes<int, 1> grad_dsize(static_cast<int>(grad->numel()));
ms_out.device(place) = rho * ms + (1 - rho) * g * g; ms_out.device(place) = rho * ms + (1 - rho) * g * g;
mom_out.device(place) = if (centered) {
momentum * mom + auto mg = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanGrad"));
lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt(); auto* mean_grad_out = ctx.Output<Tensor>("MeanGradOut");
mean_grad_out->mutable_data<T>(ctx.GetPlace());
auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
mg_out.device(place) = rho * mg + (1 - rho) * g;
mom_out.device(place) = momentum * mom +
lr.broadcast(grad_dsize) * g /
(ms_out - mg_out.square() + epsilon).sqrt();
} else {
mom_out.device(place) =
momentum * mom +
lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt();
}
    p_out.device(place) = p - mom_out;
  }
};
...
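Putting the DOC equations and the kernel branch above together, one centered
RMSProp step looks like this in NumPy (a sketch of the update rule, not of the
Eigen kernel):

import numpy as np

def rmsprop_step(param, grad, ms, mg, mom, lr,
                 decay=0.95, momentum=0.0, epsilon=1e-10, centered=False):
    ms = decay * ms + (1 - decay) * grad * grad
    if centered:
        mg = decay * mg + (1 - decay) * grad
        denom = np.sqrt(ms - mg ** 2 + epsilon)  # variance-like estimate
    else:
        denom = np.sqrt(ms + epsilon)            # plain RMSProp
    mom = momentum * mom + lr * grad / denom
    return param - mom, ms, mg, mom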
@@ -126,15 +126,15 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
        .SetDefault({});
    AddComment(R"DOC(
Squeeze Operator.

Remove single-dimensional entries from the shape of a tensor.
Takes a parameter axes with a list of axes to squeeze.
If axes is not provided, all the single dimensions will be removed from the shape.
If an axis is selected with shape entry not equal to one, an error is raised.

Examples:
Case 1:
  Given
    X.shape = (1, 3, 1, 5)
  and
    axes = [0]
@@ -144,7 +144,7 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
Case 2:
  Given
    X.shape = (1, 3, 1, 5)
  and
    axes = []
  we get:
    Out.shape = (3, 5)
@@ -181,6 +181,113 @@ class SqueezeGradOp : public framework::OperatorBase {
  }
};
// FIXME(zcd): squeeze2 adds an intermediate output (XShape) based on squeeze.
// XShape carries the shape and LoD of X, which squeeze2_grad later consumes;
// this lets the framework reuse the memory of X as soon as the squeeze2 op
// finishes. For compatibility reasons we cannot change squeeze_op itself.
class Squeeze2OpMaker : public SqueezeOpMaker {
public:
void Make() override {
SqueezeOpMaker::Make();
AddOutput("XShape",
"XShape is just used to store the shape and lod of X, which will "
"be used in SqueezeGradOp.")
.AsIntermediate();
}
};
class Squeeze2OpInferShape : public SqueezeOpInferShape {
public:
void operator()(framework::InferShapeContext *ctx) const override {
SqueezeOpInferShape::operator()(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) of Squeeze operator should not be null.");
const auto &x_dims = ctx->GetInputDim("X");
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape");
}
};
class Squeeze2Op : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto &axes = Attr<std::vector<int>>("axes");
auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims);
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(out_dims);
// Invoke Reshape Op
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
{{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
reshape_op->Run(scope, place);
}
};
class Squeeze2GradOpMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("squeeze2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Squeeze2GradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("XShape"),
"Input(XShape) shouldn't be null.");
PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
auto xshape_dims = context->GetInputDim("XShape");
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
context->SetOutputDim(framework::GradVarName("X"), x_dims);
context->ShareLoD("XShape", framework::GradVarName("X"));
}
};
class Squeeze2GradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto dx_name = Output(framework::GradVarName("X"));
auto dout_name = Input(framework::GradVarName("Out"));
auto xshape_name = Input("XShape");
auto xshape_dims =
scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(x_dims);
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {dout_name}}, {"Shape", {}}},
{{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
reshape_op->Run(scope, place);
}
};
}  // namespace operators
}  // namespace paddle
@@ -192,3 +299,8 @@ REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
                  ops::SqueezeOpInferShape,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, ops::SqueezeGradInferShape);
REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker,
ops::Squeeze2OpInferShape, ops::Squeeze2GradOpMaker);
REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
ops::Squeeze2GradInferShape);
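Note that Squeeze2Op and Squeeze2GradOp carry no kernels of their own: RunImpl
just computes the target dims and delegates to reshape2. The shape computation
reduces to the following NumPy sketch (the helper name is illustrative):

import numpy as np

def squeeze_shape(shape, axes):
    if not axes:  # no axes given: drop every size-1 dimension
        return [d for d in shape if d != 1]
    axes = [a % len(shape) for a in axes]
    assert all(shape[a] == 1 for a in axes), "squeezed axes must have size 1"
    return [d for i, d in enumerate(shape) if i not in axes]

x = np.zeros((1, 3, 1, 5))
out = x.reshape(squeeze_shape(x.shape, [0]))  # Case 1 above: (3, 1, 5)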
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/transpose_op.h"
#include <string>
#include <vector>
namespace paddle {
@@ -24,7 +25,7 @@ class TransposeOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
- void InferShape(framework::InferShapeContext* ctx) const override {
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
    auto x_dims = ctx->GetInputDim("X");
@@ -90,7 +91,7 @@ The behavior of this operator is similar to how `numpy.transpose` works.
    2 &5
    \end{pmatrix}$$
- Given a input tensor with shape $(N, C, H, W)$ and the `axes` is - Given a input tensor with shape $(N, C, H, W)$ and the `axes` is
$[0, 2, 3, 1]$, then shape of the output tensor will be: $(N, H, W, C)$.
)DOC");
@@ -101,7 +102,7 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
- void InferShape(framework::InferShapeContext* ctx) const override {
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null");
@@ -113,6 +114,93 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
  }
};
// FIXME(zcd): transpose2 adds an intermediate output (XShape) based on
// transpose. XShape carries the shape and LoD of X, which transpose2_grad
// later consumes; this lets the framework reuse the memory of X as soon as
// the transpose2 op finishes. For compatibility reasons we cannot change
// transpose_op itself.
class Transpose2Op : public TransposeOp {
public:
Transpose2Op(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: TransposeOp(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
TransposeOp::InferShape(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) should not be null");
const auto &in_dims = ctx->GetInputDim("X");
std::vector<int64_t> x_shape_dim(in_dims.size() + 1);
x_shape_dim[0] = 0;
for (int i = 0; i < in_dims.size(); ++i) {
x_shape_dim[i + 1] = in_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(x_shape_dim));
ctx->ShareLoD("X", /*->*/ "XShape");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.device_context());
}
};
class Transpose2OpMaker : public TransposeOpMaker {
public:
void Make() override {
TransposeOpMaker::Make();
AddOutput("XShape", "(Tensor)The output tensor.").AsIntermediate();
}
};
class Transpose2GradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("transpose2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Transpose2OpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
if (ctx->HasOutput(framework::GradVarName("X"))) {
auto xshape_dim = ctx->GetInputDim("XShape");
auto x_shape_dim =
framework::slice_ddim(xshape_dim, 1, xshape_dim.size());
ctx->SetOutputDim(framework::GradVarName("X"), x_shape_dim);
ctx->ShareLoD("XShape", framework::GradVarName("X"));
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
->type()),
ctx.device_context());
}
};
}  // namespace operators
}  // namespace paddle
@@ -120,8 +208,20 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
    transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
    transpose_grad,
    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker,
ops::Transpose2GradMaker);
REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad);
REGISTER_OP_CPU_KERNEL(
transpose2,
ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
transpose2_grad,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
@@ -21,3 +21,10 @@ REGISTER_OP_CUDA_KERNEL(
REGISTER_OP_CUDA_KERNEL(
    transpose_grad,
    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
transpose2,
ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
transpose2_grad,
ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
@@ -127,13 +127,13 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
    });
    AddComment(R"DOC(
Unsqueeze Operator.

Insert single-dimensional entries to the shape of a tensor.
Takes one required argument axes, a list of dimensions that will be inserted.
Dimension indices in axes are as seen in the output tensor.

For example:
  Given a tensor such that tensor with shape [3, 4, 5],
  then Unsqueeze(tensor, axes=[0, 4]) has shape [1, 3, 4, 5, 1]
)DOC");
  }
@@ -168,6 +168,112 @@ class UnsqueezeGradOp : public framework::OperatorBase {
  }
};
// FIXME(zcd): unsqueeze2 adds an intermediate output (XShape) based on
// unsqueeze. XShape carries the shape and LoD of X, which unsqueeze2_grad
// later consumes; this lets the framework reuse the memory of X as soon as
// the unsqueeze2 op finishes. For compatibility reasons we cannot change
// unsqueeze_op itself.
class Unsqueeze2OpInferShape : public UnsqueezeOpInferShape {
public:
void operator()(framework::InferShapeContext *ctx) const override {
UnsqueezeOpInferShape::operator()(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) of Unsqueeze operator should not be null.");
const auto &x_dims = ctx->GetInputDim("X");
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape");
}
};
class Unsqueeze2OpMaker : public UnsqueezeOpMaker {
public:
void Make() override {
UnsqueezeOpMaker::Make();
AddOutput("XShape",
"XShape is just used to store the shape and lod of X, which will "
"be used in UnsqueezeGradOp.")
.AsIntermediate();
}
};
class Unsqueeze2Op : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto &axes = Attr<std::vector<int>>("axes");
auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
auto out_dims = Unsqueeze2OpInferShape::GetOutputShape(axes, x_dims);
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(out_dims);
// Invoke Reshape op.
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
{{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
reshape_op->Run(scope, place);
}
};
class Unsqueeze2GradOpMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("unsqueeze2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Unsqueeze2GradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("XShape"),
"Input(XShape) shouldn't be null.");
PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
auto xshape_dims = context->GetInputDim("XShape");
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
context->SetOutputDim(framework::GradVarName("X"), x_dims);
context->ShareLoD("XShape", framework::GradVarName("X"));
}
};
class Unsqueeze2GradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto dx_name = Output(framework::GradVarName("X"));
auto dout_name = Input(framework::GradVarName("Out"));
auto xshape_name = Input("XShape");
auto xshape_dims =
scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(x_dims);
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {dout_name}}, {"Shape", {}}},
{{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
reshape_op->Run(scope, place);
}
};
}  // namespace operators
}  // namespace paddle
@@ -180,3 +286,8 @@ REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
                  ops::UnsqueezeGradInferShape);
REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker,
ops::Unsqueeze2OpInferShape, ops::Unsqueeze2GradOpMaker);
REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp,
ops::Unsqueeze2GradInferShape);
@@ -121,6 +121,12 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
  if (nullptr == dso_handle) {
    LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
                 << dlerror() << ")";
if (dlPath.find("nccl") != std::string::npos) {
std::cout
<< "You may need to install 'nccl2' from NVIDIA official website: "
<< "https://developer.nvidia.com/nccl/nccl-download"
<< "before install PaddlePaddle" << std::endl;
}
    dlPath = dso_name;
    dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
  }
...
@@ -100,14 +100,13 @@ struct NCCLContextMap {
      return;
    }
    std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
-   // if pass nccl_id here, can assume we are doing multi node training
-   if (nccl_id == nullptr) {
    // if num_trainers == 1, should create a new nccl id for local comms.
    if (num_trainers == 1) {
      std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
      PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
          comms.get(), static_cast<int>(order_.size()), order_.data()));
    } else {
-     PADDLE_ENFORCE_GT(num_trainers, 1);
-     // TODO(wuyi): need to ensure each node have same number of GPUs
      PADDLE_ENFORCE_NOT_NULL(nccl_id);
      {
        int nranks = num_trainers * order_.size();
        NCCLGroupGuard gurad;
...
@@ -115,6 +115,7 @@ function cmake_gen() {
        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
        -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-DWITH_INFERENCE=${WITH_INFERENCE:-ON}
        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
        -DPY_VERSION=${PY_VERSION:-2.7}
    ========================================
@@ -144,6 +145,7 @@ EOF
        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
        -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-DWITH_INFERENCE=${WITH_INFERENCE:-ON} \
        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
        -DPY_VERSION=${PY_VERSION:-2.7}
}
@@ -498,7 +500,7 @@ EOF
EOF
    if [[ ${WITH_GPU} == "ON" ]]; then
-       NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.1.2-1+cuda${CUDA_MAJOR} libnccl-dev=2.1.2-1+cuda${CUDA_MAJOR} &&"
        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} &&"
    else
        NCCL_DEPS=""
    fi
@@ -545,14 +547,14 @@ function gen_capi_package() {
        rm -rf $install_prefix
        make DESTDIR="$install_prefix" install
        cd $install_prefix/usr/local
-       ls | egrep -v "^Found.*item$" | xargs tar -cf ${PADDLE_ROOT}/build/paddle.tgz
        ls | egrep -v "^Found.*item$" | xargs tar -czf ${PADDLE_ROOT}/build/paddle.tgz
    fi
}

function gen_fluid_inference_lib() {
    mkdir -p ${PADDLE_ROOT}/build
    cd ${PADDLE_ROOT}/build
-   if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
      cat <<EOF
    ========================================
    Deploying fluid inference library ...
@@ -567,7 +569,7 @@ EOF
}

function test_fluid_inference_lib() {
-   if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
      cat <<EOF
    ========================================
    Testing fluid inference library ...
...
@@ -104,7 +104,7 @@ def batch_images_from_tar(data_file,
            pickle.dump(
                output,
                open('%s/batch_%d' % (out_path, file_id), 'wb'),
-               protocol=pickle.HIGHEST_PROTOCOL)
                protocol=2)
            file_id += 1
            data = []
            labels = []
@@ -113,9 +113,7 @@ def batch_images_from_tar(data_file,
    output['label'] = labels
    output['data'] = data
-   pickle.dump(
-       output,
-       open('%s/batch_%d' % (out_path, file_id), 'wb'),
-       protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(
        output, open('%s/batch_%d' % (out_path, file_id), 'wb'), protocol=2)
    with open(meta_file, 'a') as meta:
        for file in os.listdir(out_path):
...
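The move from pickle.HIGHEST_PROTOCOL to protocol=2 above keeps the batch
files portable: under Python 3, HIGHEST_PROTOCOL is 3 or higher, which
Python 2 cannot read, while protocol 2 is understood by both. For example:

import pickle

output = {'label': [0, 1], 'data': [[0.1, 0.2], [0.3, 0.4]]}
with open('batch_0', 'wb') as f:
    pickle.dump(output, f, protocol=2)  # readable from Python 2 and 3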
...@@ -78,7 +78,7 @@ def accuracy(input, label, k=1, correct=None, total=None): ...@@ -78,7 +78,7 @@ def accuracy(input, label, k=1, correct=None, total=None):
return acc_out return acc_out
def auc(input, label, curve='ROC', num_thresholds=200, topk=1): def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
""" """
**Area Under the Curve (AUC) Layer** **Area Under the Curve (AUC) Layer**
...@@ -118,16 +118,14 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1): ...@@ -118,16 +118,14 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
""" """
helper = LayerHelper("auc", **locals()) helper = LayerHelper("auc", **locals())
auc_out = helper.create_tmp_variable(dtype="float64") auc_out = helper.create_tmp_variable(dtype="float64")
batch_auc_out = helper.create_tmp_variable(dtype="float64")
# make tp, tn, fp, fn persistable, so that they can accumulate over all batches. # make tp, tn, fp, fn persistable, so that they can accumulate over all batches.
tp = helper.create_global_variable( stat_pos = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds]) persistable=True, dtype='int64', shape=[num_thresholds + 1])
tn = helper.create_global_variable( stat_neg = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds]) persistable=True, dtype='int64', shape=[num_thresholds + 1])
fp = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds]) for var in [stat_pos, stat_neg]:
fn = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds])
for var in [tp, tn, fp, fn]:
helper.set_variable_initializer( helper.set_variable_initializer(
var, Constant( var, Constant(
value=0.0, force_cpu=True)) value=0.0, force_cpu=True))
...@@ -137,18 +135,15 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1): ...@@ -137,18 +135,15 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
inputs={ inputs={
"Predict": [input], "Predict": [input],
"Label": [label], "Label": [label],
"TP": [tp], "StatPos": [stat_pos],
"TN": [tn], "StatNeg": [stat_neg]
"FP": [fp],
"FN": [fn]
}, },
attrs={"curve": curve, attrs={"curve": curve,
"num_thresholds": num_thresholds}, "num_thresholds": num_thresholds},
outputs={ outputs={
"AUC": [auc_out], "AUC": [auc_out],
"TPOut": [tp], "BatchAUC": [batch_auc_out],
"TNOut": [tn], "StatPosOut": [stat_pos],
"FPOut": [fp], "StatNegOut": [stat_neg]
"FNOut": [fn]
}) })
return auc_out, [tp, tn, fp, fn] return auc_out, batch_auc_out, [stat_pos, stat_neg]
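The auc layer now returns the accumulated AUC, the current batch's AUC, and the two persistable histogram variables instead of the old [tp, tn, fp, fn] list. A usage sketch under the new signature (network and shapes are illustrative assumptions, not from this diff):

import paddle.fluid as fluid

data = fluid.layers.data(name='data', shape=[32], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
predict = fluid.layers.fc(input=data, size=2, act='softmax')

# New return signature: global AUC, per-batch AUC, and the stat variables
# that persist across batches.
auc_out, batch_auc_out, [stat_pos, stat_neg] = fluid.layers.auc(
    input=predict, label=label, num_thresholds=2**12 - 1)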
...@@ -3546,11 +3546,6 @@ def topk(input, k, name=None): ...@@ -3546,11 +3546,6 @@ def topk(input, k, name=None):
top5_values, top5_indices = layers.topk(input, k=5) top5_values, top5_indices = layers.topk(input, k=5)
""" """
shape = input.shape
if k < 1 or k >= shape[-1]:
raise ValueError("k must be greater than 0 and less than %d." %
(shape[-1]))
helper = LayerHelper("top_k", **locals()) helper = LayerHelper("top_k", **locals())
values = helper.create_tmp_variable(dtype=input.dtype) values = helper.create_tmp_variable(dtype=input.dtype)
indices = helper.create_tmp_variable(dtype="int64") indices = helper.create_tmp_variable(dtype="int64")
...@@ -4030,10 +4025,12 @@ def transpose(x, perm, name=None): ...@@ -4030,10 +4025,12 @@ def transpose(x, perm, name=None):
helper = LayerHelper('transpose', **locals()) helper = LayerHelper('transpose', **locals())
out = helper.create_tmp_variable(x.dtype) out = helper.create_tmp_variable(x.dtype)
x_shape = helper.create_tmp_variable(x.dtype)
helper.append_op( helper.append_op(
type='transpose', type='transpose2',
inputs={'X': [x]}, inputs={'X': [x]},
outputs={'Out': [out]}, outputs={'Out': [out],
'XShape': [x_shape]},
attrs={'axis': perm}) attrs={'axis': perm})
return out return out
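transpose is the first of several layers in this change (reshape, squeeze, unsqueeze, flatten) whose underlying op is renamed to a *2 variant that also emits an XShape output recording the input's shape for the grad op; the Python-level API is unchanged. A caller-side sketch (shapes illustrative):

import paddle.fluid as fluid

x = fluid.layers.data(
    name='x', shape=[5, 10, 15], dtype='float32', append_batch_size=False)
# Same layer call as before; internally this now appends a transpose2 op
# with an extra XShape output consumed only by the backward pass.
x_t = fluid.layers.transpose(x, perm=[1, 0, 2])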
...@@ -4503,7 +4500,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4503,7 +4500,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
""" """
if not (isinstance(shape, list) or isinstance(shape, tuple)): if not (isinstance(shape, list) or isinstance(shape, tuple)):
raise ValueError("Input shape must be a python lsit or tuple.") raise ValueError("Input shape must be a python list or tuple.")
inputs = {"X": x} inputs = {"X": x}
if isinstance(actual_shape, Variable): if isinstance(actual_shape, Variable):
inputs["Shape"] = actual_shape inputs["Shape"] = actual_shape
...@@ -4525,13 +4522,15 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4525,13 +4522,15 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
"Each dimension size given in shape must not be negtive " "Each dimension size given in shape must not be negtive "
"except one unknown dimension.") "except one unknown dimension.")
helper = LayerHelper("reshape", **locals()) helper = LayerHelper("reshape2", **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_tmp_variable(dtype=x.dtype)
x_shape = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op( helper.append_op(
type="reshape", type="reshape2",
inputs=inputs, inputs=inputs,
attrs={"shape": shape}, attrs={"shape": shape},
outputs={"Out": out}) outputs={"Out": out,
"XShape": x_shape})
return helper.append_activation(out) return helper.append_activation(out)
...@@ -4575,11 +4574,13 @@ def squeeze(input, axes, name=None): ...@@ -4575,11 +4574,13 @@ def squeeze(input, axes, name=None):
""" """
helper = LayerHelper("squeeze", **locals()) helper = LayerHelper("squeeze", **locals())
out = helper.create_tmp_variable(dtype=input.dtype) out = helper.create_tmp_variable(dtype=input.dtype)
x_shape = helper.create_tmp_variable(dtype=input.dtype)
helper.append_op( helper.append_op(
type="squeeze", type="squeeze2",
inputs={"X": input}, inputs={"X": input},
attrs={"axes": axes}, attrs={"axes": axes},
outputs={"Out": out}) outputs={"Out": out,
"XShape": x_shape})
return out return out
...@@ -4610,11 +4611,13 @@ def unsqueeze(input, axes, name=None): ...@@ -4610,11 +4611,13 @@ def unsqueeze(input, axes, name=None):
""" """
helper = LayerHelper("unsqueeze", **locals()) helper = LayerHelper("unsqueeze", **locals())
out = helper.create_tmp_variable(dtype=input.dtype) out = helper.create_tmp_variable(dtype=input.dtype)
x_shape = helper.create_tmp_variable(dtype=input.dtype)
helper.append_op( helper.append_op(
type="unsqueeze", type="unsqueeze2",
inputs={"X": input}, inputs={"X": input},
attrs={"axes": axes}, attrs={"axes": axes},
outputs={"Out": out}) outputs={"Out": out,
"XShape": x_shape})
return out return out
...@@ -5816,10 +5819,12 @@ def flatten(x, axis=1, name=None): ...@@ -5816,10 +5819,12 @@ def flatten(x, axis=1, name=None):
raise ValueError("The axis should be a int, and in range [0, rank(x)]") raise ValueError("The axis should be a int, and in range [0, rank(x)]")
out = helper.create_tmp_variable(x.dtype) out = helper.create_tmp_variable(x.dtype)
x_shape = helper.create_tmp_variable(x.dtype)
helper.append_op( helper.append_op(
type='flatten', type='flatten2',
inputs={"X": x}, inputs={"X": x},
outputs={'Out': out}, outputs={'Out': out,
'XShape': x_shape},
attrs={"axis": axis}) attrs={"axis": axis})
return out return out
......
...@@ -558,8 +558,6 @@ class Auc(MetricBase): ...@@ -558,8 +558,6 @@ class Auc(MetricBase):
name: metric name name: metric name
curve: Specifies the name of the curve to be computed, 'ROC' [default] or curve: Specifies the name of the curve to be computed, 'ROC' [default] or
'PR' for the Precision-Recall-curve. 'PR' for the Precision-Recall-curve.
num_thresholds: The number of thresholds to use when discretizing the roc
curve.
"NOTE: only implement the ROC curve type via Python now." "NOTE: only implement the ROC curve type via Python now."
...@@ -574,15 +572,14 @@ class Auc(MetricBase): ...@@ -574,15 +572,14 @@ class Auc(MetricBase):
numpy_auc = metric.eval() numpy_auc = metric.eval()
""" """
def __init__(self, name, curve='ROC', num_thresholds=200): def __init__(self, name, curve='ROC', num_thresholds=4095):
super(Auc, self).__init__(name=name) super(Auc, self).__init__(name=name)
self._curve = curve self._curve = curve
self._num_thresholds = num_thresholds self._num_thresholds = num_thresholds
self._epsilon = 1e-6
self.tp_list = np.zeros((num_thresholds, )) _num_pred_buckets = num_thresholds + 1
self.fn_list = np.zeros((num_thresholds, )) self._stat_pos = [0] * _num_pred_buckets
self.tn_list = np.zeros((num_thresholds, )) self._stat_neg = [0] * _num_pred_buckets
self.fp_list = np.zeros((num_thresholds, ))
def update(self, preds, labels): def update(self, preds, labels):
if not _is_numpy_(labels): if not _is_numpy_(labels):
...@@ -590,41 +587,32 @@ class Auc(MetricBase): ...@@ -590,41 +587,32 @@ class Auc(MetricBase):
if not _is_numpy_(preds): if not _is_numpy_(preds):
raise ValueError("The 'predictions' must be a numpy ndarray.") raise ValueError("The 'predictions' must be a numpy ndarray.")
kepsilon = 1e-7 # to account for floating point imprecisions for i, lbl in enumerate(labels):
thresholds = [(i + 1) * 1.0 / (self._num_thresholds - 1) value = preds[i, 1]
for i in range(self._num_thresholds - 2)] bin_idx = int(value * self._num_thresholds)
thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] assert bin_idx <= self._num_thresholds
if lbl:
# calculate TP, FN, TN, FP count self._stat_pos[bin_idx] += 1.0
for idx_thresh, thresh in enumerate(thresholds): else:
tp, fn, tn, fp = 0, 0, 0, 0 self._stat_neg[bin_idx] += 1.0
for i, lbl in enumerate(labels):
if lbl: @staticmethod
if preds[i, 1] >= thresh: def trapezoid_area(x1, x2, y1, y2):
tp += 1 return abs(x1 - x2) * (y1 + y2) / 2.0
else:
fn += 1
else:
if preds[i, 1] >= thresh:
fp += 1
else:
tn += 1
self.tp_list[idx_thresh] += tp
self.fn_list[idx_thresh] += fn
self.tn_list[idx_thresh] += tn
self.fp_list[idx_thresh] += fp
def eval(self): def eval(self):
epsilon = self._epsilon tot_pos = 0.0
num_thresholds = self._num_thresholds tot_neg = 0.0
tpr = (self.tp_list.astype("float32") + epsilon) / ( auc = 0.0
self.tp_list + self.fn_list + epsilon)
fpr = self.fp_list.astype("float32") / ( idx = self._num_thresholds
self.fp_list + self.tn_list + epsilon) while idx >= 0:
rec = (self.tp_list.astype("float32") + epsilon) / ( tot_pos_prev = tot_pos
self.tp_list + self.fp_list + epsilon) tot_neg_prev = tot_neg
tot_pos += self._stat_pos[idx]
x = fpr[:num_thresholds - 1] - fpr[1:] tot_neg += self._stat_neg[idx]
y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0 auc += self.trapezoid_area(tot_neg, tot_neg_prev, tot_pos,
auc_value = np.sum(x * y) tot_pos_prev)
return auc_value idx -= 1
return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0
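The rewritten metric replaces per-threshold TP/FP counting (O(num_thresholds x batch) per update) with a histogram: each score falls into one of num_thresholds + 1 buckets, and eval makes a single sweep over the buckets, summing trapezoid areas under the ROC curve. A self-contained NumPy sketch of the same computation (standalone illustration, not Paddle code):

import numpy as np

def bucketed_auc(preds, labels, num_thresholds=4095):
    # preds[i] is P(label == 1), matching the diff's use of preds[i, 1].
    stat_pos = np.zeros(num_thresholds + 1)
    stat_neg = np.zeros(num_thresholds + 1)
    for p, lbl in zip(preds, labels):
        bin_idx = int(p * num_thresholds)
        if lbl:
            stat_pos[bin_idx] += 1
        else:
            stat_neg[bin_idx] += 1
    # Sweep from the highest bucket down, accumulating TP/FP counts and
    # summing trapezoids (x axis: negatives seen, y axis: positives seen).
    tot_pos = tot_neg = auc = 0.0
    for idx in range(num_thresholds, -1, -1):
        pos_prev, neg_prev = tot_pos, tot_neg
        tot_pos += stat_pos[idx]
        tot_neg += stat_neg[idx]
        auc += abs(tot_neg - neg_prev) * (tot_pos + pos_prev) / 2.0
    return auc / tot_pos / tot_neg if tot_pos and tot_neg else 0.0

print(bucketed_auc(np.random.rand(128), np.random.randint(0, 2, 128)))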
...@@ -897,7 +897,20 @@ class RMSPropOptimizer(Optimizer): ...@@ -897,7 +897,20 @@ class RMSPropOptimizer(Optimizer):
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{v(w,t) + v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
\\epsilon}} \\nabla Q_{i}(w)
w & = w - v(w, t)
if centered is True:
.. math::
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w)
v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 +
\\epsilon}} \\nabla Q_{i}(w) \\epsilon}} \\nabla Q_{i}(w)
w & = w - v(w, t) w & = w - v(w, t)
...@@ -915,6 +928,10 @@ class RMSPropOptimizer(Optimizer): ...@@ -915,6 +928,10 @@ class RMSPropOptimizer(Optimizer):
avoid division by zero, set 1e-6 by default. avoid division by zero, set 1e-6 by default.
momentum(float): :math:`\\beta` in equation is the momentum term, momentum(float): :math:`\\beta` in equation is the momentum term,
set 0.0 by default. set 0.0 by default.
centered(bool): If True, gradients are normalized by the estimated variance of
the gradient; if False, by the uncentered second moment. Setting this to
True may help with training, but is slightly more expensive in terms of
computation and memory. Defaults to False.
Raises: Raises:
ValueError: If learning_rate, rho, epsilon, momentum are None. ValueError: If learning_rate, rho, epsilon, momentum are None.
...@@ -928,12 +945,14 @@ class RMSPropOptimizer(Optimizer): ...@@ -928,12 +945,14 @@ class RMSPropOptimizer(Optimizer):
_momentum_acc_str = "momentum" _momentum_acc_str = "momentum"
_mean_square_acc_str = "mean_square" _mean_square_acc_str = "mean_square"
_mean_grad_acc_str = "mean_grad"
def __init__(self, def __init__(self,
learning_rate, learning_rate,
rho=0.95, rho=0.95,
epsilon=1.0e-6, epsilon=1.0e-6,
momentum=0.0, momentum=0.0,
centered=False,
**kwargs): **kwargs):
super(RMSPropOptimizer, self).__init__( super(RMSPropOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs) learning_rate=learning_rate, **kwargs)
...@@ -950,6 +969,7 @@ class RMSPropOptimizer(Optimizer): ...@@ -950,6 +969,7 @@ class RMSPropOptimizer(Optimizer):
self._rho = rho self._rho = rho
self._epsilon = epsilon self._epsilon = epsilon
self._momentum = momentum self._momentum = momentum
self._centered = centered
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
if not isinstance(block, framework.Block): if not isinstance(block, framework.Block):
...@@ -958,6 +978,7 @@ class RMSPropOptimizer(Optimizer): ...@@ -958,6 +978,7 @@ class RMSPropOptimizer(Optimizer):
for p in parameters: for p in parameters:
self._add_accumulator(self._momentum_acc_str, p) self._add_accumulator(self._momentum_acc_str, p)
self._add_accumulator(self._mean_square_acc_str, p) self._add_accumulator(self._mean_square_acc_str, p)
self._add_accumulator(self._mean_grad_acc_str, p)
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
if not isinstance(block, framework.Block): if not isinstance(block, framework.Block):
...@@ -967,6 +988,8 @@ class RMSPropOptimizer(Optimizer): ...@@ -967,6 +988,8 @@ class RMSPropOptimizer(Optimizer):
param_and_grad[0]) param_and_grad[0])
mean_square_acc = self._get_accumulator(self._mean_square_acc_str, mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
param_and_grad[0]) param_and_grad[0])
mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str,
param_and_grad[0])
rmsprop_op = block.append_op( rmsprop_op = block.append_op(
type=self.type, type=self.type,
inputs={ inputs={
...@@ -974,17 +997,20 @@ class RMSPropOptimizer(Optimizer): ...@@ -974,17 +997,20 @@ class RMSPropOptimizer(Optimizer):
"Grad": param_and_grad[1], "Grad": param_and_grad[1],
"Moment": momentum_acc, "Moment": momentum_acc,
"MeanSquare": mean_square_acc, "MeanSquare": mean_square_acc,
"MeanGrad": mean_grad_acc,
"LearningRate": self._create_param_lr(param_and_grad), "LearningRate": self._create_param_lr(param_and_grad),
}, },
outputs={ outputs={
"ParamOut": param_and_grad[0], "ParamOut": param_and_grad[0],
"MomentOut": momentum_acc, "MomentOut": momentum_acc,
"MeanSquareOut": mean_square_acc "MeanSquareOut": mean_square_acc,
"MeanGradOut": mean_grad_acc
}, },
attrs={ attrs={
"epsilon": self._epsilon, "epsilon": self._epsilon,
"decay": self._rho, "decay": self._rho,
"momentum": self._momentum "momentum": self._momentum,
"centered": self._centered
}) })
return rmsprop_op return rmsprop_op
......
...@@ -43,8 +43,9 @@ class ParallelExecutor(object): ...@@ -43,8 +43,9 @@ class ParallelExecutor(object):
num_trainers(int): If greater than 1, NCCL will be initialized with num_trainers(int): If greater than 1, NCCL will be initialized with
multiple ranks of nodes; each node should have the same number of GPUs. multiple ranks of nodes; each node should have the same number of GPUs.
Distributed training will then be enabled. Default 1. Distributed training will then be enabled. Default 1.
trainer_id(int: Must use together with num_trainers. trainer_id is the trainer_id(int): Must use together with num_trainers. trainer_id is the
"rank" of current node starts from 0. Default 0. "rank" of current node starts from 0. Default 0.
scope(Scope): scope to run with; defaults to fluid.global_scope().
Returns: Returns:
ParallelExecutor: The initialized ParallelExecutor object. ParallelExecutor: The initialized ParallelExecutor object.
...@@ -73,6 +74,7 @@ class ParallelExecutor(object): ...@@ -73,6 +74,7 @@ class ParallelExecutor(object):
build_strategy=None, build_strategy=None,
num_trainers=1, num_trainers=1,
trainer_id=0, trainer_id=0,
scope=None,
**kwargs): **kwargs):
if len(kwargs) != 0: if len(kwargs) != 0:
err_msg = "" err_msg = ""
...@@ -131,7 +133,8 @@ class ParallelExecutor(object): ...@@ -131,7 +133,8 @@ class ParallelExecutor(object):
main = main_program main = main_program
main = main if main else framework.default_main_program() main = main if main else framework.default_main_program()
scope = executor.global_scope() if scope is None:
scope = executor.global_scope()
# FIXME(Yancey1989): it's a temporary approach to determinate the distribute # FIXME(Yancey1989): it's a temporary approach to determinate the distribute
# train program, call self.bcast_param() at the end of each mini-batch. # train program, call self.bcast_param() at the end of each mini-batch.
self.is_dist = True if "recv" in [ self.is_dist = True if "recv" in [
......
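With the new scope argument, a ParallelExecutor's variables can live in a caller-supplied Scope instead of fluid.global_scope(). A hedged sketch (assumes avg_loss is the loss variable of an already-built main program):

import paddle.fluid as fluid

my_scope = fluid.core.Scope()
with fluid.scope_guard(my_scope):
    pe = fluid.ParallelExecutor(use_cuda=False,
                                loss_name=avg_loss.name,  # assumed to exist
                                scope=my_scope)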
...@@ -47,14 +47,14 @@ def train_program(): ...@@ -47,14 +47,14 @@ def train_program():
loss = fluid.layers.square_error_cost(input=y_predict, label=y) loss = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_loss = fluid.layers.mean(loss) avg_loss = fluid.layers.mean(loss)
return avg_loss return [avg_loss, y_predict]
def optimizer_func(): def optimizer_func():
return fluid.optimizer.SGD(learning_rate=0.001) return fluid.optimizer.SGD(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname): def train(use_cuda, train_program, params_dirname, inference_model_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer( trainer = fluid.Trainer(
...@@ -74,6 +74,8 @@ def train(use_cuda, train_program, params_dirname): ...@@ -74,6 +74,8 @@ def train(use_cuda, train_program, params_dirname):
''' '''
if params_dirname is not None: if params_dirname is not None:
trainer.save_params(params_dirname) trainer.save_params(params_dirname)
trainer.save_inference_model(inference_model_dirname,
['x'], [1])
trainer.stop() trainer.stop()
trainer.train( trainer.train(
...@@ -99,15 +101,55 @@ def infer(use_cuda, inference_program, params_dirname=None): ...@@ -99,15 +101,55 @@ def infer(use_cuda, inference_program, params_dirname=None):
print("infer results: ", results[0]) print("infer results: ", results[0])
def infer_by_saved_model(use_cuda, save_dirname=None):
if save_dirname is None:
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
inference_scope = fluid.core.Scope()
with fluid.scope_guard(inference_scope):
# Use fluid.io.load_inference_model to obtain the inference program desc,
# the feed_target_names (the names of variables that will be fed
# data using feed operators), and the fetch_targets (variables that
# we want to obtain data from using fetch operators).
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
# The input should be 2-D, and the size of the second dim should be 13
# The input data should be >= 0
batch_size = 10
test_reader = paddle.batch(
paddle.dataset.uci_housing.test(), batch_size=batch_size)
test_data = next(test_reader())
test_feat = numpy.array(
[data[0] for data in test_data]).astype("float32")
test_label = numpy.array(
[data[1] for data in test_data]).astype("float32")
assert feed_target_names[0] == 'x'
results = exe.run(inference_program,
feed={feed_target_names[0]: numpy.array(test_feat)},
fetch_list=fetch_targets)
print("infer shape: ", results[0].shape)
print("infer results: ", results[0])
print("ground truth: ", test_label)
def main(use_cuda): def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda(): if use_cuda and not fluid.core.is_compiled_with_cuda():
return return
# Directory for saving the trained model # Directory for saving the trained model
params_dirname = "fit_a_line.inference.model" params_dirname = "fit_a_line.model"
inference_model_dirname = "fit_a_line.inference_model"
train(use_cuda, train_program, params_dirname) train(use_cuda, train_program, params_dirname, inference_model_dirname)
infer(use_cuda, inference_program, params_dirname) infer(use_cuda, inference_program, params_dirname)
infer_by_saved_model(use_cuda, inference_model_dirname)
class TestFitALine(unittest.TestCase): class TestFitALine(unittest.TestCase):
......
...@@ -18,6 +18,7 @@ import paddle ...@@ -18,6 +18,7 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import numpy import numpy
import six
import os import os
import cifar10_small_test_set import cifar10_small_test_set
...@@ -177,4 +178,7 @@ if __name__ == '__main__': ...@@ -177,4 +178,7 @@ if __name__ == '__main__':
for parallel in (False, True): for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
continue continue
main(use_cuda=use_cuda, parallel=parallel) # TODO(minqiyang): remove this line after fixing the deletion
# order problem of Scope in ParallelExecutor in manylinux
if six.PY2:
main(use_cuda=use_cuda, parallel=parallel)
...@@ -18,6 +18,7 @@ import paddle ...@@ -18,6 +18,7 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import numpy import numpy
import six
import os import os
import cifar10_small_test_set import cifar10_small_test_set
...@@ -151,4 +152,7 @@ if __name__ == '__main__': ...@@ -151,4 +152,7 @@ if __name__ == '__main__':
for parallel in (False, True): for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
continue continue
main(use_cuda=use_cuda, parallel=parallel) # TODO(minqiyang): remove this line after fixing the deletion
# order problem of Scope in ParallelExecutor in manylinux
if six.PY2:
main(use_cuda=use_cuda, parallel=parallel)
...@@ -18,6 +18,7 @@ import argparse ...@@ -18,6 +18,7 @@ import argparse
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle import paddle
import six
import sys import sys
import numpy import numpy
import unittest import unittest
...@@ -154,4 +155,7 @@ if __name__ == '__main__': ...@@ -154,4 +155,7 @@ if __name__ == '__main__':
for parallel in (False, True): for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
continue continue
main(use_cuda=use_cuda, parallel=parallel) # TODO(minqiyang): remove this line after fixing the deletion
# order problem of Scope in ParallelExecutor in manylinux
if six.PY2:
main(use_cuda=use_cuda, parallel=parallel)
...@@ -18,6 +18,7 @@ import argparse ...@@ -18,6 +18,7 @@ import argparse
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle import paddle
import six
import sys import sys
import numpy import numpy
import unittest import unittest
...@@ -136,4 +137,7 @@ if __name__ == '__main__': ...@@ -136,4 +137,7 @@ if __name__ == '__main__':
for parallel in (False, True): for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
continue continue
main(use_cuda=use_cuda, parallel=parallel) # TODO(minqiyang): remove this line after fixing the deletion
# order problem of Scope in ParallelExecutor in manylinux
if six.PY2:
main(use_cuda=use_cuda, parallel=parallel)
...@@ -36,6 +36,7 @@ import paddle.fluid as fluid ...@@ -36,6 +36,7 @@ import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
from paddle.fluid import core from paddle.fluid import core
from test_dist_base import TestDistRunnerBase, runtime_main from test_dist_base import TestDistRunnerBase, runtime_main
import paddle.compat as cpt
from paddle.compat import long_type from paddle.compat import long_type
import hashlib import hashlib
...@@ -315,8 +316,9 @@ def pad_batch_data(insts, ...@@ -315,8 +316,9 @@ def pad_batch_data(insts,
""" """
return_list = [] return_list = []
max_len = max(len(inst) for inst in insts) max_len = max(len(inst) for inst in insts)
num_token = reduce(lambda x, y: x + y, num_token = six.moves.reduce(
[len(inst) for inst in insts]) if return_num_token else 0 lambda x, y: x + y,
[len(inst) for inst in insts]) if return_num_token else 0
# Any token included in dict can be used to pad, since the paddings' loss # Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and make no effect on parameter gradients. # will be masked out by weights and make no effect on parameter gradients.
inst_data = np.array( inst_data = np.array(
...@@ -328,7 +330,7 @@ def pad_batch_data(insts, ...@@ -328,7 +330,7 @@ def pad_batch_data(insts,
return_list += [inst_weight.astype("float32").reshape([-1, 1])] return_list += [inst_weight.astype("float32").reshape([-1, 1])]
else: # position data else: # position data
inst_pos = np.array([ inst_pos = np.array([
range(1, len(inst) + 1) + [0] * (max_len - len(inst)) list(range(1, len(inst) + 1)) + [0] * (max_len - len(inst))
for inst in insts for inst in insts
]) ])
return_list += [inst_pos.astype("int64").reshape([-1, 1])] return_list += [inst_pos.astype("int64").reshape([-1, 1])]
...@@ -385,10 +387,11 @@ def prepare_batch_input(insts, data_input_names, src_pad_idx, trg_pad_idx, ...@@ -385,10 +387,11 @@ def prepare_batch_input(insts, data_input_names, src_pad_idx, trg_pad_idx,
return_num_token=True) return_num_token=True)
data_input_dict = dict( data_input_dict = dict(
zip(data_input_names, [ list(
src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, zip(data_input_names, [
trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
])) trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
])))
return data_input_dict, np.asarray([num_token], dtype="float32") return data_input_dict, np.asarray([num_token], dtype="float32")
...@@ -561,7 +564,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, ...@@ -561,7 +564,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
np.log(TrainTaskConfig.label_smooth_eps / ( np.log(TrainTaskConfig.label_smooth_eps / (
ModelHyperParams.trg_vocab_size - 1) + 1e-20)) ModelHyperParams.trg_vocab_size - 1) + 1e-20))
init = False init = False
for pass_id in xrange(TrainTaskConfig.pass_num): for pass_id in six.moves.xrange(TrainTaskConfig.pass_num):
pass_start_time = time.time() pass_start_time = time.time()
for batch_id, data in enumerate(train_data()): for batch_id, data in enumerate(train_data()):
if batch_id >= 5: if batch_id >= 5:
...@@ -587,11 +590,11 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, ...@@ -587,11 +590,11 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
ModelHyperParams.eos_idx, ModelHyperParams.n_head, ModelHyperParams.eos_idx, ModelHyperParams.n_head,
ModelHyperParams.d_model) ModelHyperParams.d_model)
total_num_token += num_token total_num_token += num_token
feed_kv_pairs = data_input_dict.items() feed_kv_pairs = list(data_input_dict.items())
if TrainTaskConfig.local: if TrainTaskConfig.local:
feed_kv_pairs += { feed_kv_pairs += list({
lr_scheduler.learning_rate.name: lr_rate lr_scheduler.learning_rate.name: lr_rate
}.items() }.items())
feed_list.append(dict(feed_kv_pairs)) feed_list.append(dict(feed_kv_pairs))
if not init: if not init:
...@@ -873,6 +876,7 @@ class DataReader(object): ...@@ -873,6 +876,7 @@ class DataReader(object):
f = tarfile.open(fpaths[0], "r") f = tarfile.open(fpaths[0], "r")
for line in f.extractfile(tar_fname): for line in f.extractfile(tar_fname):
line = cpt.to_text(line)
fields = line.strip("\n").split(self._field_delimiter) fields = line.strip("\n").split(self._field_delimiter)
if (not self._only_src and len(fields) == 2) or ( if (not self._only_src and len(fields) == 2) or (
self._only_src and len(fields) == 1): self._only_src and len(fields) == 1):
...@@ -882,8 +886,9 @@ class DataReader(object): ...@@ -882,8 +886,9 @@ class DataReader(object):
if not os.path.isfile(fpath): if not os.path.isfile(fpath):
raise IOError("Invalid file: %s" % fpath) raise IOError("Invalid file: %s" % fpath)
with open(fpath, "r") as f: with open(fpath, "rb") as f:
for line in f: for line in f:
line = cpt.to_text(line)
fields = line.strip("\n").split(self._field_delimiter) fields = line.strip("\n").split(self._field_delimiter)
if (not self._only_src and len(fields) == 2) or ( if (not self._only_src and len(fields) == 2) or (
self._only_src and len(fields) == 1): self._only_src and len(fields) == 1):
...@@ -892,8 +897,9 @@ class DataReader(object): ...@@ -892,8 +897,9 @@ class DataReader(object):
@staticmethod @staticmethod
def load_dict(dict_path, reverse=False): def load_dict(dict_path, reverse=False):
word_dict = {} word_dict = {}
with open(dict_path, "r") as fdict: with open(dict_path, "rb") as fdict:
for idx, line in enumerate(fdict): for idx, line in enumerate(fdict):
line = cpt.to_text(line)
if reverse: if reverse:
word_dict[idx] = line.strip("\n") word_dict[idx] = line.strip("\n")
else: else:
...@@ -1034,7 +1040,7 @@ def multi_head_attention(queries, ...@@ -1034,7 +1040,7 @@ def multi_head_attention(queries,
# size of the input as the output dimension size. # size of the input as the output dimension size.
return layers.reshape( return layers.reshape(
x=trans_x, x=trans_x,
shape=map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]])) shape=list(map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]])))
def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate): def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
""" """
......
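The transformer-test edits all follow one Python 2/3 compatibility pattern: builtins that returned lists in Python 2 return iterators or views in Python 3, so they are wrapped in list() or replaced with six.moves equivalents. A condensed illustration:

import six

d = {'a': 1, 'b': 2}
pairs = list(d.items())              # .items() is a view in Python 3
pairs += list({'c': 3}.items())      # so += needs an explicit list()
pos = list(range(1, 4)) + [0] * 2    # range() no longer returns a list
total = six.moves.reduce(lambda x, y: x + y, [1, 2, 3])  # reduce moved
for i in six.moves.xrange(3):        # xrange is gone in Python 3
    pass
shape = list(map(int, [0, 0, 6]))    # map() returns an iterator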
...@@ -249,7 +249,7 @@ class OpTest(unittest.TestCase): ...@@ -249,7 +249,7 @@ class OpTest(unittest.TestCase):
outs, _ = self._calc_output(place) outs, _ = self._calc_output(place)
return outs return outs
def _calc_output(self, place, parallel=False): def _calc_output(self, place, parallel=False, no_check_set=None):
program = Program() program = Program()
block = program.global_block() block = program.global_block()
...@@ -273,6 +273,8 @@ class OpTest(unittest.TestCase): ...@@ -273,6 +273,8 @@ class OpTest(unittest.TestCase):
# if not, fill the fetch_list with the user-configured outputs in the test. # if not, fill the fetch_list with the user-configured outputs in the test.
if len(fetch_list) == 0: if len(fetch_list) == 0:
for var_name, var in six.iteritems(outputs): for var_name, var in six.iteritems(outputs):
if no_check_set is not None and var_name in no_check_set:
continue
if isinstance(var, list): if isinstance(var, list):
for v in var: for v in var:
fetch_list.append(v) fetch_list.append(v)
...@@ -291,11 +293,17 @@ class OpTest(unittest.TestCase): ...@@ -291,11 +293,17 @@ class OpTest(unittest.TestCase):
return_numpy=False) return_numpy=False)
return outs, fetch_list return outs, fetch_list
def check_output_with_place(self, place, atol): def check_output_with_place(self,
outs, fetch_list = self._calc_output(place) place,
atol,
no_check_set=None,
equal_nan=False):
outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
for out_name, out_dup in Operator.get_op_outputs(self.op_type): for out_name, out_dup in Operator.get_op_outputs(self.op_type):
if out_name not in self.outputs: if out_name not in self.outputs:
continue continue
if no_check_set is not None and out_name in no_check_set:
continue
def find_actual(target_name, fetch_list): def find_actual(target_name, fetch_list):
found = [ found = [
...@@ -321,7 +329,7 @@ class OpTest(unittest.TestCase): ...@@ -321,7 +329,7 @@ class OpTest(unittest.TestCase):
if isinstance(expect, tuple) else expect if isinstance(expect, tuple) else expect
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
actual_t, expect_t, atol=atol), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + sub_out_name + ") has diff at " + "Output (" + sub_out_name + ") has diff at " +
str(place)) str(place))
if isinstance(expect, tuple): if isinstance(expect, tuple):
...@@ -337,7 +345,7 @@ class OpTest(unittest.TestCase): ...@@ -337,7 +345,7 @@ class OpTest(unittest.TestCase):
expect_t = expect[0] if isinstance(expect, tuple) else expect expect_t = expect[0] if isinstance(expect, tuple) else expect
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
actual_t, expect_t, atol=atol), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + out_name + ") has diff at " + str(place) + "Output (" + out_name + ") has diff at " + str(place) +
"\nExpect " + str(expect_t) + "\n" + "But Got" + "\nExpect " + str(expect_t) + "\n" + "But Got" +
str(actual_t)) str(actual_t))
...@@ -360,10 +368,10 @@ class OpTest(unittest.TestCase): ...@@ -360,10 +368,10 @@ class OpTest(unittest.TestCase):
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
return places return places
def check_output(self, atol=1e-5): def check_output(self, atol=1e-5, no_check_set=None, equal_nan=False):
places = self._get_places() places = self._get_places()
for place in places: for place in places:
self.check_output_with_place(place, atol) self.check_output_with_place(place, atol, no_check_set, equal_nan)
def check_output_customized(self, checker): def check_output_customized(self, checker):
places = self._get_places() places = self._get_places()
......
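The new no_check_set argument lets a test skip outputs whose values are placeholders (such as the XShape outputs introduced above), and equal_nan is forwarded to np.allclose. A condensed illustration of the pattern the *2 op tests below follow (class name hypothetical):

import numpy as np
from op_test import OpTest

class TestReshape2Sketch(OpTest):  # illustrative, mirrors the tests below
    def setUp(self):
        self.op_type = "reshape2"
        self.inputs = {"X": np.random.random((2, 25)).astype("float32")}
        self.attrs = {"shape": [5, 10]}
        self.outputs = {
            "Out": self.inputs["X"].reshape((5, 10)),
            # XShape is only consumed by the grad op, so its value is a dummy.
            "XShape": np.random.random((2, 25)).astype("float32"),
        }

    def test_check_output(self):
        self.check_output(no_check_set=["XShape"])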
...@@ -26,18 +26,15 @@ class TestAucOp(OpTest): ...@@ -26,18 +26,15 @@ class TestAucOp(OpTest):
pred = np.random.random((128, 2)).astype("float32") pred = np.random.random((128, 2)).astype("float32")
labels = np.random.randint(0, 2, (128, 1)) labels = np.random.randint(0, 2, (128, 1))
num_thresholds = 200 num_thresholds = 200
tp = np.zeros((num_thresholds, )).astype("int64")
tn = np.zeros((num_thresholds, )).astype("int64") stat_pos = np.zeros((num_thresholds + 1, )).astype("int64")
fp = np.zeros((num_thresholds, )).astype("int64") stat_neg = np.zeros((num_thresholds + 1, )).astype("int64")
fn = np.zeros((num_thresholds, )).astype("int64")
self.inputs = { self.inputs = {
'Predict': pred, 'Predict': pred,
'Label': labels, 'Label': labels,
'TP': tp, "StatPos": stat_pos,
'TN': tn, "StatNeg": stat_neg
'FP': fp,
'FN': fn
} }
self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds} self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds}
...@@ -47,11 +44,10 @@ class TestAucOp(OpTest): ...@@ -47,11 +44,10 @@ class TestAucOp(OpTest):
python_auc.update(pred, labels) python_auc.update(pred, labels)
self.outputs = { self.outputs = {
'AUC': python_auc.eval(), 'AUC': np.array(python_auc.eval()),
'TPOut': python_auc.tp_list, 'BatchAUC': np.array(python_auc.eval()),
'FNOut': python_auc.fn_list, 'StatPosOut': np.array(python_auc._stat_pos),
'TNOut': python_auc.tn_list, 'StatNegOut': np.array(python_auc._stat_neg)
'FPOut': python_auc.fp_list
} }
def test_check_output(self): def test_check_output(self):
......
...@@ -55,6 +55,7 @@ class TestDistRunnerBase(object): ...@@ -55,6 +55,7 @@ class TestDistRunnerBase(object):
pserver_prog = t.get_pserver_program(args.current_endpoint) pserver_prog = t.get_pserver_program(args.current_endpoint)
startup_prog = t.get_startup_program(args.current_endpoint, startup_prog = t.get_startup_program(args.current_endpoint,
pserver_prog) pserver_prog)
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(startup_prog) exe.run(startup_prog)
...@@ -147,6 +148,8 @@ def runtime_main(test_class): ...@@ -147,6 +148,8 @@ def runtime_main(test_class):
import paddle.compat as cpt import paddle.compat as cpt
import socket
from contextlib import closing
class TestDistBase(unittest.TestCase): class TestDistBase(unittest.TestCase):
...@@ -156,13 +159,19 @@ class TestDistBase(unittest.TestCase): ...@@ -156,13 +159,19 @@ class TestDistBase(unittest.TestCase):
def setUp(self): def setUp(self):
self._trainers = 2 self._trainers = 2
self._pservers = 2 self._pservers = 2
self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._python_interp = "python" self._python_interp = "python"
self._sync_mode = True self._sync_mode = True
self._mem_opt = False self._mem_opt = False
self._use_reduce = False self._use_reduce = False
self._setup_config() self._setup_config()
def _find_free_port(self):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(('', 0))
return s.getsockname()[1]
def start_pserver(self, model_file, check_error_log): def start_pserver(self, model_file, check_error_log):
ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist" ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
......
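_find_free_port replaces the hard-coded pserver ports, avoiding collisions when several CI jobs share a machine: binding to port 0 asks the OS for any unused port. The trick in isolation:

import socket
from contextlib import closing

def find_free_port():
    # Binding to port 0 makes the OS pick a currently unused port.
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(('', 0))
        return s.getsockname()[1]

print(find_free_port())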
...@@ -22,14 +22,17 @@ from op_test import OpTest ...@@ -22,14 +22,17 @@ from op_test import OpTest
class TestFlattenOp(OpTest): class TestFlattenOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "flatten" self.op_type = "flatten2"
self.init_test_case() self.init_test_case()
self.inputs = {"X": np.random.random(self.in_shape).astype("float32")} self.inputs = {"X": np.random.random(self.in_shape).astype("float32")}
self.init_attrs() self.init_attrs()
self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} self.outputs = {
"Out": self.inputs["X"].reshape(self.new_shape),
"XShape": np.random.random(self.in_shape).astype("float32")
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output(no_check_set=["XShape"])
def test_check_grad(self): def test_check_grad(self):
self.check_grad(["X"], "Out") self.check_grad(["X"], "Out")
......
...@@ -53,11 +53,11 @@ class TestFusionLSTMOp(OpTest): ...@@ -53,11 +53,11 @@ class TestFusionLSTMOp(OpTest):
self.M = 8 self.M = 8
self.D = 16 self.D = 16
self.has_initial_state = False self.has_initial_state = False
self.use_peepholes = False
self.is_reverse = False self.is_reverse = False
self.act_gate = 'sigmoid' self.act_gate = 'sigmoid'
self.act_cell = 'tanh' self.act_cell = 'tanh'
self.act_cand = 'tanh' self.act_cand = 'tanh'
self.use_peepholes = False
self.set_conf() self.set_conf()
T = sum(self.lod[0]) T = sum(self.lod[0])
...@@ -159,5 +159,36 @@ class TestFusionLSTMOpBS1(TestFusionLSTMOp): ...@@ -159,5 +159,36 @@ class TestFusionLSTMOpBS1(TestFusionLSTMOp):
self.D = 16 self.D = 16
class TestFusionLSTMOpPeepholes(TestFusionLSTMOp):
def set_conf(self):
self.use_peepholes = True
class TestFusionLSTMOpPeepholesInit(TestFusionLSTMOp):
def set_conf(self):
self.use_peepholes = True
self.has_initial_state = True
class TestFusionLSTMOpPeepholesReverse(TestFusionLSTMOp):
def set_conf(self):
self.use_peepholes = True
self.is_reverse = True
class TestFusionLSTMOpPeepholesInitReverse(TestFusionLSTMOp):
def set_conf(self):
self.use_peepholes = True
self.has_initial_state = True
self.is_reverse = True
class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp):
def set_conf(self):
self.use_peepholes = True
self.lod = [[2]]
self.D = 8
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -85,6 +85,7 @@ class TestFetchOp(unittest.TestCase): ...@@ -85,6 +85,7 @@ class TestFetchOp(unittest.TestCase):
assert not math.isnan(np.sum(ret[i])) and \ assert not math.isnan(np.sum(ret[i])) and \
not math.isinf(np.sum(ret[i])) not math.isinf(np.sum(ret[i]))
@unittest.skip(reason="CI timeout")
def test_fetch_op(self): def test_fetch_op(self):
tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16) tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
tst_reader_iter = tst_reader() tst_reader_iter = tst_reader()
...@@ -139,6 +140,7 @@ class TestFeedParallel(unittest.TestCase): ...@@ -139,6 +140,7 @@ class TestFeedParallel(unittest.TestCase):
if batch_id == 2: if batch_id == 2:
break break
@unittest.skip(reason="CI timeout")
def test_feed_op(self): def test_feed_op(self):
os.environ['CPU_NUM'] = str(4) os.environ['CPU_NUM'] = str(4)
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
......
...@@ -16,6 +16,7 @@ from __future__ import print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
import six
from op_test import OpTest from op_test import OpTest
...@@ -62,17 +63,20 @@ class PReluTest(OpTest): ...@@ -62,17 +63,20 @@ class PReluTest(OpTest):
# TODO(minqiyang): Resume these test cases after fixing Python3 CI job issues # TODO(minqiyang): Resume these test cases after fixing Python3 CI job issues
# class TestCase1(PReluTest): if six.PY2:
# def initTestCase(self):
# self.attrs = {'mode': "all"}
# class TestCase2(PReluTest): class TestCase1(PReluTest):
# def initTestCase(self): def initTestCase(self):
# self.attrs = {'mode': "channel"} self.attrs = {'mode': "all"}
class TestCase2(PReluTest):
def initTestCase(self):
self.attrs = {'mode': "channel"}
class TestCase3(PReluTest):
def initTestCase(self):
self.attrs = {'mode': "element"}
# class TestCase3(PReluTest):
# def initTestCase(self):
# self.attrs = {'mode': "element"}
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -22,106 +22,39 @@ from op_test import OpTest ...@@ -22,106 +22,39 @@ from op_test import OpTest
class TestReshapeOp(OpTest): class TestReshapeOp(OpTest):
def setUp(self): def setUp(self):
ori_shape = (2, 25) self.init_data()
new_shape = (5, 10) self.op_type = "reshape2"
self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
self.op_type = "reshape" self.attrs = {"shape": self.new_shape}
self.inputs = {"X": np.random.random(ori_shape).astype("float32")} self.outputs = {
self.attrs = {"shape": new_shape} "Out": self.inputs["X"].reshape(self.infered_shape),
self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} 'XShape': np.random.random(self.ori_shape).astype("float32")
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpDimInfer1(OpTest):
def setUp(self):
ori_shape = (5, 10)
new_shape = (5, -1, 5)
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpDimInfer2(OpTest):
def setUp(self):
ori_shape = (2, 2, 6)
new_shape = (2, 0, 3, -1)
infered_shape = (2, 2, 3, -1)
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpInplace(OpTest):
def setUp(self):
ori_shape = (2, 25)
new_shape = (5, 10)
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpDimInferInplace1(OpTest):
def setUp(self):
ori_shape = (5, 10)
new_shape = (5, -1, 5)
self.op_type = "reshape" def init_data(self):
self.inputs = {"X": np.random.random(ori_shape).astype("float32")} self.ori_shape = (2, 25)
self.attrs = {"shape": new_shape} self.new_shape = (5, 10)
self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} self.infered_shape = (5, 10)
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output(no_check_set=['XShape'])
def test_check_grad(self): def test_check_grad(self):
self.check_grad(["X"], "Out") self.check_grad(["X"], "Out")
class TestReshapeOpDimInferInplace2(OpTest): class TestReshapeOpDimInfer1(TestReshapeOp):
def setUp(self): def init_data(self):
ori_shape = (2, 2, 6) self.ori_shape = (5, 10)
new_shape = (2, 0, 3, -1) self.new_shape = (5, -1, 5)
infered_shape = (2, 2, 3, -1) self.infered_shape = (5, -1, 5)
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
def test_check_output(self):
self.check_output()
def test_check_grad(self): class TestReshapeOpDimInfer2(TestReshapeOp):
self.check_grad(["X"], "Out") def init_data(self):
self.ori_shape = (2, 2, 6)
self.new_shape = (2, 0, 3, -1)
self.infered_shape = (2, 2, 3, -1)
class TestReshapeOpWithInputShape(OpTest): class TestReshapeOpWithInputShape(OpTest):
...@@ -130,20 +63,23 @@ class TestReshapeOpWithInputShape(OpTest): ...@@ -130,20 +63,23 @@ class TestReshapeOpWithInputShape(OpTest):
new_shape = (0, -1, 5) new_shape = (0, -1, 5)
actual_shape = (2, 3, 5) actual_shape = (2, 3, 5)
self.op_type = "reshape" self.op_type = "reshape2"
self.inputs = { self.inputs = {
"X": np.random.random(ori_shape).astype("float32"), "X": np.random.random(ori_shape).astype("float32"),
"Shape": np.array( "Shape": np.array(
actual_shape, dtype="int32") actual_shape, dtype="int32")
} }
self.attrs = {"shape": new_shape} self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(actual_shape)} self.outputs = {
"Out": self.inputs["X"].reshape(actual_shape),
'XShape': np.random.random(ori_shape).astype("float32")
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output(no_check_set=['XShape'])
def test_check_grad(self): def test_check_grad(self):
self.check_grad(["X"], "Out") self.check_grad(["X"], "Out", sum_outputs=["Out"])
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -15,90 +15,164 @@ ...@@ -15,90 +15,164 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest import paddle.fluid.core as core
from paddle.fluid.op import Operator
class TestRmspropOp1(OpTest):
''' Test RMSProp with explicit inputs class TestBase(unittest.TestCase):
''' def setup(self, centered, epsilon=1e-6):
np.random.seed(5) # fix seed
def setUp(self):
self.op_type = "rmsprop" self.param_name = "param"
self.param = np.random.random((123, 321)).astype("float32")
param = np.random.random((123, 321)).astype("float32")
mean_square = np.random.random((123, 321)).astype("float32") self.mean_square_name = "mean_square"
learning_rate = np.array([0.01]).astype("float32") self.mean_square = np.random.random((123, 321)).astype("float32")
grad = np.random.random((123, 321)).astype("float32")
moment = np.zeros((123, 321)).astype("float32") self.mean_grad_name = "mean_grad"
self.mean_grad = np.random.random((123, 321)).astype("float32")
epsilon = 1e-6
decay = 0.9 self.lr_name = "lr"
momentum = 0.0 self.learning_rate = np.array([0.01]).astype("float32")
self.inputs = { self.grad_name = "grad"
'Param': param, self.grad = np.random.random((123, 321)).astype("float32")
'MeanSquare': mean_square,
'LearningRate': learning_rate, self.moment_name = "moment"
'Grad': grad, self.moment = np.zeros((123, 321)).astype("float32")
'Moment': moment,
} self.epsilon = epsilon
self.decay = 0.9
self.attrs = {'epsilon': epsilon, 'decay': decay, 'momentum': momentum} self.momentum = 0.0
self.centered = centered
ms_out = decay * mean_square + (1 - decay) * grad * grad
moment_out = momentum * moment + \ self.ms_out = self.decay * self.mean_square + (1 - self.decay
learning_rate * grad / np.sqrt(ms_out + epsilon) ) * self.grad * self.grad
param_out = param - moment_out if centered:
self.mg_out = self.decay * self.mean_grad + (1 - self.decay
self.outputs = { ) * self.grad
'ParamOut': param_out, self.moment_out = self.momentum * self.moment + \
'MomentOut': moment_out, self.learning_rate * self.grad / np.sqrt(self.ms_out - np.square(self.mg_out) + self.epsilon)
'MeanSquareOut': ms_out else:
} self.moment_out = self.momentum * self.moment + \
self.learning_rate * self.grad / np.sqrt(self.ms_out + self.epsilon)
def test_check_output(self):
self.check_output() self.param_out = self.param - self.moment_out
def check(self,
class TestRmspropOp2(OpTest): actual_t,
'''Test RMSProp with default values for attributes expect_t,
''' place,
out_name,
def setUp(self): atol=1e-5,
self.op_type = "rmsprop" equal_nan=False):
self.assertTrue(
param = np.random.random((123, 321)).astype("float32") np.allclose(
mean_square = np.random.random((123, 321)).astype("float32") actual_t, expect_t, atol=atol, equal_nan=equal_nan),
learning_rate = np.array([0.01]).astype("float32") "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
grad = np.random.random((123, 321)).astype("float32") + str(expect_t) + "\n" + "But Got" + str(actual_t))
moment = np.zeros((123, 321)).astype("float32")
epsilon = 1.0e-10 class TestRmspropOp(TestBase):
decay = 0.9 def check_with_place(self, place, centered, epsilon):
momentum = 0.0 self.setup(centered, epsilon)
scope = core.Scope()
self.inputs = {
'Param': param, # create and initialize Param Variable
'MeanSquare': mean_square, param = scope.var(self.param_name).get_tensor()
'LearningRate': learning_rate, param.set(self.param, place)
'Grad': grad,
'Moment': moment, mean_square = scope.var(self.mean_square_name).get_tensor()
} mean_square.set(self.mean_square, place)
ms_out = decay * mean_square + (1 - decay) * grad * grad lr = scope.var(self.lr_name).get_tensor()
moment_out = momentum * moment + \ lr.set(self.learning_rate, place)
learning_rate * grad / np.sqrt(ms_out + epsilon)
param_out = param - moment_out grad = scope.var(self.grad_name).get_tensor()
grad.set(self.grad, place)
self.outputs = {
'ParamOut': param_out, moment = scope.var(self.moment_name).get_tensor()
'MomentOut': moment_out, moment.set(self.moment, place)
'MeanSquareOut': ms_out
} # create and run sgd operator
def test_check_output(self): if self.centered:
self.check_output() mean_grad = scope.var(self.mean_grad_name).get_tensor()
mean_grad.set(self.mean_grad, place)
rmsprop_op = Operator(
"rmsprop",
Param=self.param_name,
Grad=self.grad_name,
MeanSquare=self.mean_square_name,
MeanGrad=self.mean_grad_name,
Moment=self.moment_name,
LearningRate=self.lr_name,
ParamOut=self.param_name,
MeanSquareOut=self.mean_square_name,
MomentOut=self.moment_name,
MeanGradOut=self.mean_grad_name,
epsilon=self.epsilon,
decay=self.decay,
momentum=self.momentum,
centered=True)
else:
rmsprop_op = Operator(
"rmsprop",
Param=self.param_name,
Grad=self.grad_name,
MeanSquare=self.mean_square_name,
Moment=self.moment_name,
LearningRate=self.lr_name,
ParamOut=self.param_name,
MeanSquareOut=self.mean_square_name,
MomentOut=self.moment_name,
epsilon=self.epsilon,
decay=self.decay,
momentum=self.momentum,
centered=False)
rmsprop_op.run(scope, place)
atol = 1e-5
equal_nan = False
if self.centered:
atol = 1e-3
equal_nan = True
self.check(
np.array(mean_square), self.ms_out, place, self.mean_square_name)
self.check(
np.array(moment),
self.moment_out,
place,
self.moment_name,
atol=atol,
equal_nan=equal_nan)
self.check(
np.array(param),
self.param_out,
place,
self.param_name,
atol=atol,
equal_nan=equal_nan)
if self.centered:
self.check(
np.array(mean_grad), self.mg_out, place, self.mean_grad_name)
def test_rmsprop(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place, False, 1e-6)
self.check_with_place(place, False, 1e-10)
self.check_with_place(place, True, 1e-6)
self.check_with_place(place, True, 1e-10)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -23,14 +23,17 @@ from op_test import OpTest ...@@ -23,14 +23,17 @@ from op_test import OpTest
# Correct: General. # Correct: General.
class TestSqueezeOp(OpTest): class TestSqueezeOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "squeeze" self.op_type = "squeeze2"
self.init_test_case() self.init_test_case()
self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
self.init_attrs() self.init_attrs()
self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} self.outputs = {
"Out": self.inputs["X"].reshape(self.new_shape),
"XShape": np.random.random(self.ori_shape).astype("float32")
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output(no_check_set=['XShape'])
def test_check_grad(self): def test_check_grad(self):
self.check_grad(["X"], "Out") self.check_grad(["X"], "Out")
......
...@@ -22,16 +22,19 @@ from op_test import OpTest ...@@ -22,16 +22,19 @@ from op_test import OpTest
class TestTransposeOp(OpTest): class TestTransposeOp(OpTest):
def setUp(self): def setUp(self):
self.initTestCase() self.initTestCase()
self.op_type = "transpose" self.op_type = "transpose2"
self.inputs = {'X': np.random.random(self.shape).astype("float32")} self.inputs = {'X': np.random.random(self.shape).astype("float32")}
self.attrs = {'axis': list(self.axis)} self.attrs = {'axis': list(self.axis)}
self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} self.outputs = {
'XShape': np.random.random(self.shape).astype("float32"),
'Out': self.inputs['X'].transpose(self.axis)
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output(no_check_set=['XShape'])
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['X'], 'Out') self.check_grad(['X'], 'Out', sum_outputs=['Out'])
def initTestCase(self): def initTestCase(self):
self.shape = (3, 4) self.shape = (3, 4)
......
...@@ -24,13 +24,16 @@ from op_test import OpTest ...@@ -24,13 +24,16 @@ from op_test import OpTest
class TestUnsqueezeOp(OpTest): class TestUnsqueezeOp(OpTest):
def setUp(self): def setUp(self):
self.init_test_case() self.init_test_case()
self.op_type = "unsqueeze" self.op_type = "unsqueeze2"
self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
self.init_attrs() self.init_attrs()
self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} self.outputs = {
"Out": self.inputs["X"].reshape(self.new_shape),
"XShape": np.random.random(self.ori_shape).astype("float32")
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output(no_check_set=["XShape"])
def test_check_grad(self): def test_check_grad(self):
self.check_grad(["X"], "Out") self.check_grad(["X"], "Out")
......
...@@ -431,6 +431,28 @@ class Trainer(object): ...@@ -431,6 +431,28 @@ class Trainer(object):
exe = executor.Executor(self.place) exe = executor.Executor(self.place)
io.save_persistables(exe, dirname=param_path) io.save_persistables(exe, dirname=param_path)
def save_inference_model(self, param_path, feeded_var_names,
target_var_indexes):
"""
Save model for cpp inference into :code:`param_path`.
Args:
param_path(str): The path to save the inference model.
feeded_var_names(list(str)): The names of the vars that you
need to feed before running the program.
target_var_indexes(list(int)): the indexes of the target vars that
you need returned from trainer.train_func.
Returns:
None
"""
with self._prog_and_scope_guard():
exe = executor.Executor(self.place)
target_vars = [
self.train_func_outputs[index] for index in target_var_indexes
]
io.save_inference_model(param_path, feeded_var_names, target_vars,
exe)
@contextlib.contextmanager @contextlib.contextmanager
def _prog_and_scope_guard(self): def _prog_and_scope_guard(self):
with framework.program_guard( with framework.program_guard(
......
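A usage sketch of the new Trainer.save_inference_model, matching the fit_a_line test earlier in this change: train_program returns [avg_loss, y_predict], so index 1 selects the prediction as the fetch target (paths illustrative):

# Assumed: `trainer` was built from a train_func returning [avg_loss, y_predict].
trainer.save_inference_model('fit_a_line.inference_model',
                             feeded_var_names=['x'],
                             target_var_indexes=[1])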
...@@ -153,7 +153,7 @@ def block_to_code(block, block_idx): ...@@ -153,7 +153,7 @@ def block_to_code(block, block_idx):
indent += 1 indent += 1
# sort all vars # sort all vars
all_vars = sorted(block.vars.iteritems(), key=lambda x: x[0]) all_vars = sorted(six.iteritems(block.vars), key=lambda x: x[0])
for var in all_vars: for var in all_vars:
print("{}{}".format(get_indent_space(indent), variable_to_code(var[1]))) print("{}{}".format(get_indent_space(indent), variable_to_code(var[1])))
......
...@@ -300,7 +300,7 @@ class DistributeTranspiler(object): ...@@ -300,7 +300,7 @@ class DistributeTranspiler(object):
input_deps = grad_name_to_send_dummy_out.values() input_deps = grad_name_to_send_dummy_out.values()
program.global_block().append_op( program.global_block().append_op(
type="send_barrier", type="send_barrier",
inputs={"X": input_deps}, inputs={"X": list(input_deps)},
outputs={"Out": send_barrier_out}, outputs={"Out": send_barrier_out},
attrs={ attrs={
"endpoints": pserver_endpoints, "endpoints": pserver_endpoints,
...@@ -401,7 +401,7 @@ class DistributeTranspiler(object): ...@@ -401,7 +401,7 @@ class DistributeTranspiler(object):
Args: Args:
recv_vars (list): Variable list to recv for current trainer_id recv_vars (list): Variable list to recv for current trainer_id
eplist (list): A list of strings indicating eplist (list): A list of strings indicating
Returns: Returns:
Program: trainer side startup program. Program: trainer side startup program.
...@@ -455,7 +455,7 @@ class DistributeTranspiler(object): ...@@ -455,7 +455,7 @@ class DistributeTranspiler(object):
if len(splited_var) <= 1: if len(splited_var) <= 1:
continue continue
# NOTE: if enable memory optimization, origin vars maybe removed. # NOTE: if enable memory optimization, origin vars maybe removed.
if startup_program.global_block().vars.has_key(varname): if varname in startup_program.global_block().vars:
orig_param = startup_program.global_block().vars[varname] orig_param = startup_program.global_block().vars[varname]
else: else:
origin_param_var = self.origin_program.global_block().vars[ origin_param_var = self.origin_program.global_block().vars[
...@@ -690,7 +690,7 @@ class DistributeTranspiler(object): ...@@ -690,7 +690,7 @@ class DistributeTranspiler(object):
Args: Args:
endpoint (str): current pserver endpoint. endpoint (str): current pserver endpoint.
Returns: Returns:
tuple: (main_program, startup_program), of type "Program" tuple: (main_program, startup_program), of type "Program"
""" """
...@@ -713,7 +713,7 @@ class DistributeTranspiler(object): ...@@ -713,7 +713,7 @@ class DistributeTranspiler(object):
endpoint (str): current pserver endpoint. endpoint (str): current pserver endpoint.
pserver_program (Program): deprecated, call get_pserver_program first. pserver_program (Program): deprecated, call get_pserver_program first.
startup_program (Program): deprecated, should pass startup_program startup_program (Program): deprecated, should pass startup_program
when initializing when initializing
Returns: Returns:
Program: parameter server side startup program. Program: parameter server side startup program.
......