diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 296607b6ec605d652c42c018eb61063cb7d38fc2..eb916cbbb53aab15efa3200f45d8d4e4679d14f2 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -62,6 +62,7 @@ #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" +#include "paddle/phi/kernels/funcs/data_type_transform.h" #include "paddle/utils/string/split.h" #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) @@ -1890,16 +1891,16 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { void AnalysisPredictor::CollectShapeRangeInfo() { // if use gpu, sync first. + paddle::platform::DeviceContextPool &pool = + paddle::platform::DeviceContextPool::Instance(); if (config_.use_gpu()) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - paddle::platform::DeviceContextPool &pool = - paddle::platform::DeviceContextPool::Instance(); - auto gpu_place = place_; - auto *dev_ctx = static_cast(pool.Get(gpu_place)); + auto *dev_ctx = pool.Get(place_); + auto stream = static_cast(dev_ctx)->stream(); #ifdef PADDLE_WITH_HIP - hipStreamSynchronize(dev_ctx->stream()); + hipStreamSynchronize(stream); #else - cudaStreamSynchronize(dev_ctx->stream()); + cudaStreamSynchronize(stream); #endif #endif } @@ -1911,6 +1912,7 @@ void AnalysisPredictor::CollectShapeRangeInfo() { continue; } auto tensor = var->Get(); + if (!tensor.initialized()) continue; framework::DDim dim = tensor.dims(); std::vector shape(dim.size()); for (size_t i = 0; i < shape.size(); ++i) shape[i] = dim[i]; @@ -1922,22 +1924,40 @@ void AnalysisPredictor::CollectShapeRangeInfo() { // This is a simple method to identify all shape tensors with some // mistakes, but it doesn't matter. auto is_shape_tensor = tensor.numel() <= 7 && tensor.numel() >= 1; - if (tensor.dtype() == paddle::experimental::DataType::INT32 && + if ((tensor.dtype() == phi::DataType::INT32 || + tensor.dtype() == phi::DataType::INT64) && is_shape_tensor) { std::vector int32_host(tensor.numel()); - if (tensor.place() == platform::CPUPlace()) { + + if (platform::is_cpu_place(tensor.place())) { + auto &int32_tensor = tensor; + if (tensor.dtype() == phi::DataType::INT64) { + auto *cpu_ctx = pool.Get(platform::CPUPlace()); + int32_tensor = phi::funcs::TransDataType( + reinterpret_cast(*cpu_ctx), + tensor, + DataType::INT32); + } paddle::memory::Copy(platform::CPUPlace(), int32_host.data(), platform::CPUPlace(), - tensor.data(), - tensor.numel() * sizeof(int)); - } else if (tensor.place() == platform::CUDAPlace()) { + int32_tensor.data(), + int32_tensor.numel() * sizeof(int)); + } else if (platform::is_gpu_place(tensor.place())) { #if defined(PADDLE_WITH_CUDA) + auto *dev_ctx = pool.Get(tensor.place()); + auto &int32_tensor = tensor; + if (tensor.dtype() == phi::DataType::INT64) { + int32_tensor = phi::funcs::TransDataType( + reinterpret_cast(*dev_ctx), + tensor, + DataType::INT32); + } paddle::memory::Copy(platform::CPUPlace(), int32_host.data(), - platform::CUDAPlace(), - tensor.data(), - tensor.numel() * sizeof(int), + int32_tensor.place(), + int32_tensor.data(), + int32_tensor.numel() * sizeof(int), nullptr); #endif } diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 87323534bbaf0d8a504c44860da3562f7577cb8f..9c46cb5c3c96cfbe5830e19188ace1b11cb158b1 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -544,6 +544,7 @@ class TensorRTEngineOp : public framework::OperatorBase { "index=%d >= total inputs and outputs=%d", bind_index, num_bindings)); + auto type = framework::TransToProtoVarType(t.dtype()); if (!engine->with_dynamic_shape()) { // check if the input shapes are consistent with model. if (HasAttr(x + "_shape")) { @@ -586,12 +587,27 @@ class TensorRTEngineOp : public framework::OperatorBase { if (engine->engine()->isShapeBinding(bind_index) && engine->engine()->bindingIsInput(bind_index)) { std::vector shape_v(t.numel()); - paddle::memory::Copy(platform::CPUPlace(), - shape_v.data(), - platform::CUDAPlace(), - t.data(), - t.numel() * sizeof(int), - nullptr); + if (type == framework::proto::VarType::INT32) { + paddle::memory::Copy(platform::CPUPlace(), + shape_v.data(), + t.place(), + t.data(), + t.numel() * sizeof(int), + nullptr); + } else if (type == framework::proto::VarType::INT64) { + auto int32_tensor = scope.FindVar(x + "_cast_to_INT32") + ->GetMutable(); + *int32_tensor = phi::Cast( + reinterpret_cast(dev_ctx), + t, + phi::DataType::INT32); + paddle::memory::Copy(platform::CPUPlace(), + shape_v.data(), + int32_tensor->place(), + int32_tensor->data(), + int32_tensor->numel() * sizeof(int), + nullptr); + } trt_context->setInputShapeBinding(bind_index, shape_v.data()); } #endif @@ -608,7 +624,6 @@ class TensorRTEngineOp : public framework::OperatorBase { "The TRT Engine OP's input type should equal " "to the input data type")); - auto type = framework::TransToProtoVarType(t.dtype()); if (type == framework::proto::VarType::FP32) { buffers[bind_index] = static_cast(t.data()); } else if (type == framework::proto::VarType::INT64) { diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 8734750400ac6bb807201e888795a296239ff4ab..5214e462b98eaf14f385779cb21edba892a14a25 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -145,6 +145,13 @@ variance : Variance scale : Scale bias : Bias + outputs : + out : Y + mean_out: MeanOut + variance_out: VarianceOut + saved_mean: SavedMean + saved_variance: SavedVariance + reserve_space: ReserveSpace extra : attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] @@ -407,6 +414,17 @@ - op : dropout backward : dropout_grad + inputs : + x : X + outputs : + out : Out + mask : Mask + attrs : + p : dropout_prob + is_test : is_test + mode : dropout_implementation + seed : seed + fix_seed : fix_seed extra : attrs : [bool fix_seed = false, int seed = 0] @@ -783,6 +801,14 @@ - op : layer_norm backward : layer_norm_grad + inputs : + x : X + scale : Scale + bias : Bias + outputs : + out : Y + mean : Mean + variance : Variance extra : attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] @@ -933,6 +959,17 @@ outputs : out : Out +- op : mean (reduce_mean) + backward : reduce_mean_grad + inputs : + x : X + outputs : + out : Out + attrs : + {axis : dim, keepdim : keep_dim} + extra : + attrs : [bool use_mkldnn = false] + - op : meshgrid backward : meshgrid_grad inputs : @@ -1138,11 +1175,6 @@ extra : attrs : [bool use_mkldnn = false] -- op : reduce_mean - backward : reduce_mean_grad - extra : - attrs : [bool use_mkldnn = false] - - op : reduce_min backward : reduce_min_grad extra : diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index 2d9b06c6050b33d1b07a67609e69a4f95342b51d..8cc09d1a9be1363374848b00155d898b950b2680 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -248,11 +248,8 @@ XPUOpMap& get_kl2_ops() { phi::DataType::INT16, phi::DataType::UINT8, phi::DataType::BOOL, - phi::DataType::FLOAT64, phi::DataType::FLOAT32, - phi::DataType::FLOAT16, - phi::DataType::COMPLEX64, - phi::DataType::COMPLEX128})}, + phi::DataType::FLOAT16})}, {"flatten2_grad", XPUKernelSet({phi::DataType::INT64, phi::DataType::INT32, diff --git a/paddle/phi/kernels/selected_rows/full_kernel.cc b/paddle/phi/kernels/selected_rows/full_kernel.cc index 14987bc61b1593032ddd0c3c59c6e459656969e5..a492c1c304bd2f631f9c107c659eebdaefa82e6b 100644 --- a/paddle/phi/kernels/selected_rows/full_kernel.cc +++ b/paddle/phi/kernels/selected_rows/full_kernel.cc @@ -70,3 +70,17 @@ PD_REGISTER_KERNEL(full_sr, phi::dtype::complex, phi::dtype::complex) {} #endif + +#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) +PD_REGISTER_KERNEL(full_sr, + XPU, + ALL_LAYOUT, + phi::sr::FullKernel, + float, + uint8_t, + int16_t, + int, + int64_t, + bool, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/xpu/full_kernel.cc b/paddle/phi/kernels/xpu/full_kernel.cc index 44c5842210b71b5f1705c9b16a11f8800170053d..ae080d0dad07253f37adc2021c3c9606020dda3d 100644 --- a/paddle/phi/kernels/xpu/full_kernel.cc +++ b/paddle/phi/kernels/xpu/full_kernel.cc @@ -14,6 +14,7 @@ #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" @@ -59,8 +60,19 @@ void FullKernel(const Context& dev_ctx, const Scalar& val, DataType dtype, DenseTensor* out) { + using XPUInTDType = typename XPUTypeTrait::Type; out->Resize(phi::make_ddim(shape.GetData())); - FullValueXPU(dev_ctx, out, val.to()); + int numel = out->numel(); + dev_ctx.template Alloc(out); + auto value = val.to(); + auto out_data = reinterpret_cast(out->data()); + if (numel > 0) { + int r = xpu::constant(dev_ctx.x_context(), + out_data, + out->numel(), + static_cast(value)); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + } } template @@ -103,16 +115,11 @@ void FullLikeKernel(const Context& dev_ctx, phi::errors::InvalidArgument("The filled value is Inf.")); auto out_data = reinterpret_cast(out->data()); - int ret = xpu::constant(dev_ctx.x_context(), - out_data, - out->numel(), - static_cast(value)); - PADDLE_ENFORCE_EQ( - ret, - XPU_SUCCESS, - phi::errors::External("XPU CONSTANT API return wrong value[%d %s].", - ret, - XPUAPIErrorMsg[ret])); + int r = xpu::constant(dev_ctx.x_context(), + out_data, + out->numel(), + static_cast(value)); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); } } // namespace phi @@ -122,24 +129,23 @@ PD_REGISTER_KERNEL(full, ALL_LAYOUT, phi::FullKernel, float, - double, uint8_t, int16_t, int, int64_t, bool, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::float16) {} PD_REGISTER_KERNEL(full_like, XPU, ALL_LAYOUT, phi::FullLikeKernel, float, + uint8_t, + int16_t, int, int64_t, + bool, phi::dtype::float16) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index fa8357f7611b11749ad47022257e022d9d62153d..96a0cfb3fb0e89d97f84672feb826d21c6f8cfcd 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -135,6 +135,7 @@ if(WITH_GPU AND TENSORRT_FOUND) #set_tests_properties(test_trt_multiclass_nms_op PROPERTIES TIMEOUT 200) set_tests_properties(test_trt_dynamic_shape PROPERTIES TIMEOUT 120) set_tests_properties(test_trt_inspector PROPERTIES TIMEOUT 60) + set_tests_properties(test_trt_inference_predictor PROPERTIES TIMEOUT 60) if(WITH_NV_JETSON) set_tests_properties( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inference_predictor.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inference_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..561502b6250a0cffe2a9983e76a1e3f256941c3c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inference_predictor.py @@ -0,0 +1,399 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import sys +import tempfile +import unittest + +import numpy as np +import yaml + +import paddle +import paddle.nn as nn + +try: + import paddle.inference as paddle_infer +except Exception as e: + sys.stderr.write("Cannot import paddle, maybe paddle is not installed.\n") + +paddle.set_device('cpu') +paddle.disable_signal_handler() + + +def str2bool(v): + if v.lower() == 'true': + return True + else: + return False + + +def getdtype(dtype="float32"): + if dtype == "float32" or dtype == "float": + return np.float32 + if dtype == "float16": + return np.float16 + if dtype == "float64": + return np.float64 + if dtype == "int32": + return np.int32 + if dtype == "int64": + return np.int64 + + +class BackendPaddle: + def __init__(self): + super(BackendPaddle, self).__init__() + self.h2d_time = [] + self.compute_time = [] + self.d2h_time = [] + + def version(self): + return paddle.version.full_version + + def name(self): + return "paddle" + + def load(self, config_arg, inputs=None, outpus=None): + self.args = config_arg + if os.path.exists(self.args.model_dir): + model_file = os.path.join( + self.args.model_dir + "/" + self.args.paddle_model_file + ) + model_params = os.path.join( + self.args.model_dir + "/" + self.args.paddle_params_file + ) + config = paddle_infer.Config(model_file, model_params) + else: + raise ValueError( + f"The model dir {self.args.model_dir} does not exists!" + ) + + # enable memory optim + if not self.args.enable_tune: + config.enable_memory_optim() + + config.set_cpu_math_library_num_threads(self.args.cpu_threads) + config.switch_ir_optim(True) + # debug + if self.args.enable_debug: + config.switch_ir_debug() + precision_mode = paddle_infer.PrecisionType.Float32 + if self.args.precision == 'fp16': + precision_mode = paddle_infer.PrecisionType.Half + elif self.args.precision == 'int8': + precision_mode = paddle_infer.PrecisionType.Int8 + + if self.args.enable_mkldnn and not self.args.enable_gpu: + config.disable_gpu() + config.enable_mkldnn() + if self.args.precision == 'int8': + config.enable_mkldnn_int8( + {"conv2d", "depthwise_conv2d", "transpose2", "pool2d"} + ) + if not self.args.enable_mkldnn and not self.args.enable_gpu: + config.disable_gpu() + # config.enable_mkldnn() + if self.args.enable_profile: + config.enable_profile() + shape_range_file = os.path.join( + self.args.model_dir, self.args.shape_range_file + ) + if self.args.enable_tune: + config.collect_shape_range_info(shape_range_file) + if self.args.enable_gpu: + config.enable_use_gpu(256, self.args.gpu_id) + if self.args.enable_trt: + max_batch_size = self.args.batch_size + if ( + self.args.yaml_config["input_shape"]["0"]["shape"][ + self.args.test_num + ][0] + != -1 + ): + max_batch_size = self.args.yaml_config["input_shape"]["0"][ + "shape" + ][self.args.test_num][0] + config.enable_tensorrt_engine( + workspace_size=1 << 33, + precision_mode=precision_mode, + max_batch_size=max_batch_size, + min_subgraph_size=self.args.subgraph_size, + use_static=False, + use_calib_mode=False + if self.args.precision == 'int8' + else False, + ) + if self.args.enable_dynamic_shape: + if os.path.exists(shape_range_file): + config.enable_tuned_tensorrt_dynamic_shape( + shape_range_file, True + ) + config.disable_glog_info() + config.exp_disable_tensorrt_ops(["range"]) + + self.predictor = paddle_infer.create_predictor(config) + + input_shape = self.args.yaml_config["input_shape"] + if len(input_shape) <= 0: + raise Exception("input shape is empty.") + + if "input_data" in self.args.yaml_config: + input_file = self.args.yaml_config["input_data"]["data"][ + self.args.test_num + ] + self.numpy_input = np.load(input_file, allow_pickle=True) + + return self + + def set_input(self): + # set input tensor + input_names = self.predictor.get_input_names() + for i, name in enumerate(input_names): + input_tensor = self.predictor.get_input_handle(name) + if "input_data" not in self.args.yaml_config: + if ( + self.args.yaml_config["input_shape"][str(i)]["shape"][ + self.args.test_num + ][0] + == -1 + ): + input_shape = [ + self.args.batch_size + ] + self.args.yaml_config["input_shape"][str(i)]["shape"][ + self.args.test_num + ][ + 1: + ] + dtype = self.args.yaml_config["input_shape"][str(i)][ + "dtype" + ][self.args.test_num] + else: + input_shape = self.args.yaml_config["input_shape"][str(i)][ + "shape" + ][self.args.test_num] + dtype = self.args.yaml_config["input_shape"][str(i)][ + "dtype" + ][self.args.test_num] + if hasattr(self.args, "test_data"): + fake_input = self.args.test_data[i].astype(getdtype(dtype)) + else: + fake_input = np.ones(input_shape, dtype=getdtype(dtype)) + input_tensor.copy_from_cpu(fake_input) + else: + real_input = np.expand_dims(self.numpy_input[i], 0).repeat( + self.args.batch_size, axis=0 + ) + input_tensor.copy_from_cpu(real_input) + + def set_output(self): + results = [] + # get out data from output tensor + output_names = self.predictor.get_output_names() + for i, name in enumerate(output_names): + output_tensor = self.predictor.get_output_handle(name) + output_data = output_tensor.copy_to_cpu() + if self.args.return_result or self.args.save_result: + results.append(output_data) + if self.args.return_result or self.args.save_result: + return results + + def reset(self): + self.h2d_time.clear() + self.d2h_time.clear() + self.compute_time.clear() + + def warmup(self): + pass + + def predict(self, feed=None): + self.set_input() + self.predictor.run() + output = self.set_output() + if self.args.return_result or self.args.save_result: + return output + + def predict_nocopy(self, feed=None): + self.predictor.run() + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--batch_size', type=int, default=1) + parser.add_argument('--cpu_threads', type=int, default=1) + parser.add_argument('--inter_op_threads', type=int, default=1) + parser.add_argument( + '--precision', type=str, choices=["fp32", "fp16", "int8"] + ) + parser.add_argument( + '--backend_type', + type=str, + choices=["paddle", "onnxruntime", "openvino", "tensorrt"], + default="paddle", + ) + parser.add_argument('--gpu_id', type=int, default=0) + parser.add_argument('--subgraph_size', type=int, default=1) + parser.add_argument('--model_dir', type=str) + parser.add_argument( + '--paddle_model_file', type=str, default="model.pdmodel" + ) + parser.add_argument( + '--paddle_params_file', type=str, default="model.pdiparams" + ) + parser.add_argument('--enable_mkldnn', type=str2bool, default=False) + parser.add_argument('--enable_gpu', type=str2bool, default=True) + parser.add_argument('--enable_trt', type=str2bool, default=True) + parser.add_argument('--enable_dynamic_shape', type=str2bool, default=True) + parser.add_argument('--enable_tune', type=str2bool, default=False) + parser.add_argument('--enable_profile', type=str2bool, default=False) + parser.add_argument('--enable_benchmark', type=str2bool, default=True) + parser.add_argument('--save_result', type=str2bool, default=False) + parser.add_argument('--return_result', type=str2bool, default=False) + parser.add_argument('--enable_debug', type=str2bool, default=False) + parser.add_argument( + '--config_file', type=str, required=False, default="config/model.yaml" + ) + parser.add_argument( + '--shape_range_file', type=str, default="shape_range.pbtxt" + ) + args, unknown = parser.parse_known_args() + return args + + +def run_infer(model_path): + conf = parse_args() + + yaml_config = yaml.safe_load( + ''' + input_shape: + '0': + dtype: [float32] + shape: + - [-1, 3, 32, 32] + ''' + ) + + conf.yaml_config = yaml_config + conf.test_num = 0 + conf.model_dir = model_path + + conf.enable_tune = True + # collect shape use CPU + conf.enable_gpu = False + backend = BackendPaddle() + backend.load(conf) + backend.predict() + + # collect shape use GPU + conf.enable_gpu = True + backend = BackendPaddle() + backend.load(conf) + backend.predict() + + # run inference predictor + conf.enable_tune = False + backend = BackendPaddle() + backend.load(conf) + backend.predict() + + +class ConvBNLayer(paddle.nn.Layer): + def __init__( + self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + ): + super().__init__() + + self._conv = paddle.nn.Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + bias_attr=False, + ) + + self._batch_norm = paddle.nn.BatchNorm(num_filters, act=act) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class Test(nn.Layer): + def __init__(self): + super(Test, self).__init__() + self.conv = ConvBNLayer( + num_channels=3, num_filters=64, filter_size=3, stride=2, act='relu' + ) + self.pool2d_max = paddle.nn.MaxPool2D( + kernel_size=3, stride=1, padding=1 + ) + self.pool2d_avg = paddle.nn.AdaptiveAvgPool2D(output_size=1) + + def forward(self, x): + x = self.conv(x) + x = self.pool2d_avg(x) + + x = paddle.reshape( + x, + shape=[ + paddle.to_tensor([-1], dtype=paddle.int64), + paddle.to_tensor([8], dtype=paddle.int64), + ], + ) + return x + + +class TestInferencePredictor(unittest.TestCase): + def setUp(self): + # enable dygraph mode + paddle.disable_static() + self.temp_dir = tempfile.TemporaryDirectory() + self.path = os.path.join(self.temp_dir.name, './inference/model') + self.path = "./inference/model" + + def tearDown(self): + self.temp_dir.cleanup() + + def SaveInferenceModel(self): + paddle.disable_static() + net = Test() + net.eval() + + net(paddle.rand(shape=[1, 3, 32, 32], dtype='float32')) + input_spec = [ + paddle.static.InputSpec( + shape=[-1, 3, 32, 32], dtype=paddle.float32, name='input' + ) + ] + + static_model = paddle.jit.to_static(net, input_spec=input_spec) + paddle.jit.save(static_model, self.path) + + def testInferencePredictor(self): + self.SaveInferenceModel() + run_infer(os.path.dirname(self.path)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/incubate/autograd/composite_rules.py b/python/paddle/incubate/autograd/composite_rules.py index 1d92d18cf347ac60a5b929247c83c2fbfd19da7e..1d00213b7bd296c44d36e0726c1e45121adf05fb 100644 --- a/python/paddle/incubate/autograd/composite_rules.py +++ b/python/paddle/incubate/autograd/composite_rules.py @@ -98,9 +98,9 @@ def composite_batchnorm( run_mean_ = assign(run_mean) run_var_ = assign(run_var) if trainable_statistics or not is_test: - return run_mean_, None, batch_mean_, batch_var_, run_var_, y + return y, run_mean_, run_var_, batch_mean_, batch_var_, None else: - return run_mean_, batch_mean_, batch_var_, run_var_, y + return y, run_mean_, run_var_, batch_mean_, batch_var_ @REGISTER_COMPOSITE('layer_norm') diff --git a/python/paddle/incubate/autograd/generate_op_map.py b/python/paddle/incubate/autograd/generate_op_map.py index d162789c226324096ff9c4eed95a5e2ff8ae1c74..34cef37c3cc995e10049b19a3fdfaab7b15f9fc4 100644 --- a/python/paddle/incubate/autograd/generate_op_map.py +++ b/python/paddle/incubate/autograd/generate_op_map.py @@ -84,7 +84,7 @@ def generate_code( else: op_name = key map_dct[op_name] = {"phi_name": op_name} - for element in ["inputs", "attrs"]: + for element in ["inputs", "outputs", "attrs"]: if element in item.keys(): map_dct[op_name][element] = item[element] for element in ["scalar", "int_array"]: diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py index 5e79128e568c4168c9b205cbf7fc6dd72222ebff..55ab90c27c65b2b758cbb7d82f2784b4a6ed757e 100644 --- a/python/paddle/incubate/autograd/primx.py +++ b/python/paddle/incubate/autograd/primx.py @@ -36,6 +36,7 @@ from .utils import ( flatten_and_remove_none, get_input_var_list, get_output_var_list, + get_output_vars_from_comosite, prepare_python_api_arguments, ) @@ -596,19 +597,37 @@ def _lower_composite(block, blacklist=[]): # if output var of composite rule is None, this means this var is not needed none_vars_to_remove = set() + change = None # Step2: Process all ops in the target block for op_idx in range(len(block.ops)): op = block.ops[op_idx] ops_to_remove.append(op_idx) if lookup_fn(op.type) is not None and op.type not in blacklist: + change = True + op_name = op.type input_args = prepare_python_api_arguments(op) bind(input_args, to_bind, value_table) + orig_outs = expand_nested_list( + get_output_vars_from_comosite(op) + ) + new_outs = expand_nested_list( + as_tensors(lower_fn(op, *input_args)) + ) + assert len(orig_outs) == len(new_outs), ( + f'when replace origin op {op_name} with composite rule, num of origin outs should be equal to new outs, ' + f'but len(orig_outs) = {len(orig_outs)} and len(new_outs) = {len(new_outs)}' + ) for orig_out, new_out in zip( - expand_nested_list(get_output_var_list(op)), - expand_nested_list(as_tensors(lower_fn(op, *input_args))), + orig_outs, + new_outs, ): if new_out is not None: + if orig_out.shape and new_out.shape: + assert orig_out.shape == new_out.shape, ( + f'when replace origin op {op_name} with composite rule, origin out shape should be equal to new out shape, ' + f'but orig_out.shape={orig_out.shape} and new_out.shape={new_out.shape}' + ) assert not (orig_out is None) ^ ( new_out is None ), "orig_out and new_out should match." @@ -675,6 +694,10 @@ def _lower_composite(block, blacklist=[]): block.desc._remove_var(var_name.encode()) del block.vars[var_name] block._sync_with_cpp() + + # composite ops may contain other composite ops, thus, call _lower_composite again. + if change: + _lower_composite(block, blacklist) return elif isinstance(block, typing.Sequence): diff --git a/python/paddle/incubate/autograd/utils.py b/python/paddle/incubate/autograd/utils.py index 211851160b17fd6c148428c2983624fba3062e5f..fe7fa229ffa701fd3fda51c6214e4ad76dd170a1 100644 --- a/python/paddle/incubate/autograd/utils.py +++ b/python/paddle/incubate/autograd/utils.py @@ -169,6 +169,7 @@ def _get_args_values(op, phi_name): arg_type, arg_name = _solve_arg(item) op_content = op_map[op.type] if arg_type in ("Tensor", "Tensor[]"): + # assume Tensor type must belong to inputs if ( "inputs" in op_content.keys() and arg_name in op_content["inputs"].keys() @@ -182,8 +183,11 @@ def _get_args_values(op, phi_name): "attrs" in op_content.keys() and arg_name in op_content["attrs"].keys() ): - attrs.append(op.attr(op_content["attrs"][arg_name])) - attrs.append(op.attr(arg_name)) + arg_name = op_content["attrs"][arg_name] + if arg_name not in op.attr_names: + attrs.append(None) + else: + attrs.append(op.attr(arg_name)) return inputs, attrs @@ -202,7 +206,12 @@ def prepare_python_api_arguments(op): else: phi_name = op.type inputs, attrs = _get_args_values(op, phi_name) - res = [get_var_block(op.block, op.input(n)) for n in inputs] + res = [] + for item in inputs: + if item in op.input_names: + res.append(get_var_block(op.block, op.input(item))) + else: + res.append(None) if attrs: res.extend(attrs) return res @@ -218,6 +227,37 @@ def get_output_var_list(op): ] +def get_output_vars_from_comosite(op): + """origin op outputs must be mapped into outputs of composite rule.""" + origin_output_names = op.output_names + if origin_output_names is None: + return [] + else: + name = op.type + res = [] + if op_map[name].get("outputs"): + for item in op_map[name]["outputs"].keys(): + origin_output_name = op_map[name]["outputs"][item] + if origin_output_name not in origin_output_names: + # in some cases, some output of origin op is optional, so op name may not be in origin_output_names + continue + origin_output_var = get_var_block( + op.block, op.output(origin_output_name) + ) + res.append(origin_output_var) + elif len(origin_output_names) == 1: + # When origin output num is 1, map info is not needed. + origin_output_var = get_var_block( + op.block, op.output(origin_output_names[0]) + ) + res.append(origin_output_var) + else: + raise ValueError( + "When replace op with composite rule, there must exist output map info from origin op to composite rule." + ) + return res + + def flatten(inp): if inp is None or isinstance(inp, paddle.fluid.framework.Variable): return [inp]