diff --git a/paddle/infrt/dialect/phi/ir/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc
index f91381fe729034b3e2d36068dce43d531bfedc1c..1bd6068d3fb961f3f35456f220d76c416614e49f 100644
--- a/paddle/infrt/dialect/phi/ir/phi_base.cc
+++ b/paddle/infrt/dialect/phi/ir/phi_base.cc
@@ -14,7 +14,7 @@
 #include "paddle/infrt/dialect/phi/ir/phi_base.h"
 
-#include
+#include
 #include
 #include
 #include
diff --git a/paddle/infrt/dialect/tensorrt/convert.h b/paddle/infrt/dialect/tensorrt/convert.h
index be363e77848a5c0b22616257cb969f5f22f3cb6f..2a242ca285ba8cf3e9e4257950c6137fb9e5d225 100644
--- a/paddle/infrt/dialect/tensorrt/convert.h
+++ b/paddle/infrt/dialect/tensorrt/convert.h
@@ -15,7 +15,7 @@
 #include
 #include
-#include
+#include
 #include
 #include
 #include
diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc
index 2682a744bb056f6da46d6a959112d54d44249fb2..dcb84ceb50edf91956664d21c4ad804e0aac0281 100644
--- a/paddle/infrt/dialect/tensorrt/trt_exec.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc
@@ -87,7 +87,7 @@ int main(int argc, char** argv) {
     std::cout << "\npass failed!\n" << std::endl;
     return 4;
   }
-  // module->dump();
+  module->dump();
   ::infrt::host_context::TestMlir(module.get(), &registry);
   return 0;
 }
diff --git a/paddle/infrt/host_context/op_executable.cc b/paddle/infrt/host_context/op_executable.cc
index 59a73e71083286b81f2bbdfa20a4ed96a8353a2f..4d588a9c2b523c6941fe1ed36ba8473307005930 100644
--- a/paddle/infrt/host_context/op_executable.cc
+++ b/paddle/infrt/host_context/op_executable.cc
@@ -16,6 +16,7 @@
 #include
 #include
+#include <unordered_set>
 
 #include "paddle/infrt/host_context/kernel_frame.h"
 #include "paddle/infrt/host_context/kernel_registry.h"
@@ -71,7 +72,15 @@ OpExecutableBuilder::OpExecutableBuilder(const std::string& op_name,
   // TODO(Superjomn) support other device other than CPU.
   CHECK(impl_->kernel_impl) << "No CPU kernel called " << op_name;
 
-  if (op_name == "dt.get_param") {
+  // TODO(wilber): Maybe we can use the MLIR trait or other facilities to remove
+  // the run_once set.
+  std::unordered_set<std::string> run_once_set{
+      "dt.get_param",
+      "trt.create_engine",
+      "phi_dt.create_host_inited_dense_tensor.f32",
+      "phi_dt.create_context.cpu",
+      "phi_dt.create_context.gpu"};
+  if (run_once_set.count(op_name)) {
     impl_->run_once = true;
   }
 }
diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
index fe1cda0e10028b45e0b002074a1cd52312fae57e..7ffc8de15107563f778ea00a1a517d13b02e0938 100644
--- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
+++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
@@ -22,6 +22,7 @@
 #include "paddle/infrt/tensor/tensor_map.h"
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"
 
 #ifdef INFRT_WITH_GPU
 #include
@@ -308,34 +309,50 @@ inline size_t SizeOfDataType(::phi::DataType data_type) {
   }
   return 0;
 }
-::phi::DenseTensor GpuMemCpy(const ::phi::DenseTensor& input,
-                             const ::phi::GPUContext& context,
-                             bool d2h) {
+void GpuMemCpy(const ::phi::DenseTensor& input,
+               const ::phi::GPUContext& context,
+               bool d2h,
+               ::phi::DenseTensor* output) {
   if (d2h) {
-    ::phi::DenseTensor ret(
-        const_cast<::phi::Allocator*>(&context.GetHostAllocator()),
-        input.meta());
     CHECK(input.place().GetType() == ::phi::AllocationType::GPU);
-    // TODO(wilber): Add sync op and stream.
-    cudaMemcpyAsync(ret.data(),
+
+    // TODO(wilber): Just a trick to avoid malloc.
+    if (input.numel() > output->numel()) {
+      // TODO(wilber): Use pinned memory.
+      output->Resize(input.dims());
+      context.HostAlloc(
+          output, input.dtype(), input.numel() * SizeOfDataType(input.dtype()));
+    }
+
+    cudaMemcpyAsync(output->data(),
                     input.data(),
                     SizeOfDataType(input.dtype()) * input.numel(),
                     cudaMemcpyDeviceToHost,
-                    nullptr);
-    return ret;
+                    context.stream());
+    // TODO(wilber): Ir add sync op.
+    cudaStreamSynchronize(context.stream());
   } else {  // h2d
-    ::phi::DenseTensor ret(
-        const_cast<::phi::Allocator*>(&context.GetAllocator()), input.meta());
     CHECK(input.place().GetType() == ::phi::AllocationType::CPU ||
           input.place().GetType() == ::phi::AllocationType::GPUPINNED);
+
+    if (input.numel() > output->numel()) {
+      output->Resize(input.dims());
+      context.Alloc(output,
+                    input.dtype(),
+                    input.numel() * SizeOfDataType(input.dtype()),
+                    false);
+
+    } else {
+      output->Resize(input.dims());
+    }
+
     // TODO(wilber): Add sync op and stream.
-    cudaMemcpyAsync(ret.data(),
+    cudaMemcpyAsync(output->data(),
                     input.data(),
                     SizeOfDataType(input.dtype()) * input.numel(),
                     cudaMemcpyHostToDevice,
-                    nullptr);
-    return ret;
+                    context.stream());
   }
 }
 #endif
diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h
index b1075444731b524156a5984713b9887b81b94437..c401fb99978a364d36bce63cffe9cabe4cfc221b 100644
--- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h
+++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h
@@ -76,9 +76,10 @@ void PrintDenseTensor(::phi::DenseTensor* dense_tensor);
 int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map);
 
 #ifdef INFRT_WITH_GPU
-::phi::DenseTensor GpuMemCpy(const ::phi::DenseTensor& input,
-                             const ::phi::GPUContext& context,
-                             bool d2h);
+void GpuMemCpy(const ::phi::DenseTensor& input,
+               const ::phi::GPUContext& context,
+               bool d2h,
+               ::phi::DenseTensor* output);
 #endif
 
 }  // namespace phi
diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc
index 65e137472b3d6225cff990afd2d97384d95adae7..2e952e77d1f0aab9d427096bced62bc607dd81a0 100644
--- a/paddle/infrt/kernel/tensor_kernels.cc
+++ b/paddle/infrt/kernel/tensor_kernels.cc
@@ -119,6 +119,7 @@ void NaiveMatmul(const DenseHostTensor &x,
   const int N = w.shape().GetDim(1);
   for (int i = 0; i < M; i++) {
     for (int j = 0; j < N; j++) {
+      out_data[i * N + j] = 0;
       for (int k = 0; k < K; k++) {
         out_data[i * N + j] += x_data[i * K + k] * w_data[k * N + j];
       }
@@ -134,9 +135,11 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) {
                       {"shape"});
   registry->AddKernel("dt.print_tensor", INFRT_KERNEL(PrintTensor));
   registry->AddKernel("dt.fill_tensor_with_constant.f32",
-                      INFRT_KERNEL(FillTensorWithConstant<float>));
+                      INFRT_KERNEL(FillTensorWithConstant<float>),
+                      {"value"});
   registry->AddKernel("dt.fill_tensor_with_constant.f64",
-                      INFRT_KERNEL(FillTensorWithConstant<double>));
+                      INFRT_KERNEL(FillTensorWithConstant<double>),
+                      {"value"});
 
   // TensorMap related methods.
   registry->AddKernel("dt.load_params", INFRT_KERNEL(LoadParams));
diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc
index c182dda2705fd82f545c45123f43d7e30195b646..c0f5ebb4a7657619aea82ef40ee5602a5b90f5d0 100644
--- a/paddle/infrt/kernel/tensorrt/trt_kernels.cc
+++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc
@@ -57,7 +57,7 @@ namespace tensorrt {
   // TODO(wilber): The build option should be filled from mlir info.
   backends::tensorrt::BuildOptions options;
   options.max_batch = 4;
-  options.workspace = 1024;
+  options.workspace = 128;
 
   // Parse mlir Region which only has one block.
   mlir::Operation& operation = *create_engine_op.operation;
diff --git a/paddle/infrt/kernel/tensorrt/trt_layers.h b/paddle/infrt/kernel/tensorrt/trt_layers.h
index 9d8eba0bb31f59486fe1eff135ca4323672ebfc9..0f2c2c88ca097e8a4fabade2c6cf6f9ce5f9875b 100644
--- a/paddle/infrt/kernel/tensorrt/trt_layers.h
+++ b/paddle/infrt/kernel/tensorrt/trt_layers.h
@@ -115,6 +115,27 @@ inline void PoolFunc(trt::PoolingOp& op,  // NOLINT
   // TODO(Inference)
   // CHECK(false) << "Not supported adaptive pool";
 
+  // TODO(wilber): Reformat.
+  // global average pooling.
+  auto ksize_vec = ArrayAttrToVec<int>(ksize);
+  if (static_cast<nvinfer1::PoolingType>(pool_type) ==
+          nvinfer1::PoolingType::kAVERAGE &&
+      ksize_vec.size() == 2 && ksize_vec[0] == 1 && ksize_vec[1] == 1) {
+    nvinfer1::Dims dims;
+    dims.nbDims = 2;
+    dims.d[0] = input_shape.d[1];
+    dims.d[1] = input_shape.d[2];
+    auto* layer = network->addPoolingNd(
+        *input_itensor, static_cast<nvinfer1::PoolingType>(pool_type), dims);
+    CHECK_NOTNULL(layer);
+
+    mlir::Value out_repr = op.output_tensor();
+    nvinfer1::ITensor* out_tensor = layer->getOutput(0);
+    value_to_trt_tensor_map[out_repr] = out_tensor;
+    return;
+  }
+
+  // plugin...
 ­ std::vector<int> input_shape_v;
   for (int i = 0; i < input_dims; i++) {
     input_shape_v.push_back(input_shape.d[i]);
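Note on the NaiveMatmul change above: the added "out_data[i * N + j] = 0;" matters because the kernel accumulates into the output buffer, which is not guaranteed to be zero-initialized. The following is a minimal standalone sketch of the same accumulation pattern, using plain C++ arrays and hypothetical names rather than the infrt DenseHostTensor API, to illustrate why the explicit reset is needed when the output buffer may hold stale values.

#include <cstdio>
#include <vector>

// Same pattern as NaiveMatmul: reset each output element before the inner
// reduction, so a reused (non-zeroed) buffer cannot leak into the result.
void naive_matmul(const float* x, const float* w, float* out,
                  int M, int K, int N) {
  for (int i = 0; i < M; i++) {
    for (int j = 0; j < N; j++) {
      out[i * N + j] = 0;  // the fix: do not rely on the buffer being zeroed
      for (int k = 0; k < K; k++) {
        out[i * N + j] += x[i * K + k] * w[k * N + j];
      }
    }
  }
}

int main() {
  const int M = 2, K = 2, N = 2;
  std::vector<float> x = {1, 2, 3, 4};
  std::vector<float> w = {5, 6, 7, 8};
  // Deliberately dirty output buffer; the result must still be correct.
  std::vector<float> out(M * N, 42.f);
  naive_matmul(x.data(), w.data(), out.data(), M, K, N);
  for (int i = 0; i < M * N; i++) printf("%g ", out[i]);  // prints: 19 22 43 50
  printf("\n");
  return 0;
}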