Unverified · Commit c4a3e8b4 · Authored by feng_shuai · Committed by GitHub

merge CMakeList.txt manual (#35378)

* merge CMakeList.txt manual

* add platform namespace for ChangeThreadNum

* repair some bugs reported by make errors

* do nothing, just re-trigger CI

* add the forgotten thread-num change

* add inplace_atol param for check_output_with_place

* Windows

* std::min and std::max should be changed to (std::min) and (std::max) because of Windows
Parent: 5199c744
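At a glance: on NVIDIA Jetson boards, CUDA kernels launched with 1024-thread blocks can crash, so this commit routes each hard-coded 1024 through a new platform::ChangeThreadNum helper that lowers the block size on Nano (compute capability 5.3) and TX2 (6.2). A minimal sketch of the recurring pattern, condensed from the hunks below (num_outputs stands in for each kernel's element count):

int num_thread = 1024;
#ifdef WITH_NV_JETSON
platform::ChangeThreadNum(context, &num_thread);  // caps at 512 by default;
                                                  // roi_align passes 256
#endif
int blocks = (num_outputs + num_thread - 1) / num_thread;  // DivUp
dim3 threads(num_thread, 1);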
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
 namespace paddle {
 namespace operators {
@@ -104,10 +105,14 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
     int col_width = col->dims()[4];
     int num_outputs = im_channels * col_height * col_width;
-    int blocks = (num_outputs + 1024 - 1) / 1024;
+    int num_thread = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &num_thread);
+#endif
+    int blocks = (num_outputs + num_thread - 1) / num_thread;
     int block_x = 512;
     int block_y = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
+    dim3 threads(num_thread, 1);
     dim3 grid(block_x, block_y);
     im2col<T><<<grid, threads, 0, context.stream()>>>(
         im.data<T>(), num_outputs, im_height, im_width, dilation[0],
@@ -228,10 +233,14 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
     size_t num_kernels = im_channels * im_height * im_width;
-    size_t blocks = (num_kernels + 1024 - 1) / 1024;
+    int num_thread = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &num_thread);
+#endif
+    size_t blocks = (num_kernels + num_thread - 1) / num_thread;
     size_t block_x = 512;
     size_t block_y = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
+    dim3 threads(num_thread, 1);
     dim3 grid(block_x, block_y);
     // To avoid involving atomic operations, we will launch one kernel per
......
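To make the launch shape above concrete, a worked example assuming one million im2col outputs on a Jetson device (so num_thread was capped to 512):

int num_outputs = 1000000;                                 // assumed value
int num_thread = 512;                                      // after ChangeThreadNum
int blocks = (num_outputs + num_thread - 1) / num_thread;  // 1954
int block_x = 512;
int block_y = (blocks + 512 - 1) / 512;                    // 4
dim3 threads(num_thread, 1);                               // 512 x 1 threads per block
dim3 grid(block_x, block_y);                               // 512 x 4 = 2048 blocks >= 1954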
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/pooling.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
 namespace paddle {
 namespace operators {
@@ -254,8 +255,13 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
   const int padding_width = paddings[1];
   int nthreads = batch_size * output_channels * output_height * output_width;
-  int blocks = (nthreads + 1024 - 1) / 1024;
-  dim3 threads(1024, 1);
+  int thread_num = 1024;
+#ifdef WITH_NV_JETSON
+  // platform::ChangeThreadNum(context, &thread_num);
+  thread_num = 512;
+#endif
+  int blocks = (nthreads + thread_num - 1) / thread_num;
+  dim3 threads(thread_num, 1);
   dim3 grid(blocks, 1);
   KernelPool2D<PoolProcess, T><<<grid, threads, 0, stream>>>(
@@ -298,10 +304,13 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     T* output_data = output->mutable_data<T>(context.GetPlace());
     int nthreads = batch_size * output_channels * output_height * output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
+    int thread_num = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &thread_num);
+#endif
+    int blocks = (nthreads + thread_num - 1) / thread_num;
+    dim3 threads(thread_num, 1);
     dim3 grid(blocks, 1);
     KernelPool2D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
         nthreads, input_data, input_channels, input_height, input_width,
         output_height, output_width, ksize_height, ksize_width, stride_height,
@@ -341,10 +350,13 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     T* output_data = output->mutable_data<T>(context.GetPlace());
     int nthreads = batch_size * output_channels * output_height * output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
+    int thread_num = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &thread_num);
+#endif
+    int blocks = (nthreads + thread_num - 1) / thread_num;
+    dim3 threads(thread_num, 1);
     dim3 grid(blocks, 1);
     KernelPool2D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
         nthreads, input_data, input_channels, input_height, input_width,
         output_height, output_width, ksize_height, ksize_width, stride_height,
@@ -911,8 +923,12 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
+    int thread_num = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &thread_num);
+#endif
+    int blocks = (nthreads + thread_num - 1) / thread_num;
+    dim3 threads(thread_num, 1);
     dim3 grid(blocks, 1);
     KernelPool3D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
@@ -962,8 +978,12 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
+    int thread_num = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &thread_num);
+#endif
+    int blocks = (nthreads + thread_num - 1) / thread_num;
+    dim3 threads(thread_num, 1);
     dim3 grid(blocks, 1);
     KernelPool3D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
@@ -1377,10 +1397,14 @@ class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
     T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
     int nthreads = batch_size * output_channels * output_height * output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
+    int thread_num = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &thread_num);
+#endif
+    int blocks = (nthreads + thread_num - 1) / thread_num;
+    dim3 threads(thread_num, 1);
+    dim3 grid(blocks, 1);
     KernelMaxPool2dWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
         nthreads, input_data, input_channels, input_height, input_width,
         output_height, output_width, ksize_height, ksize_width, stride_height,
@@ -1613,8 +1637,13 @@ class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
+    int thread_num = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &thread_num);
+#endif
+    int blocks = (nthreads + thread_num - 1) / thread_num;
+    dim3 threads(thread_num, 1);
     dim3 grid(blocks, 1);
     KernelMaxPool3DWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
......
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/operators/math/vol2col.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
 namespace paddle {
 namespace operators {
@@ -152,8 +153,14 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
     int num_outputs =
         input_channels * output_depth * output_height * output_width;
-    const int threads = 1024;
-    const int blocks = (num_outputs + 1024 - 1) / 1024;
+    int max_threads = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &max_threads);
+#endif
+    const int threads = max_threads;
+    const int blocks = (num_outputs + max_threads - 1) / max_threads;
     vol2col<T><<<blocks, threads, 0, context.stream()>>>(
         num_outputs, vol.data<T>(), input_depth, input_height, input_width,
         dilations[0], dilations[1], dilations[2], filter_depth, filter_height,
@@ -313,8 +320,13 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
     int num_kernels = input_channels * input_depth * input_height * input_width;
-    const int threads = 1024;
-    const int blocks = (num_kernels + 1024 - 1) / 1024;
+    int max_threads = 1024;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(context, &max_threads);
+#endif
+    const int threads = max_threads;
+    const int blocks = (num_kernels + max_threads - 1) / max_threads;
     col2vol<T><<<blocks, threads, 0, context.stream()>>>(
         num_kernels, col.data<T>(), input_depth, input_height, input_width,
......
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/operators/roi_align_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
 namespace paddle {
 namespace operators {
@@ -261,7 +262,9 @@ class GPUROIAlignOpKernel : public framework::OpKernel<T> {
     int output_size = out->numel();
     int blocks = NumBlocks(output_size);
     int threads = kNumCUDAThreads;
+#ifdef WITH_NV_JETSON
+    platform::ChangeThreadNum(ctx.cuda_device_context(), &threads, 256);
+#endif
     Tensor roi_batch_id_list;
     roi_batch_id_list.Resize({rois_num});
     auto cplace = platform::CPUPlace();
......
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
 namespace paddle {
 namespace platform {
@@ -65,6 +66,11 @@ struct ForRange<CUDADeviceContext> {
 #ifdef __HIPCC__
     // HIP will throw core dump when threads > 256
     constexpr int num_threads = 256;
+#elif WITH_NV_JETSON
+    // JETSON_NANO will throw core dump when threads > 128
+    int num_thread = 256;
+    platform::ChangeThreadNum(dev_ctx_, &num_thread, 128);
+    const int num_threads = num_thread;
 #else
     constexpr int num_threads = 1024;
 #endif
......
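For context, ForRange<CUDADeviceContext> invokes its functor once per index in [0, limit), which is why the block size chosen above matters on Jetson. A minimal usage sketch; the Scale functor is hypothetical, and HOSTDEVICE is Paddle's host/device qualifier macro:

// #include "paddle/fluid/platform/for_range.h"
struct Scale {
  float* x;
  float a;
  HOSTDEVICE void operator()(size_t i) const { x[i] *= a; }  // runs per index
};

// platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, n);
// for_range(Scale{data, 2.0f});  // scales data[0..n) in place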
@@ -23,6 +23,7 @@
 #else
 #include <hip/hip_runtime.h>
 #endif
+#include <stddef.h>
 #include <algorithm>
 #include <string>
@@ -33,6 +34,18 @@ namespace platform {
 inline int DivUp(int a, int b) { return (a + b - 1) / b; }
+#ifdef WITH_NV_JETSON
+// The number of threads cannot be assigned 1024 in some cases when the
+// device is Nano or TX2.
+inline void ChangeThreadNum(const platform::CUDADeviceContext& context,
+                            int* num_thread, int alternative_num_thread = 512) {
+  if (context.GetComputeCapability() == 53 ||
+      context.GetComputeCapability() == 62) {
+    *num_thread = alternative_num_thread;
+  }
+}
+#endif
 struct GpuLaunchConfig {
   dim3 theory_thread_count = dim3(1, 1, 1);
   dim3 thread_per_block = dim3(1, 1, 1);
@@ -61,15 +74,22 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(
   // Compute physical threads we need, should small than max sm threads
   const int physical_thread_count =
-      std::min(max_physical_threads, theory_thread_count);
+      (std::min)(max_physical_threads, theory_thread_count);
+  // Get compute_capability
+  const int capability = context.GetComputeCapability();
+#ifdef WITH_NV_JETSON
+  if (capability == 53 || capability == 62) {
+    max_threads = 512;
+  }
+#endif
   // Need get from device
   const int thread_per_block =
-      std::min(max_threads, context.GetMaxThreadsPerBlock());
+      (std::min)(max_threads, context.GetMaxThreadsPerBlock());
   const int block_count =
-      std::min(DivUp(physical_thread_count, thread_per_block), sm);
-  // Get compute_capability
-  const int capability = context.GetComputeCapability();
+      (std::min)(DivUp(physical_thread_count, thread_per_block), sm);
 GpuLaunchConfig config;
 config.theory_thread_count.x = theory_thread_count;
@@ -91,19 +111,20 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(
                         y_dim));
   const int kThreadsPerBlock = 256;
-  int block_cols = std::min(x_dim, kThreadsPerBlock);
-  int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
+  int block_cols = (std::min)(x_dim, kThreadsPerBlock);
+  int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1);
   int max_physical_threads = context.GetMaxPhysicalThreadCount();
-  const int max_blocks = std::max(max_physical_threads / kThreadsPerBlock, 1);
+  const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1);
   GpuLaunchConfig config;
   // Noticed, block size is not align to 32, if needed do it yourself.
   config.theory_thread_count = dim3(x_dim, y_dim, 1);
   config.thread_per_block = dim3(block_cols, block_rows, 1);
-  int grid_x = std::min(DivUp(x_dim, block_cols), max_blocks);
-  int grid_y = std::min(max_blocks / grid_x, std::max(y_dim / block_rows, 1));
+  int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
+  int grid_y =
+      (std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1));
   config.block_per_grid = dim3(grid_x, grid_y, 1);
   return config;
......
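The (std::min)/(std::max) parenthesization above exists because windows.h defines function-like min/max macros unless NOMINMAX is set; wrapping the function name in parentheses prevents macro expansion while still calling the standard function:

#include <algorithm>

int smaller = (std::min)(3, 5);  // calls std::min even when a min(a, b) macro is in scope
int larger = (std::max)(3, 5);   // likewise for std::max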
@@ -753,7 +753,11 @@ endif()
 if (NOT WIN32)
     set_tests_properties(test_multiprocess_reader_exception PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
     set_tests_properties(test_layers PROPERTIES TIMEOUT 120)
+    if (WITH_NV_JETSON)
+        set_tests_properties(test_ir_memory_optimize_transformer PROPERTIES TIMEOUT 1200)
+    else ()
     set_tests_properties(test_ir_memory_optimize_transformer PROPERTIES TIMEOUT 120)
+    endif ()
 endif()
if (WITH_DISTRIBUTE AND NOT WIN32)
@@ -799,7 +803,11 @@ set_tests_properties(test_elementwise_div_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_regularizer_api PROPERTIES TIMEOUT 150)
 set_tests_properties(test_multiclass_nms_op PROPERTIES TIMEOUT 120)
 if(NOT WIN32)
+    if (WITH_NV_JETSON)
+        set_tests_properties(test_ir_memory_optimize_nlp PROPERTIES TIMEOUT 1200)
+    else ()
     set_tests_properties(test_ir_memory_optimize_nlp PROPERTIES TIMEOUT 120)
+    endif ()
 endif()
 set_tests_properties(test_add_reader_dependency PROPERTIES TIMEOUT 120)
 set_tests_properties(test_bilateral_slice_op PROPERTIES TIMEOUT 120)
@@ -822,12 +830,28 @@ else()
     set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 600)
     set_tests_properties(test_paddle_save_load PROPERTIES TIMEOUT 250)
 endif()
+if (WITH_NV_JETSON)
+    set_tests_properties(test_concat_op PROPERTIES TIMEOUT 1200)
+    set_tests_properties(test_conv3d_transpose_part2_op PROPERTIES TIMEOUT 1200)
+    set_tests_properties(test_conv3d_transpose_op PROPERTIES TIMEOUT 1200)
+    set_tests_properties(test_conv3d_op PROPERTIES TIMEOUT 1200)
+    set_tests_properties(test_norm_op PROPERTIES TIMEOUT 1200)
+    set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 1500)
+    set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 1500)
+else()
+    set_tests_properties(test_concat_op PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_conv3d_transpose_part2_op PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_conv3d_transpose_op PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_conv3d_op PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_norm_op PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150)
+    set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150)
+endif()
 set_tests_properties(test_imperative_selected_rows_to_lod_tensor PROPERTIES TIMEOUT 120)
 set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_parallel_ssa_graph_inference_feed_partial_data PROPERTIES TIMEOUT 120)
 set_tests_properties(test_parallel_executor_crf PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_save_load PROPERTIES TIMEOUT 120)
-set_tests_properties(test_concat_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_partial_eager_deletion_transformer PROPERTIES TIMEOUT 120)
 set_tests_properties(test_parallel_executor_seresnext_with_reduce_gpu PROPERTIES TIMEOUT 120)
 set_tests_properties(test_dropout_op PROPERTIES TIMEOUT 120)
@@ -851,8 +875,6 @@ set_tests_properties(test_parallel_executor_mnist PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_ptb_rnn PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_save_load_v2 PROPERTIES TIMEOUT 120)
 set_tests_properties(test_conv2d_transpose_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_conv3d_transpose_part2_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_conv3d_transpose_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_prroi_pool_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_multiprocess_dataloader_iterable_dataset_static PROPERTIES TIMEOUT 120)
 set_tests_properties(test_lstm_cudnn_op PROPERTIES TIMEOUT 120)
@@ -882,7 +904,6 @@ set_tests_properties(test_adam_optimizer_fp32_fp64 PROPERTIES TIMEOUT 120)
 set_tests_properties(test_elementwise_nn_grad PROPERTIES TIMEOUT 120)
 set_tests_properties(test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass PROPERTIES TIMEOUT 120)
 set_tests_properties(test_conv_nn_grad PROPERTIES TIMEOUT 120)
-set_tests_properties(test_conv3d_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_program_prune_backward PROPERTIES TIMEOUT 120)
 set_tests_properties(test_group_norm_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_optimizer PROPERTIES TIMEOUT 120)
@@ -902,13 +923,10 @@ set_tests_properties(test_elementwise_mul_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_cyclic_cifar_dataset PROPERTIES TIMEOUT 120)
 set_tests_properties(test_fuse_all_reduce_pass PROPERTIES TIMEOUT 120)
 set_tests_properties(test_dygraph_multi_forward PROPERTIES TIMEOUT 120)
-set_tests_properties(test_norm_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_ocr_attention_model PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_mnist PROPERTIES TIMEOUT 120)
 set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 270)
 set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200)
-set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150)
-set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150)
 set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150)
 set_tests_properties(test_imperative_resnet PROPERTIES TIMEOUT 200)
 set_tests_properties(test_imperative_resnet_sorted_gradient PROPERTIES TIMEOUT 200)
......
@@ -49,7 +49,11 @@ set_tests_properties(test_trt_activation_pass PROPERTIES TIMEOUT 120)
 set_tests_properties(test_trt_conv_pass PROPERTIES TIMEOUT 120)
 #set_tests_properties(test_trt_multiclass_nms_op PROPERTIES TIMEOUT 200)
 set_tests_properties(test_trt_dynamic_shape PROPERTIES TIMEOUT 120)
-set_tests_properties(test_trt_pool_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 45)
+if(WITH_NV_JETSON)
+    set_tests_properties(test_trt_pool_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 450)
+else()
+    set_tests_properties(test_trt_pool_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 45)
+endif()
 set_tests_properties(test_trt_reduce_mean_op PROPERTIES TIMEOUT 60)
 set_tests_properties(test_trt_tile_op PROPERTIES TIMEOUT 60)
 set_tests_properties(test_trt_fc_fuse_quant_dequant_pass PROPERTIES TIMEOUT 100)
......
@@ -1349,7 +1349,8 @@ class OpTest(unittest.TestCase):
         places = self._get_places()
         for place in places:
             res = self.check_output_with_place(place, atol, no_check_set,
-                                               equal_nan, check_dygraph)
+                                               equal_nan, check_dygraph,
+                                               inplace_atol)
             if check_dygraph:
                 outs, dygraph_outs, fetch_list = res
             else:
......