Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
860f6077
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
860f6077
编写于
9月 19, 2022
作者:
S
sneaxiy
提交者:
GitHub
9月 19, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix broadcast kernel (#46158)
上级
e468e93c
变更
12
隐藏空白更改
内联
并排
Showing
12 changed file
with
515 addition
and
46 deletion
+515
-46
paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
+6
-5
paddle/fluid/platform/device/gpu/rocm/rocm_helper.h
paddle/fluid/platform/device/gpu/rocm/rocm_helper.h
+2
-1
paddle/phi/backends/gpu/cuda/cuda_helper.h
paddle/phi/backends/gpu/cuda/cuda_helper.h
+6
-5
paddle/phi/backends/gpu/rocm/rocm_helper.h
paddle/phi/backends/gpu/rocm/rocm_helper.h
+2
-1
paddle/phi/kernels/funcs/broadcast_function.h
paddle/phi/kernels/funcs/broadcast_function.h
+448
-0
paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
+1
-1
tools/dockerfile/Dockerfile.release16
tools/dockerfile/Dockerfile.release16
+6
-8
tools/dockerfile/Dockerfile.release18
tools/dockerfile/Dockerfile.release18
+4
-8
tools/dockerfile/Dockerfile.ubuntu
tools/dockerfile/Dockerfile.ubuntu
+6
-8
tools/dockerfile/Dockerfile.ubuntu18
tools/dockerfile/Dockerfile.ubuntu18
+4
-8
tools/dockerfile/build_scripts/build.sh
tools/dockerfile/build_scripts/build.sh
+1
-1
tools/dockerfile/build_scripts/install_patchelf.sh
tools/dockerfile/build_scripts/install_patchelf.sh
+29
-0
未找到文件。
paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
浏览文件 @
860f6077
...
@@ -70,11 +70,12 @@ namespace platform {
...
@@ -70,11 +70,12 @@ namespace platform {
*
*
*/
*/
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = \
int64_t __index__ = \
static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
for (index_type i = __index__; __index__ < (num); \
int64_t __stride__ = static_cast<int64_t>(blockDim.x) * gridDim.x; \
__index__ += blockDim.x * gridDim.x, i = __index__)
for (index_type i = __index__; __index__ < (num); \
__index__ += __stride__, i = __index__)
class
CublasHandleHolder
{
class
CublasHandleHolder
{
public:
public:
...
...
paddle/fluid/platform/device/gpu/rocm/rocm_helper.h
浏览文件 @
860f6077
...
@@ -70,8 +70,9 @@ namespace platform {
...
@@ -70,8 +70,9 @@ namespace platform {
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = \
int64_t __index__ = \
static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \
static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \
int64_t __stride__ = static_cast<int64_t>(hipBlockDim_x) * hipGridDim_x; \
for (index_type i = __index__; __index__ < (num); \
for (index_type i = __index__; __index__ < (num); \
__index__ +=
hipBlockDim_x * hipGridDim_x
, i = __index__)
__index__ +=
__stride__
, i = __index__)
class
CublasHandleHolder
{
class
CublasHandleHolder
{
public:
public:
...
...
paddle/phi/backends/gpu/cuda/cuda_helper.h
浏览文件 @
860f6077
...
@@ -62,11 +62,12 @@ namespace gpu {
...
@@ -62,11 +62,12 @@ namespace gpu {
*
*
*/
*/
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = \
int64_t __index__ = \
static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
for (index_type i = __index__; __index__ < (num); \
int64_t __stride__ = static_cast<int64_t>(blockDim.x) * gridDim.x; \
__index__ += blockDim.x * gridDim.x, i = __index__)
for (index_type i = __index__; __index__ < (num); \
__index__ += __stride__, i = __index__)
}
// namespace gpu
}
// namespace gpu
}
// namespace backends
}
// namespace backends
...
...
paddle/phi/backends/gpu/rocm/rocm_helper.h
浏览文件 @
860f6077
...
@@ -65,8 +65,9 @@ namespace gpu {
...
@@ -65,8 +65,9 @@ namespace gpu {
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = \
int64_t __index__ = \
static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \
static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \
int64_t __stride__ = static_cast<int64_t>(hipBlockDim_x) * hipGridDim_x; \
for (index_type i = __index__; __index__ < (num); \
for (index_type i = __index__; __index__ < (num); \
__index__ +=
hipBlockDim_x * hipGridDim_x
, i = __index__)
__index__ +=
__stride__
, i = __index__)
}
// namespace gpu
}
// namespace gpu
}
// namespace backends
}
// namespace backends
...
...
paddle/phi/kernels/funcs/broadcast_function.h
浏览文件 @
860f6077
...
@@ -468,6 +468,397 @@ void LaunchBroadcastKernel(
...
@@ -468,6 +468,397 @@ void LaunchBroadcastKernel(
func
);
func
);
}
}
#ifndef PADDLE_WITH_XPU_KP
/**
 * Map a flat element offset expressed in one layout onto the matching flat
 * offset in a second layout of the same rank.
 *
 * The offset is decomposed dimension by dimension using `src_strides[k + 1]`
 * as the step of dimension `k`, and re-accumulated using
 * `dst_strides[k + 1]`. A dimension whose two neighbouring destination
 * strides are equal (which is what an extent of 1 produces when the strides
 * are a cumulative product of the shape, as ShapeToStride in this file
 * builds them) contributes nothing to the result.
 *
 * @param src_idx      Flat offset in the source layout.
 * @param src_strides  Source strides; entries [1, rank] are read.
 * @param dst_strides  Destination strides; entries [0, rank] are read.
 * @param rank         Number of dimensions to walk.
 * @return The flat offset in the destination layout.
 */
HOSTDEVICE static int64_t ConvertSrcIdxToDstIdx(
    int64_t src_idx,
    const phi::Array<int64_t, phi::DDim::kMaxRank + 1> &src_strides,
    const phi::Array<int64_t, phi::DDim::kMaxRank + 1> &dst_strides,
    int rank) {
  int64_t dst_idx = 0;
  for (int k = 0; k < rank; ++k) {
    // Coordinate along dimension k, then strip it from the running offset.
    auto local_idx = src_idx / src_strides[k + 1];
    src_idx -= local_idx * src_strides[k + 1];

    // Equal adjacent destination strides: this dimension does not advance
    // the destination offset, so its coordinate is dropped.
    if (dst_strides[k] != dst_strides[k + 1]) {
      dst_idx += local_idx * dst_strides[k + 1];
    }
  }
  return dst_idx;
}
/**
 * Fill `*out` with elements of `in` addressed through 64-bit offsets.
 *
 * Three cases, selected by `IsBoundary` and `need_broadcast`:
 *  - IsBoundary:            the first `n` slots are filled one by one, each
 *                           offset `idx + i` being translated with
 *                           ConvertSrcIdxToDstIdx.
 *  - !IsBoundary, no bcast: a single `phi::Load` from `in + idx`.
 *  - !IsBoundary, bcast:    all `VecSize` slots are filled one by one with
 *                           translated offsets (`n` is not consulted).
 *
 * @tparam T           Element type.
 * @tparam VecSize     Number of slots in the destination vector.
 * @tparam IsBoundary  Whether only the first `n` slots are to be read.
 * @param in             Base pointer of the input.
 * @param idx            Offset of the first element, in the `src_strides`
 *                       layout.
 * @param need_broadcast Whether offsets must be translated (ignored when
 *                       IsBoundary is true, where translation always runs).
 * @param src_strides    Strides of the layout `idx` is expressed in.
 * @param dst_strides    Strides of the layout `in` is stored in.
 * @param rank           Number of dimensions described by the strides.
 * @param n              Number of slots to read in the boundary case.
 * @param out            Destination vector.
 */
template <typename T, int VecSize, bool IsBoundary>
HOSTDEVICE static void ReadVecDataWithInt64Index(
    const T *in,
    int64_t idx,
    bool need_broadcast,
    const phi::Array<int64_t, phi::DDim::kMaxRank + 1> &src_strides,
    const phi::Array<int64_t, phi::DDim::kMaxRank + 1> &dst_strides,
    int rank,
    int n,
    phi::AlignedVector<T, VecSize> *out) {
  if (IsBoundary) {
    // Partial vector: touch only the first n slots.
    for (int lane = 0; lane < n; ++lane) {
      const auto mapped =
          ConvertSrcIdxToDstIdx(idx + lane, src_strides, dst_strides, rank);
      (*out)[lane] = in[mapped];
    }
    return;
  }

  if (!need_broadcast) {
    // Layouts agree: read the whole vector in one go.
    phi::Load<T, VecSize>(in + idx, out);
    return;
  }

#pragma unroll
  for (int lane = 0; lane < VecSize; ++lane) {
    const auto mapped =
        ConvertSrcIdxToDstIdx(idx + lane, src_strides, dst_strides, rank);
    (*out)[lane] = in[mapped];
  }
}
/**
 * Invokes `functor` on slot `i` of each of `NumIns` input vectors and casts
 * the result to `OutT`.
 *
 * The primary template only declares `Run`; the definitions live in the
 * specializations for NumIns = 0, 1, 2 and 3 below. `ins_vec` points at the
 * first of `NumIns` consecutive vectors.
 */
template <typename InT,
          typename OutT,
          typename Functor,
          int VecSize,
          int NumIns>
struct ApplyFunctorWithInt64IndexHelper {
  HOSTDEVICE static OutT Run(const phi::AlignedVector<InT, VecSize> *ins_vec,
                             Functor functor,
                             int i);
};

/// No inputs: the functor is called without arguments; `ins_vec` and `i`
/// are not used.
template <typename InT, typename OutT, typename Functor, int VecSize>
struct ApplyFunctorWithInt64IndexHelper<InT, OutT, Functor, VecSize, 0> {
  HOSTDEVICE static OutT Run(const phi::AlignedVector<InT, VecSize> *ins_vec,
                             Functor functor,
                             int i) {
    return static_cast<OutT>(functor());
  }
};

/// One input: functor(a[i]).
template <typename InT, typename OutT, typename Functor, int VecSize>
struct ApplyFunctorWithInt64IndexHelper<InT, OutT, Functor, VecSize, 1> {
  HOSTDEVICE static OutT Run(const phi::AlignedVector<InT, VecSize> *ins_vec,
                             Functor functor,
                             int i) {
    const auto &a = ins_vec[0];
    return static_cast<OutT>(functor(a[i]));
  }
};

/// Two inputs: functor(a[i], b[i]).
template <typename InT, typename OutT, typename Functor, int VecSize>
struct ApplyFunctorWithInt64IndexHelper<InT, OutT, Functor, VecSize, 2> {
  HOSTDEVICE static OutT Run(const phi::AlignedVector<InT, VecSize> *ins_vec,
                             Functor functor,
                             int i) {
    const auto &a = ins_vec[0];
    const auto &b = ins_vec[1];
    return static_cast<OutT>(functor(a[i], b[i]));
  }
};

/// Three inputs: functor(a[i], b[i], c[i]).
template <typename InT, typename OutT, typename Functor, int VecSize>
struct ApplyFunctorWithInt64IndexHelper<InT, OutT, Functor, VecSize, 3> {
  HOSTDEVICE static OutT Run(const phi::AlignedVector<InT, VecSize> *ins_vec,
                             Functor functor,
                             int i) {
    const auto &a = ins_vec[0];
    const auto &b = ins_vec[1];
    const auto &c = ins_vec[2];
    return static_cast<OutT>(functor(a[i], b[i], c[i]));
  }
};
/// Compile-time helper: `kValue` is N when N is at least 1, and 1 otherwise.
/// Used in this file as an array extent so that an array parameterised on a
/// count of 0 still has one element.
template <int N>
struct MaxWithOne {
  static constexpr auto kValue = (N < 1 ? 1 : N);
};
/**
 * Elementwise kernel with broadcasting that keeps every element offset in
 * 64-bit arithmetic.
 *
 * Each thread starts at `(blockIdx.x * blockDim.x + threadIdx.x) * VecSize`
 * and advances by `blockDim.x * gridDim.x * VecSize`, handling `VecSize`
 * output elements per step. Complete vectors are written with `phi::Store`;
 * a trailing partial vector (fewer than `VecSize` elements left) is written
 * element by element.
 *
 * @param ins             Input base pointers (first `NumIns` entries used).
 * @param out             Output base pointer.
 * @param ins_strides     Per-input strides passed to the reader as the
 *                        layout the input is stored in.
 * @param out_strides     Output strides; `out_strides[0]` is taken as the
 *                        total number of output elements.
 * @param need_broadcasts Per-input flag forwarded to the reader.
 * @param rank            Number of dimensions described by the strides.
 * @param functor         Elementwise operation.
 */
template <typename InT,
          typename OutT,
          typename Functor,
          int VecSize,
          int NumIns>
__global__ void BroadcastKernelWithInt64Index(
    phi::Array<const InT *, MaxWithOne<NumIns>::kValue> ins,
    OutT *out,
    phi::Array<phi::Array<int64_t, phi::DDim::kMaxRank + 1>,
               MaxWithOne<NumIns>::kValue> ins_strides,
    phi::Array<int64_t, phi::DDim::kMaxRank + 1> out_strides,
    phi::Array<bool, MaxWithOne<NumIns>::kValue> need_broadcasts,
    int rank,
    Functor functor) {
  using Applier =
      ApplyFunctorWithInt64IndexHelper<InT, OutT, Functor, VecSize, NumIns>;

  const int64_t total = out_strides[0];
  int64_t offset =
      (static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x) * VecSize;
  const int64_t step = static_cast<int64_t>(blockDim.x) * gridDim.x * VecSize;
  // Largest offset at which a complete vector still fits.
  const int64_t last_full = total - VecSize;

  phi::Array<phi::AlignedVector<InT, VecSize>, MaxWithOne<NumIns>::kValue>
      ins_vec;
  phi::AlignedVector<OutT, VecSize> out_vec;

  // Main part: complete vectors only.
  while (offset <= last_full) {
#pragma unroll
    for (int in_id = 0; in_id < NumIns; ++in_id) {
      ReadVecDataWithInt64Index<InT, VecSize, false>(ins[in_id],
                                                     offset,
                                                     need_broadcasts[in_id],
                                                     out_strides,
                                                     ins_strides[in_id],
                                                     rank,
                                                     VecSize,
                                                     &ins_vec[in_id]);
    }

#pragma unroll
    for (int lane = 0; lane < VecSize; ++lane) {
      out_vec[lane] = Applier::Run(ins_vec.Get(), functor, lane);
    }

    phi::Store<OutT, VecSize>(out_vec, out + offset);
    offset += step;
  }

  // Nothing left for this thread.
  if (offset >= total) {
    return;
  }

  // Tail: the loop above exited with offset > total - VecSize, so fewer
  // than VecSize elements remain and the count fits in an int.
  int remain = total - offset;
#pragma unroll
  for (int in_id = 0; in_id < NumIns; ++in_id) {
    ReadVecDataWithInt64Index<InT, VecSize, true>(ins[in_id],
                                                  offset,
                                                  need_broadcasts[in_id],
                                                  out_strides,
                                                  ins_strides[in_id],
                                                  rank,
                                                  remain,
                                                  &ins_vec[in_id]);
  }

  for (int lane = 0; lane < remain; ++lane) {
    out[offset + lane] = Applier::Run(ins_vec.Get(), functor, lane);
  }
}
/**
 * Host-side launcher for BroadcastKernelWithInt64Index, selected on the
 * number of outputs.
 *
 * This primary template is the fallback for every `NumOuts` that has no
 * dedicated specialization: `Run` does no work and unconditionally raises a
 * PermissionDenied error.
 */
template <typename InT,
          typename OutT,
          typename Functor,
          int Arity,
          int NumOuts,
          int VecSize>
struct LaunchBroadcastKernelWithInt64IndexHelper {
  static void Run(const KPDevice &ctx,
                  const std::vector<const DenseTensor *> &ins,
                  std::vector<DenseTensor *> *outs,
                  int axis,
                  Functor functor) {
    PADDLE_THROW(phi::errors::PermissionDenied(
        "Unreachable code branch. This may be a bug."));
  }
};
/**
 * Single-output launcher for BroadcastKernelWithInt64Index.
 *
 * `Run` gathers the input pointers, allocates the output, derives a common
 * rank plus per-input expanded shapes, converts the shapes to strides, and
 * launches the kernel on `ctx.stream()`.
 */
template <typename InT, typename OutT, typename Functor, int Arity, int VecSize>
struct LaunchBroadcastKernelWithInt64IndexHelper<InT,
                                                 OutT,
                                                 Functor,
                                                 Arity,
                                                 /*NumOuts=*/1,
                                                 VecSize> {
  /**
   * @param ctx     Device context used for allocation, launch configuration
   *                and the stream.
   * @param ins     Input tensors; the first `Arity` entries are read.
   * @param outs    Output tensors; only `(*outs)[0]` is used.
   * @param axis    Offset at which the lower-rank operand is aligned against
   *                the higher-rank one; must be >= 0 whenever
   *                CalculateBroadcastDims runs (Arity >= 2).
   * @param functor Elementwise operation forwarded to the kernel.
   */
  static void Run(const KPDevice &ctx,
                  const std::vector<const DenseTensor *> &ins,
                  std::vector<DenseTensor *> *outs,
                  int axis,
                  Functor functor) {
    phi::Array<const InT *, MaxWithOne<Arity>::kValue> ins_ptrs;
    for (int i = 0; i < Arity; ++i) {
      ins_ptrs[i] = ins[i]->data<InT>();
    }

    auto *out_tensor = (*outs)[0];
    auto *out_ptr = ctx.Alloc<OutT>(out_tensor);
    const auto &out_dims = out_tensor->dims();

    phi::Array<phi::Array<int64_t, phi::DDim::kMaxRank>,
               MaxWithOne<Arity>::kValue>
        ins_expand_dims;
    phi::Array<int64_t, phi::DDim::kMaxRank> broadcast_out_dims;

    // With no inputs (Arity == 0) neither branch below assigns the rank, so
    // default to the output tensor's own rank rather than reading an
    // indeterminate value further down.
    int rank = out_dims.size();
    if (Arity == 1) {
      rank = ins[0]->dims().size();
      for (int i = 0; i < rank; ++i) {
        broadcast_out_dims[i] = ins[0]->dims()[i];
      }
      ins_expand_dims[0] = broadcast_out_dims;
    } else if (Arity >= 2) {
      CalculateBroadcastDims(ins[0]->dims().Get(),
                             ins[1]->dims().Get(),
                             ins[0]->dims().size(),
                             ins[1]->dims().size(),
                             axis,
                             ins_expand_dims[0].GetMutable(),
                             ins_expand_dims[1].GetMutable(),
                             broadcast_out_dims.GetMutable(),
                             &rank);
      // Fold each further input into the shape accumulated so far.
      for (int i = 2; i < Arity; ++i) {
        auto tmp_dims = broadcast_out_dims;
        phi::Array<int64_t, phi::DDim::kMaxRank> tmp_expand_dims;
        int tmp_rank;
        PADDLE_ENFORCE_GE(rank,
                          ins[i]->dims().size(),
                          phi::errors::InvalidArgument(
                              "Unsupported reverse broadcast when the input "
                              "tensor number is larger than 2."));
        CalculateBroadcastDims(tmp_dims.Get(),
                               ins[i]->dims().Get(),
                               rank,
                               ins[i]->dims().size(),
                               axis,
                               tmp_expand_dims.GetMutable(),
                               ins_expand_dims[i].GetMutable(),
                               broadcast_out_dims.GetMutable(),
                               &tmp_rank);
        PADDLE_ENFORCE_EQ(rank,
                          tmp_rank,
                          phi::errors::InvalidArgument(
                              "Wrong broadcast algorithm. This may be a bug."));
      }
    }

    phi::Array<phi::Array<int64_t, phi::DDim::kMaxRank + 1>,
               MaxWithOne<Arity>::kValue>
        ins_strides;
    phi::Array<bool, MaxWithOne<Arity>::kValue> need_broadcasts;
    phi::Array<int64_t, phi::DDim::kMaxRank + 1> out_strides;

    // Use the output tensor's own dims when it has at least `rank` of them;
    // otherwise fall back to the computed broadcast shape.
    if (rank <= out_dims.size()) {
      out_strides = ShapeToStride(out_dims.Get(), rank);
    } else {
      out_strides = ShapeToStride(broadcast_out_dims.Get(), rank);
    }

    for (int i = 0; i < Arity; ++i) {
      ins_strides[i] = ShapeToStride(ins_expand_dims[i].Get(), rank);
      // An input needs index translation unless all rank + 1 stride entries
      // match the output's.
      need_broadcasts[i] =
          !IsSameShape(out_strides.Get(), ins_strides[i].Get(), rank + 1);
    }

    // strides[0] is the product of all extents, i.e. the element count.
    int64_t numel = out_strides[0];
    auto gpu_config =
        phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize);

    BroadcastKernelWithInt64Index<InT, OutT, Functor, VecSize, Arity>
        <<<gpu_config.block_per_grid,
           gpu_config.thread_per_block,
           0,
           ctx.stream()>>>(ins_ptrs,
                           out_ptr,
                           ins_strides,
                           out_strides,
                           need_broadcasts,
                           rank,
                           functor);
  }

 private:
  /**
   * Compute the broadcast shape of two operands and each operand's shape
   * expanded to the common rank.
   *
   * The operand with fewer dimensions is aligned so that its dimension 0
   * sits at position `axis` of the other; positions it does not cover get
   * extent 1 in its expanded shape. When `ny > nx` the roles are swapped by
   * a recursive call.
   *
   * @param x_dims,y_dims       Input shapes.
   * @param nx,ny               Their ranks.
   * @param axis                Alignment offset; must be >= 0.
   * @param x_out_dims          Receives x's expanded shape.
   * @param y_out_dims          Receives y's expanded shape.
   * @param broadcast_out_dims  Receives the broadcast shape.
   * @param length              Receives the common rank (max(nx, ny)).
   */
  static void CalculateBroadcastDims(const int64_t *x_dims,
                                     const int64_t *y_dims,
                                     int nx,
                                     int ny,
                                     int axis,
                                     int64_t *x_out_dims,
                                     int64_t *y_out_dims,
                                     int64_t *broadcast_out_dims,
                                     int *length) {
    PADDLE_ENFORCE_GE(
        axis,
        0,
        phi::errors::InvalidArgument("Invalid axis value: %d", axis));
    if (nx == ny) {
      *length = nx;
      for (int i = 0; i < nx; ++i) {
        if (x_dims[i] != y_dims[i]) {
          PADDLE_ENFORCE_EQ(
              x_dims[i] == 1 || y_dims[i] == 1,
              true,
              phi::errors::InvalidArgument(
                  "Cannot broadcast input shape where "
                  "x_dims[%d] = %d, y_dims[%d] = %d.",
                  i,
                  x_dims[i],
                  i,
                  y_dims[i]));
        }
        broadcast_out_dims[i] = std::max(x_dims[i], y_dims[i]);
        x_out_dims[i] = x_dims[i];
        y_out_dims[i] = y_dims[i];
      }
    } else if (nx > ny) {
      *length = nx;
      // Dimensions of y that would extend past the end of x once shifted by
      // `axis` must all be 1.
      for (int i = nx - axis; i < ny; ++i) {
        PADDLE_ENFORCE_EQ(
            y_dims[i],
            1,
            phi::errors::InvalidArgument(
                "The trailing Y.shape[%d] should be 1 but got %d.",
                i,
                y_dims[i]));
      }

      for (int i = 0; i < nx; ++i) {
        if (i >= axis && i - axis < ny) {
          // Position covered by both operands.
          if (x_dims[i] != y_dims[i - axis]) {
            PADDLE_ENFORCE_EQ(
                x_dims[i] == 1 || y_dims[i - axis] == 1,
                true,
                phi::errors::InvalidArgument(
                    "Cannot broadcast input shape where "
                    "x_dims[%d] = %d, y_dims[%d] = %d.",
                    i,
                    x_dims[i],
                    i - axis,
                    y_dims[i - axis]));
          }
          broadcast_out_dims[i] = std::max(x_dims[i], y_dims[i - axis]);
          x_out_dims[i] = x_dims[i];
          y_out_dims[i] = y_dims[i - axis];
        } else {
          // Position covered by x only: y is padded with extent 1.
          broadcast_out_dims[i] = x_dims[i];
          x_out_dims[i] = x_dims[i];
          y_out_dims[i] = 1;
        }
      }
    } else {
      CalculateBroadcastDims(y_dims,
                             x_dims,
                             ny,
                             nx,
                             axis,
                             y_out_dims,
                             x_out_dims,
                             broadcast_out_dims,
                             length);
    }
  }

  /// Returns true when the first `rank` entries of `x` and `y` are equal.
  static bool IsSameShape(const int64_t *x, const int64_t *y, int rank) {
    for (int i = 0; i < rank; ++i) {
      if (x[i] != y[i]) return false;
    }
    return true;
  }

  /**
   * Build cumulative-product strides from a shape: `strides[rank] = 1` and
   * `strides[i] = strides[i + 1] * arr[i]`, so `strides[0]` is the product
   * of all `rank` extents. Entries past `rank` are left unset.
   */
  static phi::Array<int64_t, phi::DDim::kMaxRank + 1> ShapeToStride(
      const int64_t *arr, int rank) {
    phi::Array<int64_t, phi::DDim::kMaxRank + 1> strides;
    strides[rank] = 1;
    for (int i = rank - 1; i >= 0; --i) {
      strides[i] = strides[i + 1] * arr[i];
    }
    return strides;
  }
};
#endif
template
<
ElementwiseType
ET
,
template
<
ElementwiseType
ET
,
typename
InT
,
typename
InT
,
typename
OutT
,
typename
OutT
,
...
@@ -509,6 +900,63 @@ void BroadcastKernelForDifferentVecSize(
...
@@ -509,6 +900,63 @@ void BroadcastKernelForDifferentVecSize(
outs
->
size
(),
outs
->
size
(),
NumOuts
));
NumOuts
));
#ifndef PADDLE_WITH_XPU_KP
constexpr
bool
kEnabledInt64IndexKernel
=
(
NumOuts
==
1
&&
kArity
<=
3
);
bool
use_int64_index_kernel
=
kEnabledInt64IndexKernel
&&
(
*
outs
)[
0
]
->
numel
()
>=
std
::
numeric_limits
<
int32_t
>::
max
();
if
(
use_int64_index_kernel
)
{
int
vec_size
=
GetVecsize
<
InT
,
OutT
>
(
ins
,
outs
);
switch
(
vec_size
)
{
case
VecSizeL
:
{
LaunchBroadcastKernelWithInt64IndexHelper
<
InT
,
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeL
>::
Run
(
ctx
,
ins
,
outs
,
axis
,
func
);
break
;
}
case
VecSizeM
:
{
LaunchBroadcastKernelWithInt64IndexHelper
<
InT
,
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeM
>::
Run
(
ctx
,
ins
,
outs
,
axis
,
func
);
break
;
}
case
VecSizeS
:
{
LaunchBroadcastKernelWithInt64IndexHelper
<
InT
,
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeS
>::
Run
(
ctx
,
ins
,
outs
,
axis
,
func
);
break
;
}
default:
{
PADDLE_THROW
(
phi
::
errors
::
Unimplemented
(
"Unsupported vectorized size: %d!"
,
vec_size
));
break
;
}
}
return
;
}
#endif
// mergedim and get vec_size
// mergedim and get vec_size
const
auto
merge_dims
=
DimensionsTransform
(
ins
,
(
*
outs
)[
0
]
->
dims
(),
axis
);
const
auto
merge_dims
=
DimensionsTransform
(
ins
,
(
*
outs
)[
0
]
->
dims
(),
axis
);
phi
::
Array
<
kps
::
details
::
BroadcastConfig
,
kArity
>
configs
;
phi
::
Array
<
kps
::
details
::
BroadcastConfig
,
kArity
>
configs
;
...
...
paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
浏览文件 @
860f6077
...
@@ -92,7 +92,7 @@ struct BinaryOperation {
...
@@ -92,7 +92,7 @@ struct BinaryOperation {
std
::
vector
<
DenseTensor
*>
outs
{
output
};
std
::
vector
<
DenseTensor
*>
outs
{
output
};
paddle
::
operators
::
paddle
::
operators
::
LaunchElementwiseCudaKernel
<
ElementwiseType
::
kBinary
,
T
,
T
>
(
LaunchElementwiseCudaKernel
<
ElementwiseType
::
kBinary
,
T
,
T
>
(
dev_ctx
,
ins
,
&
outs
,
-
1
,
BinaryFunctor
<
T
>
());
dev_ctx
,
ins
,
&
outs
,
0
,
BinaryFunctor
<
T
>
());
}
}
};
};
...
...
tools/dockerfile/Dockerfile.release16
浏览文件 @
860f6077
...
@@ -101,8 +101,13 @@ RUN curl -s -q https://glide.sh/get | sh
...
@@ -101,8 +101,13 @@ RUN curl -s -q https://glide.sh/get | sh
# Downgrade TensorRT
# Downgrade TensorRT
COPY tools/dockerfile/build_scripts /build_scripts
COPY tools/dockerfile/build_scripts /build_scripts
RUN bash /build_scripts/install_nccl2.sh
RUN bash /build_scripts/install_nccl2.sh
RUN rm -rf /build_scripts
# Older versions of patchelf limited the size of the files they could process; this was fixed in the commit below.
# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
# So install a newer version here.
RUN bash /build_scripts/install_patchelf.sh
RUN rm -rf /build_scripts
# git credential to skip password typing
# git credential to skip password typing
RUN git config --global credential.helper store
RUN git config --global credential.helper store
...
@@ -143,13 +148,6 @@ RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/
...
@@ -143,13 +148,6 @@ RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/
RUN apt-get install libprotobuf-dev -y
RUN apt-get install libprotobuf-dev -y
# Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
# So install a newer version here.
RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \
dpkg -i patchelf_0.10-2_amd64.deb && \
rm -rf patchelf_0.10-2_amd64.deb
# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
CMD source ~/.bashrc
CMD source ~/.bashrc
...
...
tools/dockerfile/Dockerfile.release18
浏览文件 @
860f6077
...
@@ -28,6 +28,10 @@ RUN apt-get update && \
...
@@ -28,6 +28,10 @@ RUN apt-get update && \
# Downgrade gcc&&g++
# Downgrade gcc&&g++
WORKDIR /usr/bin
WORKDIR /usr/bin
COPY tools/dockerfile/build_scripts /build_scripts
COPY tools/dockerfile/build_scripts /build_scripts
# Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
# So install a newer version here.
RUN bash /build_scripts/install_patchelf.sh
RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts
RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts
RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++
RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++
RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc
RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc
...
@@ -99,14 +103,6 @@ RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
...
@@ -99,14 +103,6 @@ RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
COPY ./python/requirements.txt /root/
COPY ./python/requirements.txt /root/
RUN pip3.7 --no-cache-dir install -r /root/requirements.txt
RUN pip3.7 --no-cache-dir install -r /root/requirements.txt
# Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
# So install a newer version here.
RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \
dpkg -i patchelf_0.10-2_amd64.deb && \
rm -rf patchelf_0.10-2_amd64.deb
# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
#RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
#RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
#CMD source ~/.bashrc
#CMD source ~/.bashrc
...
...
tools/dockerfile/Dockerfile.ubuntu
浏览文件 @
860f6077
...
@@ -143,9 +143,14 @@ RUN curl -s -q https://glide.sh/get | sh
...
@@ -143,9 +143,14 @@ RUN curl -s -q https://glide.sh/get | sh
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
# Downgrade TensorRT
# Downgrade TensorRT
# Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
# So install a newer version here.
COPY tools/dockerfile/build_scripts /build_scripts
COPY tools/dockerfile/build_scripts /build_scripts
RUN bash /build_scripts/install_trt.sh && \
RUN bash /build_scripts/install_trt.sh && \
bash /build_scripts/install_nccl2.sh
bash /build_scripts/install_nccl2.sh && \
bash /build_scripts/install_patchelf.sh
RUN rm -rf /build_scripts
RUN rm -rf /build_scripts
# git credential to skip password typing
# git credential to skip password typing
...
@@ -236,13 +241,6 @@ RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/
...
@@ -236,13 +241,6 @@ RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/
RUN apt-get install libprotobuf-dev -y
RUN apt-get install libprotobuf-dev -y
# Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
# So install a newer version here.
RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \
dpkg -i patchelf_0.10-2_amd64.deb && \
rm -rf patchelf_0.10-2_amd64.deb
# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
CMD source ~/.bashrc
CMD source ~/.bashrc
...
...
tools/dockerfile/Dockerfile.ubuntu18
浏览文件 @
860f6077
...
@@ -35,6 +35,10 @@ RUN apt-get update --allow-unauthenticated && \
...
@@ -35,6 +35,10 @@ RUN apt-get update --allow-unauthenticated && \
WORKDIR /usr/bin
WORKDIR /usr/bin
COPY tools/dockerfile/build_scripts /build_scripts
COPY tools/dockerfile/build_scripts /build_scripts
RUN bash /build_scripts/install_trt.sh
RUN bash /build_scripts/install_trt.sh
# Older versions of patchelf limited the size of the files they could process; this was fixed in the commit below.
# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
# So install a newer version here.
RUN bash /build_scripts/install_patchelf.sh
RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts
RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts
RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++
RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++
RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc
RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc
...
@@ -151,14 +155,6 @@ RUN pip3.6 --no-cache-dir install -r /root/requirements.txt && \
...
@@ -151,14 +155,6 @@ RUN pip3.6 --no-cache-dir install -r /root/requirements.txt && \
pip3.8 --no-cache-dir install -r /root/requirements.txt && \
pip3.8 --no-cache-dir install -r /root/requirements.txt && \
pip3.9 --no-cache-dir install -r /root/requirements.txt
pip3.9 --no-cache-dir install -r /root/requirements.txt
# Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
# So install a newer version here.
RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \
dpkg -i patchelf_0.10-2_amd64.deb && \
rm -rf patchelf_0.10-2_amd64.deb
# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
#RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
#RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
#CMD source ~/.bashrc
#CMD source ~/.bashrc
...
...
tools/dockerfile/build_scripts/build.sh
浏览文件 @
860f6077
...
@@ -106,7 +106,7 @@ export SSL_CERT_FILE=/opt/_internal/certs.pem
...
@@ -106,7 +106,7 @@ export SSL_CERT_FILE=/opt/_internal/certs.pem
# tar -xzf patchelf-0.9njs2.tar.gz
# tar -xzf patchelf-0.9njs2.tar.gz
# (cd patchelf-0.9njs2 && ./configure && make && make install)
# (cd patchelf-0.9njs2 && ./configure && make && make install)
# rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2
# rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2
yum
install
-y
patchelf
sh
"
$MY_DIR
/install_patchelf.sh"
# Install latest pypi release of auditwheel
# Install latest pypi release of auditwheel
#LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel
#LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel
...
...
tools/dockerfile/build_scripts/install_patchelf.sh
0 → 100644
浏览文件 @
860f6077
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Build patchelf 0.15.0 from source and install it with `make install`.
# Any failing command aborts the script.
set -e

# Scratch checkout directory, relative to the directory the script runs in.
TMP_DIR=patchelf_tmp

# Remove any leftover from a previous run before cloning.
rm -rf "$TMP_DIR"

git clone -b 0.15.0 https://github.com/NixOS/patchelf "$TMP_DIR"

# Build inside a subshell so the working directory of this script is
# unchanged once the build is done.
(
  cd "$TMP_DIR"
  ./bootstrap.sh
  ./configure
  make
  make install
)

# Drop the checkout once patchelf is installed.
rm -rf "$TMP_DIR"
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录