Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
860f6077
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
860f6077
编写于
9月 19, 2022
作者:
S
sneaxiy
提交者:
GitHub
9月 19, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix broadcast kernel (#46158)
上级
e468e93c
变更
12
隐藏空白更改
内联
并排
Showing
12 changed file
with
515 addition
and
46 deletion
+515
-46
paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
+6
-5
paddle/fluid/platform/device/gpu/rocm/rocm_helper.h
paddle/fluid/platform/device/gpu/rocm/rocm_helper.h
+2
-1
paddle/phi/backends/gpu/cuda/cuda_helper.h
paddle/phi/backends/gpu/cuda/cuda_helper.h
+6
-5
paddle/phi/backends/gpu/rocm/rocm_helper.h
paddle/phi/backends/gpu/rocm/rocm_helper.h
+2
-1
paddle/phi/kernels/funcs/broadcast_function.h
paddle/phi/kernels/funcs/broadcast_function.h
+448
-0
paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
+1
-1
tools/dockerfile/Dockerfile.release16
tools/dockerfile/Dockerfile.release16
+6
-8
tools/dockerfile/Dockerfile.release18
tools/dockerfile/Dockerfile.release18
+4
-8
tools/dockerfile/Dockerfile.ubuntu
tools/dockerfile/Dockerfile.ubuntu
+6
-8
tools/dockerfile/Dockerfile.ubuntu18
tools/dockerfile/Dockerfile.ubuntu18
+4
-8
tools/dockerfile/build_scripts/build.sh
tools/dockerfile/build_scripts/build.sh
+1
-1
tools/dockerfile/build_scripts/install_patchelf.sh
tools/dockerfile/build_scripts/install_patchelf.sh
+29
-0
未找到文件。
paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
浏览文件 @
860f6077
...
...
@@ -70,11 +70,12 @@ namespace platform {
*
*/
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = \
static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
for (index_type i = __index__; __index__ < (num); \
__index__ += blockDim.x * gridDim.x, i = __index__)
// Loop header macro for device code: each thread starts at its global linear
// thread id (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total
// number of threads in the grid (blockDim.x * gridDim.x) until `num` is
// reached.
//
//   i          - name of the loop variable visible to the loop body.
//   num        - exclusive upper bound of the iteration space.
//   index_type - type of `i`; it is re-assigned from __index__ every step.
//
// Both __index__ and __stride__ are computed in int64_t (note the cast applied
// before the multiplication), and the loop condition tests __index__ rather
// than `i`, so termination does not depend on the width of index_type.
// The macro declares __index__ and __stride__ in the enclosing scope, so it
// cannot be expanded twice in the same scope.
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = \
static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
int64_t __stride__ = static_cast<int64_t>(blockDim.x) * gridDim.x; \
for (index_type i = __index__; __index__ < (num); \
__index__ += __stride__, i = __index__)
class
CublasHandleHolder
{
public:
...
...
paddle/fluid/platform/device/gpu/rocm/rocm_helper.h
浏览文件 @
860f6077
...
...
@@ -70,8 +70,9 @@ namespace platform {
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = \
static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \
int64_t __stride__ = static_cast<int64_t>(hipBlockDim_x) * hipGridDim_x; \
for (index_type i = __index__; __index__ < (num); \
__index__ +=
hipBlockDim_x * hipGridDim_x
, i = __index__)
__index__ +=
__stride__
, i = __index__)
class
CublasHandleHolder
{
public:
...
...
paddle/phi/backends/gpu/cuda/cuda_helper.h
浏览文件 @
860f6077
...
...
@@ -62,11 +62,12 @@ namespace gpu {
*
*/
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = \
static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
for (index_type i = __index__; __index__ < (num); \
__index__ += blockDim.x * gridDim.x, i = __index__)
// Loop header macro for device code: each thread starts at its global linear
// thread id (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total
// number of threads in the grid (blockDim.x * gridDim.x) until `num` is
// reached.
//
//   i          - name of the loop variable visible to the loop body.
//   num        - exclusive upper bound of the iteration space.
//   index_type - type of `i`; it is re-assigned from __index__ every step.
//
// Both __index__ and __stride__ are computed in int64_t (note the cast applied
// before the multiplication), and the loop condition tests __index__ rather
// than `i`, so termination does not depend on the width of index_type.
// The macro declares __index__ and __stride__ in the enclosing scope, so it
// cannot be expanded twice in the same scope.
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = \
static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
int64_t __stride__ = static_cast<int64_t>(blockDim.x) * gridDim.x; \
for (index_type i = __index__; __index__ < (num); \
__index__ += __stride__, i = __index__)
}
// namespace gpu
}
// namespace backends
...
...
paddle/phi/backends/gpu/rocm/rocm_helper.h
浏览文件 @
860f6077
...
...
@@ -65,8 +65,9 @@ namespace gpu {
#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \
int64_t __index__ = \
static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \
int64_t __stride__ = static_cast<int64_t>(hipBlockDim_x) * hipGridDim_x; \
for (index_type i = __index__; __index__ < (num); \
__index__ +=
hipBlockDim_x * hipGridDim_x
, i = __index__)
__index__ +=
__stride__
, i = __index__)
}
// namespace gpu
}
// namespace backends
...
...
paddle/phi/kernels/funcs/broadcast_function.h
浏览文件 @
860f6077
...
...
@@ -468,6 +468,397 @@ void LaunchBroadcastKernel(
func
);
}
#ifndef PADDLE_WITH_XPU_KP
// Translates a linear element offset expressed in the "source" layout into the
// corresponding linear offset in the "destination" layout.
//
//   src_idx     - linear offset in the source layout.
//   src_strides - source strides; entry k + 1 is used as the divisor that
//                 extracts the coordinate along dimension k.
//   dst_strides - destination strides; entry k + 1 is the multiplier applied
//                 to the coordinate along dimension k.
//   rank        - number of dimensions to walk (entries 0..rank are read).
//
// Returns the accumulated destination offset.
//
// A dimension whose two neighbouring destination strides are equal contributes
// nothing to the result. NOTE(review): for strides built as suffix products of
// the shape this means the destination extent is 1 (a broadcast dimension) --
// confirm against the code that builds the stride arrays.
HOSTDEVICE static int64_t ConvertSrcIdxToDstIdx(
    int64_t src_idx,
    const phi::Array<int64_t, phi::DDim::kMaxRank + 1> &src_strides,
    const phi::Array<int64_t, phi::DDim::kMaxRank + 1> &dst_strides,
    int rank) {
  int64_t dst_idx = 0;
  for (int k = 0; k < rank; ++k) {
    // Peel off the coordinate along dimension k and keep the remainder for
    // the following (inner) dimensions.
    auto local_idx = src_idx / src_strides[k + 1];
    src_idx -= local_idx * src_strides[k + 1];

    if (dst_strides[k] != dst_strides[k + 1]) {
      dst_idx += local_idx * dst_strides[k + 1];
    }
  }
  return dst_idx;
}
// Fills `*out` with input elements addressed through 64-bit offsets.
//
//   T, VecSize     - element type and capacity of the destination vector.
//   IsBoundary     - compile-time switch selecting the partial (tail) path.
//   in             - base pointer of the input buffer.
//   idx            - linear offset of the first element to produce.
//   need_broadcast - whether offsets must be remapped for this input; only
//                    consulted when IsBoundary is false.
//   src_strides,
//   dst_strides,
//   rank           - forwarded unchanged to ConvertSrcIdxToDstIdx.
//   n              - number of elements to read; only used when IsBoundary is
//                    true.
//   out            - destination vector.
//
// Paths:
//   * IsBoundary: the first `n` slots are filled one at a time, every offset
//     being remapped through ConvertSrcIdxToDstIdx.
//   * otherwise, no broadcast: a single phi::Load<T, VecSize> from in + idx.
//   * otherwise, broadcast: all VecSize slots are gathered with remapped
//     offsets.
template <typename T, int VecSize, bool IsBoundary>
HOSTDEVICE static void ReadVecDataWithInt64Index(
    const T *in,
    int64_t idx,
    bool need_broadcast,
    const phi::Array<int64_t, phi::DDim::kMaxRank + 1> &src_strides,
    const phi::Array<int64_t, phi::DDim::kMaxRank + 1> &dst_strides,
    int rank,
    int n,
    phi::AlignedVector<T, VecSize> *out) {
  if (IsBoundary) {
    for (int lane = 0; lane < n; ++lane) {
      const int64_t mapped =
          ConvertSrcIdxToDstIdx(idx + lane, src_strides, dst_strides, rank);
      (*out)[lane] = in[mapped];
    }
    return;
  }

  if (!need_broadcast) {
    phi::Load<T, VecSize>(in + idx, out);
    return;
  }

#pragma unroll
  for (int lane = 0; lane < VecSize; ++lane) {
    const int64_t mapped =
        ConvertSrcIdxToDstIdx(idx + lane, src_strides, dst_strides, rank);
    (*out)[lane] = in[mapped];
  }
}
// Invokes `functor` on lane `i` of each of the NumIns input vectors and
// converts the result to OutT. The primary template only declares Run();
// definitions are supplied by the specializations for NumIns = 0, 1, 2 and 3
// that follow.
template <typename InT,
          typename OutT,
          typename Functor,
          int VecSize,
          int NumIns>
struct ApplyFunctorWithInt64IndexHelper {
  HOSTDEVICE static OutT Run(const phi::AlignedVector<InT, VecSize> *ins_vec,
                             Functor functor,
                             int i);
};

// NumIns == 0: the functor is called without arguments; `ins_vec` and `i` are
// not read.
template <typename InT, typename OutT, typename Functor, int VecSize>
struct ApplyFunctorWithInt64IndexHelper<InT, OutT, Functor, VecSize, 0> {
  HOSTDEVICE static OutT Run(const phi::AlignedVector<InT, VecSize> *ins_vec,
                             Functor functor,
                             int i) {
    return static_cast<OutT>(functor());
  }
};

// NumIns == 1: unary functor applied to lane `i` of the only input vector.
template <typename InT, typename OutT, typename Functor, int VecSize>
struct ApplyFunctorWithInt64IndexHelper<InT, OutT, Functor, VecSize, 1> {
  HOSTDEVICE static OutT Run(const phi::AlignedVector<InT, VecSize> *ins_vec,
                             Functor functor,
                             int i) {
    const phi::AlignedVector<InT, VecSize> &first = ins_vec[0];
    return static_cast<OutT>(functor(first[i]));
  }
};

// NumIns == 2: binary functor applied to lane `i` of the two input vectors,
// in input order.
template <typename InT, typename OutT, typename Functor, int VecSize>
struct ApplyFunctorWithInt64IndexHelper<InT, OutT, Functor, VecSize, 2> {
  HOSTDEVICE static OutT Run(const phi::AlignedVector<InT, VecSize> *ins_vec,
                             Functor functor,
                             int i) {
    const phi::AlignedVector<InT, VecSize> &first = ins_vec[0];
    const phi::AlignedVector<InT, VecSize> &second = ins_vec[1];
    return static_cast<OutT>(functor(first[i], second[i]));
  }
};

// NumIns == 3: ternary functor applied to lane `i` of the three input vectors,
// in input order.
template <typename InT, typename OutT, typename Functor, int VecSize>
struct ApplyFunctorWithInt64IndexHelper<InT, OutT, Functor, VecSize, 3> {
  HOSTDEVICE static OutT Run(const phi::AlignedVector<InT, VecSize> *ins_vec,
                             Functor functor,
                             int i) {
    const phi::AlignedVector<InT, VecSize> &first = ins_vec[0];
    const phi::AlignedVector<InT, VecSize> &second = ins_vec[1];
    const phi::AlignedVector<InT, VecSize> &third = ins_vec[2];
    return static_cast<OutT>(functor(first[i], second[i], third[i]));
  }
};
// Compile-time clamp: kValue is N when N is at least 1, otherwise 1.
// Useful for sizing fixed arrays that must never have zero elements.
template <int N>
struct MaxWithOne {
  static constexpr int kValue = (N < 1) ? 1 : N;
};
// Element-wise kernel that applies `functor` to NumIns inputs and writes one
// output, computing every element offset in int64_t.
//
//   ins             - input base pointers (the first NumIns entries are read).
//   out             - output base pointer.
//   ins_strides     - per-input stride arrays, passed to the reader as the
//                     destination strides of the offset remapping.
//   out_strides     - output strides; entry 0 is taken as the total number of
//                     output elements.
//   need_broadcasts - per-input flag forwarded to ReadVecDataWithInt64Index.
//   rank            - number of dimensions described by the stride arrays.
//   functor         - element-wise operation.
//
// Each thread handles VecSize consecutive output elements per step and then
// advances by blockDim.x * gridDim.x * VecSize. A thread whose final position
// leaves fewer than VecSize elements processes that tail element by element.
template <typename InT,
          typename OutT,
          typename Functor,
          int VecSize,
          int NumIns>
__global__ void BroadcastKernelWithInt64Index(
    phi::Array<const InT *, MaxWithOne<NumIns>::kValue> ins,
    OutT *out,
    phi::Array<phi::Array<int64_t, phi::DDim::kMaxRank + 1>,
               MaxWithOne<NumIns>::kValue> ins_strides,
    phi::Array<int64_t, phi::DDim::kMaxRank + 1> out_strides,
    phi::Array<bool, MaxWithOne<NumIns>::kValue> need_broadcasts,
    int rank,
    Functor functor) {
  using Apply =
      ApplyFunctorWithInt64IndexHelper<InT, OutT, Functor, VecSize, NumIns>;

  const int64_t total = out_strides[0];
  const int64_t thread_id =
      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const int64_t step = static_cast<int64_t>(blockDim.x) * gridDim.x * VecSize;
  // Largest starting offset from which a full vector still fits.
  const int64_t last_full_start = total - VecSize;

  phi::Array<phi::AlignedVector<InT, VecSize>, MaxWithOne<NumIns>::kValue>
      ins_vec;
  phi::AlignedVector<OutT, VecSize> out_vec;

  int64_t offset = thread_id * VecSize;

  // Full vectors.
  while (offset <= last_full_start) {
#pragma unroll
    for (int k = 0; k < NumIns; ++k) {
      ReadVecDataWithInt64Index<InT, VecSize, false>(ins[k],
                                                     offset,
                                                     need_broadcasts[k],
                                                     out_strides,
                                                     ins_strides[k],
                                                     rank,
                                                     VecSize,
                                                     &ins_vec[k]);
    }

#pragma unroll
    for (int lane = 0; lane < VecSize; ++lane) {
      out_vec[lane] = Apply::Run(ins_vec.Get(), functor, lane);
    }

    phi::Store<OutT, VecSize>(out_vec, out + offset);
    offset += step;
  }

  // Tail: fewer than VecSize elements are left for this thread.
  if (offset < total) {
    // The difference is always smaller than VecSize, so `int` is wide enough.
    const int remain = static_cast<int>(total - offset);

#pragma unroll
    for (int k = 0; k < NumIns; ++k) {
      ReadVecDataWithInt64Index<InT, VecSize, true>(ins[k],
                                                    offset,
                                                    need_broadcasts[k],
                                                    out_strides,
                                                    ins_strides[k],
                                                    rank,
                                                    remain,
                                                    &ins_vec[k]);
    }

    for (int lane = 0; lane < remain; ++lane) {
      out[offset + lane] = Apply::Run(ins_vec.Get(), functor, lane);
    }
  }
}
// Generic fallback of the int64-index launcher. Run() does no work and always
// raises PermissionDenied; a partial specialization for NumOuts == 1 is
// expected to provide the real implementation.
template <typename InT,
          typename OutT,
          typename Functor,
          int Arity,
          int NumOuts,
          int VecSize>
struct LaunchBroadcastKernelWithInt64IndexHelper {
  static void Run(const KPDevice &ctx,
                  const std::vector<const DenseTensor *> &ins,
                  std::vector<DenseTensor *> *outs,
                  int axis,
                  Functor functor) {
    PADDLE_THROW(phi::errors::PermissionDenied(
        "Unreachable code branch. This may be a bug."));
  }
};
// Launcher for BroadcastKernelWithInt64Index, specialized for a single output
// (NumOuts == 1).
//
// Run() gathers the input pointers, allocates the output, derives the
// broadcast rank and the per-tensor stride arrays on the host, and launches
// the kernel on ctx.stream().
template <typename InT, typename OutT, typename Functor, int Arity, int VecSize>
struct LaunchBroadcastKernelWithInt64IndexHelper<InT,
                                                 OutT,
                                                 Functor,
                                                 Arity,
                                                 /*NumOuts=*/1,
                                                 VecSize> {
  //   ctx     - device context used for allocation, launch configuration and
  //             the stream.
  //   ins     - input tensors; the first Arity entries are read.
  //   outs    - output tensors; only (*outs)[0] is used.
  //   axis    - alignment axis for inputs of different rank; must be >= 0
  //             whenever CalculateBroadcastDims is reached (Arity >= 2).
  //   functor - element-wise operation forwarded to the kernel.
  static void Run(const KPDevice &ctx,
                  const std::vector<const DenseTensor *> &ins,
                  std::vector<DenseTensor *> *outs,
                  int axis,
                  Functor functor) {
    phi::Array<const InT *, MaxWithOne<Arity>::kValue> ins_ptrs;
    for (int i = 0; i < Arity; ++i) {
      ins_ptrs[i] = ins[i]->data<InT>();
    }
    auto *out_tensor = (*outs)[0];
    auto *out_ptr = ctx.Alloc<OutT>(out_tensor);

    phi::Array<phi::Array<int64_t, phi::DDim::kMaxRank>,
               MaxWithOne<Arity>::kValue>
        ins_expand_dims;
    phi::Array<int64_t, phi::DDim::kMaxRank> broadcast_out_dims;

    // Start from the output rank. With Arity == 0 neither branch below runs,
    // and `rank` would otherwise be read while indeterminate by the stride
    // computation and the kernel launch. For Arity >= 1 the value is
    // overwritten exactly as before.
    int rank = out_tensor->dims().size();
    if (Arity == 1) {
      rank = ins[0]->dims().size();
      for (int i = 0; i < rank; ++i) {
        broadcast_out_dims[i] = ins[0]->dims()[i];
      }
      ins_expand_dims[0] = broadcast_out_dims;
    } else if (Arity >= 2) {
      CalculateBroadcastDims(ins[0]->dims().Get(),
                             ins[1]->dims().Get(),
                             ins[0]->dims().size(),
                             ins[1]->dims().size(),
                             axis,
                             ins_expand_dims[0].GetMutable(),
                             ins_expand_dims[1].GetMutable(),
                             broadcast_out_dims.GetMutable(),
                             &rank);
      // Fold every further input into the running broadcast shape.
      for (int i = 2; i < Arity; ++i) {
        auto tmp_dims = broadcast_out_dims;
        phi::Array<int64_t, phi::DDim::kMaxRank> tmp_expand_dims;
        int tmp_rank;
        PADDLE_ENFORCE_GE(rank,
                          ins[i]->dims().size(),
                          phi::errors::InvalidArgument(
                              "Unsupported reverse broadcast when the input "
                              "tensor number is larger than 2."));
        CalculateBroadcastDims(tmp_dims.Get(),
                               ins[i]->dims().Get(),
                               rank,
                               ins[i]->dims().size(),
                               axis,
                               tmp_expand_dims.GetMutable(),
                               ins_expand_dims[i].GetMutable(),
                               broadcast_out_dims.GetMutable(),
                               &tmp_rank);
        PADDLE_ENFORCE_EQ(rank,
                          tmp_rank,
                          phi::errors::InvalidArgument(
                              "Wrong broadcast algorithm. This may be a bug."));
      }
    }

    phi::Array<phi::Array<int64_t, phi::DDim::kMaxRank + 1>,
               MaxWithOne<Arity>::kValue>
        ins_strides;
    phi::Array<bool, MaxWithOne<Arity>::kValue> need_broadcasts;
    phi::Array<int64_t, phi::DDim::kMaxRank + 1> out_strides;
    const auto &out_dims = out_tensor->dims();
    // Prefer the output tensor's own dims when they cover `rank` dimensions;
    // otherwise fall back to the computed broadcast shape.
    if (rank <= out_dims.size()) {
      out_strides = ShapeToStride(out_dims.Get(), rank);
    } else {
      out_strides = ShapeToStride(broadcast_out_dims.Get(), rank);
    }

    for (int i = 0; i < Arity; ++i) {
      ins_strides[i] = ShapeToStride(ins_expand_dims[i].Get(), rank);
      // An input whose strides match the output's needs no offset remapping.
      need_broadcasts[i] =
          !IsSameShape(out_strides.Get(), ins_strides[i].Get(), rank + 1);
    }

    // strides[0] is the product of all extents, i.e. the element count.
    int64_t numel = out_strides[0];
    auto gpu_config =
        phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize);

    BroadcastKernelWithInt64Index<InT, OutT, Functor, VecSize, Arity>
        <<<gpu_config.block_per_grid,
           gpu_config.thread_per_block,
           0,
           ctx.stream()>>>(ins_ptrs,
                           out_ptr,
                           ins_strides,
                           out_strides,
                           need_broadcasts,
                           rank,
                           functor);
  }

 private:
  // Computes the common rank (`*length`), the broadcast output shape and the
  // two input shapes expanded to that rank.
  //
  //   x_dims, nx / y_dims, ny - shapes and ranks of the two operands.
  //   axis                    - position in the higher-rank operand at which
  //                             the lower-rank operand is aligned; must be
  //                             >= 0.
  //   x_out_dims, y_out_dims  - receive the expanded shapes; positions of the
  //                             lower-rank operand outside the aligned window
  //                             are set to 1.
  //   broadcast_out_dims      - receives the element-wise maximum extents.
  //   length                  - receives max(nx, ny).
  //
  // Raises InvalidArgument for a negative axis, for mismatching extents where
  // neither is 1, and for checked trailing extents of Y that are not 1.
  static void CalculateBroadcastDims(const int64_t *x_dims,
                                     const int64_t *y_dims,
                                     int nx,
                                     int ny,
                                     int axis,
                                     int64_t *x_out_dims,
                                     int64_t *y_out_dims,
                                     int64_t *broadcast_out_dims,
                                     int *length) {
    PADDLE_ENFORCE_GE(
        axis,
        0,
        phi::errors::InvalidArgument("Invalid axis value: %d", axis));
    if (nx == ny) {
      *length = nx;
      for (int i = 0; i < nx; ++i) {
        if (x_dims[i] != y_dims[i]) {
          PADDLE_ENFORCE_EQ(
              x_dims[i] == 1 || y_dims[i] == 1,
              true,
              phi::errors::InvalidArgument(
                  "Cannot broadcast input shape where "
                  "x_dims[%d] = %d, y_dims[%d] = %d.",
                  i,
                  x_dims[i],
                  i,
                  y_dims[i]));
        }
        broadcast_out_dims[i] = std::max(x_dims[i], y_dims[i]);
        x_out_dims[i] = x_dims[i];
        y_out_dims[i] = y_dims[i];
      }
    } else if (nx > ny) {
      *length = nx;
      for (int i = nx - axis; i < ny; ++i) {
        PADDLE_ENFORCE_EQ(
            y_dims[i],
            1,
            phi::errors::InvalidArgument(
                "The trailing Y.shape[%d] should be 1 but got %d.",
                i,
                y_dims[i]));
      }

      for (int i = 0; i < nx; ++i) {
        if (i >= axis && i - axis < ny) {
          // Inside the window where Y is aligned against X.
          if (x_dims[i] != y_dims[i - axis]) {
            PADDLE_ENFORCE_EQ(
                x_dims[i] == 1 || y_dims[i - axis] == 1,
                true,
                phi::errors::InvalidArgument(
                    "Cannot broadcast input shape where "
                    "x_dims[%d] = %d, y_dims[%d] = %d.",
                    i,
                    x_dims[i],
                    i - axis,
                    y_dims[i - axis]));
          }
          broadcast_out_dims[i] = std::max(x_dims[i], y_dims[i - axis]);
          x_out_dims[i] = x_dims[i];
          y_out_dims[i] = y_dims[i - axis];
        } else {
          // Outside the window Y behaves as extent 1.
          broadcast_out_dims[i] = x_dims[i];
          x_out_dims[i] = x_dims[i];
          y_out_dims[i] = 1;
        }
      }
    } else {
      // nx < ny: swap the operands so the higher-rank one comes first.
      CalculateBroadcastDims(y_dims,
                             x_dims,
                             ny,
                             nx,
                             axis,
                             y_out_dims,
                             x_out_dims,
                             broadcast_out_dims,
                             length);
    }
  }

  // Returns true when the first `rank` entries of `x` and `y` are equal.
  static bool IsSameShape(const int64_t *x, const int64_t *y, int rank) {
    for (int i = 0; i < rank; ++i) {
      if (x[i] != y[i]) return false;
    }
    return true;
  }

  // Builds suffix-product strides for a shape of `rank` dimensions:
  // strides[rank] == 1 and strides[i] == strides[i + 1] * arr[i], so
  // strides[0] is the total element count. Entries past `rank` are left
  // unset.
  static phi::Array<int64_t, phi::DDim::kMaxRank + 1> ShapeToStride(
      const int64_t *arr, int rank) {
    phi::Array<int64_t, phi::DDim::kMaxRank + 1> strides;
    strides[rank] = 1;
    for (int i = rank - 1; i >= 0; --i) {
      strides[i] = strides[i + 1] * arr[i];
    }
    return strides;
  }
};
#endif
template
<
ElementwiseType
ET
,
typename
InT
,
typename
OutT
,
...
...
@@ -509,6 +900,63 @@ void BroadcastKernelForDifferentVecSize(
outs
->
size
(),
NumOuts
));
#ifndef PADDLE_WITH_XPU_KP
constexpr
bool
kEnabledInt64IndexKernel
=
(
NumOuts
==
1
&&
kArity
<=
3
);
bool
use_int64_index_kernel
=
kEnabledInt64IndexKernel
&&
(
*
outs
)[
0
]
->
numel
()
>=
std
::
numeric_limits
<
int32_t
>::
max
();
if
(
use_int64_index_kernel
)
{
int
vec_size
=
GetVecsize
<
InT
,
OutT
>
(
ins
,
outs
);
switch
(
vec_size
)
{
case
VecSizeL
:
{
LaunchBroadcastKernelWithInt64IndexHelper
<
InT
,
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeL
>::
Run
(
ctx
,
ins
,
outs
,
axis
,
func
);
break
;
}
case
VecSizeM
:
{
LaunchBroadcastKernelWithInt64IndexHelper
<
InT
,
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeM
>::
Run
(
ctx
,
ins
,
outs
,
axis
,
func
);
break
;
}
case
VecSizeS
:
{
LaunchBroadcastKernelWithInt64IndexHelper
<
InT
,
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeS
>::
Run
(
ctx
,
ins
,
outs
,
axis
,
func
);
break
;
}
default:
{
PADDLE_THROW
(
phi
::
errors
::
Unimplemented
(
"Unsupported vectorized size: %d!"
,
vec_size
));
break
;
}
}
return
;
}
#endif
// mergedim and get vec_size
const
auto
merge_dims
=
DimensionsTransform
(
ins
,
(
*
outs
)[
0
]
->
dims
(),
axis
);
phi
::
Array
<
kps
::
details
::
BroadcastConfig
,
kArity
>
configs
;
...
...
paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
浏览文件 @
860f6077
...
...
@@ -92,7 +92,7 @@ struct BinaryOperation {
std
::
vector
<
DenseTensor
*>
outs
{
output
};
paddle
::
operators
::
LaunchElementwiseCudaKernel
<
ElementwiseType
::
kBinary
,
T
,
T
>
(
dev_ctx
,
ins
,
&
outs
,
-
1
,
BinaryFunctor
<
T
>
());
dev_ctx
,
ins
,
&
outs
,
0
,
BinaryFunctor
<
T
>
());
}
};
...
...
tools/dockerfile/Dockerfile.release16
浏览文件 @
860f6077
...
...
@@ -101,8 +101,13 @@ RUN curl -s -q https://glide.sh/get | sh
# Downgrade TensorRT
COPY tools/dockerfile/build_scripts /build_scripts
RUN bash /build_scripts/install_nccl2.sh
RUN rm -rf /build_scripts
# Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
# # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
# # So install a newer version here.
RUN bash /build_scripts/install_patchelf.sh
RUN rm -rf /build_scripts
# git credential to skip password typing
RUN git config --global credential.helper store
...
...
@@ -143,13 +148,6 @@ RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/
RUN apt-get install libprotobuf-dev -y
# Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
# So install a newer version here.
RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \
dpkg -i patchelf_0.10-2_amd64.deb && \
rm -rf patchelf_0.10-2_amd64.deb
# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
CMD source ~/.bashrc
...
...
tools/dockerfile/Dockerfile.release18
浏览文件 @
860f6077
...
...
@@ -28,6 +28,10 @@ RUN apt-get update && \
# Downgrade gcc&&g++
WORKDIR /usr/bin
COPY tools/dockerfile/build_scripts /build_scripts
# Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
# So install a newer version here.
RUN bash /build_scripts/install_patchelf.sh
RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts
RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++
RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc
...
...
@@ -99,14 +103,6 @@ RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
COPY ./python/requirements.txt /root/
RUN pip3.7 --no-cache-dir install -r /root/requirements.txt
# Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
# So install a newer version here.
RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \
dpkg -i patchelf_0.10-2_amd64.deb && \
rm -rf patchelf_0.10-2_amd64.deb
# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
#RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
#CMD source ~/.bashrc
...
...
tools/dockerfile/Dockerfile.ubuntu
浏览文件 @
860f6077
...
...
@@ -143,9 +143,14 @@ RUN curl -s -q https://glide.sh/get | sh
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
# Downgrade TensorRT
# Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
# So install a newer version here.
COPY tools/dockerfile/build_scripts /build_scripts
RUN bash /build_scripts/install_trt.sh && \
bash /build_scripts/install_nccl2.sh
bash /build_scripts/install_nccl2.sh && \
bash /build_scripts/install_patchelf.sh
RUN rm -rf /build_scripts
# git credential to skip password typing
...
...
@@ -236,13 +241,6 @@ RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/
RUN apt-get install libprotobuf-dev -y
# Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
# So install a newer version here.
RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \
dpkg -i patchelf_0.10-2_amd64.deb && \
rm -rf patchelf_0.10-2_amd64.deb
# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
CMD source ~/.bashrc
...
...
tools/dockerfile/Dockerfile.ubuntu18
浏览文件 @
860f6077
...
...
@@ -35,6 +35,10 @@ RUN apt-get update --allow-unauthenticated && \
WORKDIR /usr/bin
COPY tools/dockerfile/build_scripts /build_scripts
RUN bash /build_scripts/install_trt.sh
# Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
# # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
# # So install a newer version here.
RUN bash /build_scripts/install_patchelf.sh
RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts
RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++
RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc
...
...
@@ -151,14 +155,6 @@ RUN pip3.6 --no-cache-dir install -r /root/requirements.txt && \
pip3.8 --no-cache-dir install -r /root/requirements.txt && \
pip3.9 --no-cache-dir install -r /root/requirements.txt
# Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
# So install a newer version here.
RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \
dpkg -i patchelf_0.10-2_amd64.deb && \
rm -rf patchelf_0.10-2_amd64.deb
# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
#RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
#CMD source ~/.bashrc
...
...
tools/dockerfile/build_scripts/build.sh
浏览文件 @
860f6077
...
...
@@ -106,7 +106,7 @@ export SSL_CERT_FILE=/opt/_internal/certs.pem
# tar -xzf patchelf-0.9njs2.tar.gz
# (cd patchelf-0.9njs2 && ./configure && make && make install)
# rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2
yum
install
-y
patchelf
sh
"
$MY_DIR
/install_patchelf.sh"
# Install latest pypi release of auditwheel
#LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel
...
...
tools/dockerfile/build_scripts/install_patchelf.sh
0 → 100644
浏览文件 @
860f6077
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e

# Scratch directory for the patchelf checkout; removed before and after use.
TMP_DIR=patchelf_tmp

rm -rf "$TMP_DIR"
git clone -b 0.15.0 https://github.com/NixOS/patchelf "$TMP_DIR"

# Build and install from inside a subshell so the caller's working directory
# is left untouched. `set -e` still aborts the script on the first failure.
(
  cd "$TMP_DIR"
  ./bootstrap.sh
  ./configure
  make
  make install
)

rm -rf "$TMP_DIR"
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录