Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
d5afc1ba
P
Paddle
项目概览
PaddlePaddle
/
Paddle
接近 2 年 前同步成功
通知
2323
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
d5afc1ba
编写于
6月 07, 2022
作者:
S
shixingbo
提交者:
GitHub
6月 07, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Optimized the performance of activation op in XPU2 (#43187)
上级
9551e466
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
73 addition
and
36 deletion
+73
-36
paddle/fluid/operators/optimizers/cast_with_ptr.h
paddle/fluid/operators/optimizers/cast_with_ptr.h
+1
-1
paddle/phi/kernels/funcs/elementwise_base.h
paddle/phi/kernels/funcs/elementwise_base.h
+53
-24
paddle/phi/kernels/primitive/datamover_primitives.h
paddle/phi/kernels/primitive/datamover_primitives.h
+3
-2
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
+16
-9
未找到文件。
paddle/fluid/operators/optimizers/cast_with_ptr.h
浏览文件 @
d5afc1ba
...
@@ -44,7 +44,7 @@ static void VecCastKernel(const platform::CUDADeviceContext &ctx, const InT *x,
...
@@ -44,7 +44,7 @@ static void VecCastKernel(const platform::CUDADeviceContext &ctx, const InT *x,
phi
::
Array
<
_ptr_
OutT
*
,
1
>
out_arr
;
phi
::
Array
<
_ptr_
OutT
*
,
1
>
out_arr
;
out_arr
[
0
]
=
y
;
out_arr
[
0
]
=
y
;
phi
::
funcs
::
VectorizedElementwiseKernel
<
OutT
,
FunctorT
,
1
,
1
,
VecSize
>
phi
::
funcs
::
VectorizedElementwiseKernel
<
OutT
,
FunctorT
,
1
,
1
,
VecSize
>
<<<
block
,
thread
,
0
,
stream
>>>
(
in_arr
,
out_arr
,
n
,
main_offset
,
<<<
block
,
thread
,
0
,
stream
>>>
(
in_arr
,
out_arr
,
n
,
main_offset
,
VecSize
,
FunctorT
());
FunctorT
());
}
}
...
...
paddle/phi/kernels/funcs/elementwise_base.h
浏览文件 @
d5afc1ba
...
@@ -513,19 +513,23 @@ struct Loader {
...
@@ -513,19 +513,23 @@ struct Loader {
ArgsT
*
args
,
ArgsT
*
args
,
int
num
,
int
num
,
int
data_offset
,
int
data_offset
,
int
read_lens
,
bool
is_boundary
)
{
bool
is_boundary
)
{
using
Type
=
std
::
tuple_element_t
<
Index
,
ArgsT
>
;
using
Type
=
std
::
tuple_element_t
<
Index
,
ArgsT
>
;
kps
::
Init
<
Type
,
ArgsT
,
Index
,
VecSize
>
(
args
,
static_cast
<
Type
>
(
1.0
f
));
kps
::
Init
<
Type
,
ArgsT
,
Index
,
VecSize
>
(
args
,
static_cast
<
Type
>
(
1.0
f
),
read_lens
);
if
(
is_boundary
)
{
if
(
is_boundary
)
{
kps
::
ReadData
<
Type
,
VecSize
,
1
,
1
,
ArgsT
,
Index
,
true
>
(
kps
::
ReadData
<
Type
,
VecSize
,
1
,
1
,
ArgsT
,
Index
,
true
>
(
args
,
args
,
reinterpret_cast
<
const
_ptr_
Type
*>
(
in
[
Index
])
+
data_offset
,
reinterpret_cast
<
const
_ptr_
Type
*>
(
in
[
Index
])
+
data_offset
,
num
);
num
,
read_lens
);
}
else
{
}
else
{
kps
::
ReadData
<
Type
,
VecSize
,
1
,
1
,
ArgsT
,
Index
,
false
>
(
kps
::
ReadData
<
Type
,
VecSize
,
1
,
1
,
ArgsT
,
Index
,
false
>
(
args
,
args
,
reinterpret_cast
<
const
_ptr_
Type
*>
(
in
[
Index
])
+
data_offset
,
reinterpret_cast
<
const
_ptr_
Type
*>
(
in
[
Index
])
+
data_offset
,
num
);
num
,
read_lens
);
}
}
}
}
};
};
...
@@ -660,11 +664,20 @@ template <typename OutT,
...
@@ -660,11 +664,20 @@ template <typename OutT,
typename
ArgsT
,
typename
ArgsT
,
int
Arity
>
int
Arity
>
struct
SameDimsElementwisePrimitiveCaller
{
struct
SameDimsElementwisePrimitiveCaller
{
__device__
inline
void
operator
()(
Functor
func
,
ArgsT
*
args
,
OutT
*
result
)
{
__device__
inline
void
operator
()(
Functor
func
,
ArgsT
*
args
,
OutT
*
result
,
int
read_lens
)
{
#ifdef PADDLE_WITH_XPU_KP
for
(
int
idx
=
0
;
idx
<
read_lens
;
++
idx
)
{
result
[
idx
]
=
static_cast
<
OutT
>
(
Apply
(
func
,
args
[
idx
]));
}
#else
#pragma unroll
#pragma unroll
for
(
int
idx
=
0
;
idx
<
VecSize
;
++
idx
)
{
for
(
int
idx
=
0
;
idx
<
VecSize
;
++
idx
)
{
result
[
idx
]
=
static_cast
<
OutT
>
(
Apply
(
func
,
args
[
idx
]));
result
[
idx
]
=
static_cast
<
OutT
>
(
Apply
(
func
,
args
[
idx
]));
}
}
#endif
}
}
};
};
...
@@ -750,6 +763,7 @@ __device__ void VectorizedElementwiseKernelImpl(
...
@@ -750,6 +763,7 @@ __device__ void VectorizedElementwiseKernelImpl(
phi
::
Array
<
_ptr_
OutT
*
,
NumOuts
>
outs
,
phi
::
Array
<
_ptr_
OutT
*
,
NumOuts
>
outs
,
int
num
,
int
num
,
int
data_offset
,
int
data_offset
,
int
read_lens
,
Functor
func
)
{
Functor
func
)
{
using
Traits
=
paddle
::
platform
::
FunctionTraits
<
Functor
>
;
using
Traits
=
paddle
::
platform
::
FunctionTraits
<
Functor
>
;
using
ArgsT
=
typename
Traits
::
ArgsTuple
;
using
ArgsT
=
typename
Traits
::
ArgsTuple
;
...
@@ -757,16 +771,16 @@ __device__ void VectorizedElementwiseKernelImpl(
...
@@ -757,16 +771,16 @@ __device__ void VectorizedElementwiseKernelImpl(
ConditionalT
<
OutT
,
NumOuts
>
result
[
VecSize
];
ConditionalT
<
OutT
,
NumOuts
>
result
[
VecSize
];
Unroller
<
Loader
,
VecSize
,
Arity
>::
step
(
Unroller
<
Loader
,
VecSize
,
Arity
>::
step
(
in
,
args
,
num
,
data_offset
,
IsBoundary
);
in
,
args
,
num
,
data_offset
,
read_lens
,
IsBoundary
);
SameDimsElementwisePrimitiveCaller
<
ConditionalT
<
OutT
,
NumOuts
>
,
SameDimsElementwisePrimitiveCaller
<
ConditionalT
<
OutT
,
NumOuts
>
,
VecSize
,
VecSize
,
Functor
,
Functor
,
ArgsT
,
ArgsT
,
Arity
>
()(
func
,
args
,
result
);
Arity
>
()(
func
,
args
,
result
,
read_lens
);
ElementwiseWriteDataCaller
<
OutT
,
VecSize
,
IsBoundary
,
NumOuts
>
()(
ElementwiseWriteDataCaller
Bc
<
OutT
,
VecSize
,
IsBoundary
,
NumOuts
>
()(
outs
,
result
,
data_offset
,
num
);
outs
,
result
,
data_offset
,
num
,
read_lens
);
}
}
template
<
typename
OutT
,
typename
Functor
,
int
Arity
,
int
NumOuts
,
int
VecSize
>
template
<
typename
OutT
,
typename
Functor
,
int
Arity
,
int
NumOuts
,
int
VecSize
>
...
@@ -775,9 +789,10 @@ __global__ void VectorizedElementwiseKernel(
...
@@ -775,9 +789,10 @@ __global__ void VectorizedElementwiseKernel(
phi
::
Array
<
_ptr_
OutT
*
,
NumOuts
>
outs
,
phi
::
Array
<
_ptr_
OutT
*
,
NumOuts
>
outs
,
int
size
,
int
size
,
int
main_offset
,
int
main_offset
,
int
read_lens
,
Functor
func
)
{
Functor
func
)
{
int
data_offset
=
BLOCK_ID_X
*
BLOCK_NUM_X
*
VecSize
;
int
data_offset
=
BLOCK_ID_X
*
BLOCK_NUM_X
*
read_lens
;
int
stride
=
BLOCK_NUM_X
*
GRID_NUM_X
*
VecSize
;
int
stride
=
BLOCK_NUM_X
*
GRID_NUM_X
*
read_lens
;
for
(;
data_offset
<
main_offset
;
data_offset
+=
stride
)
{
for
(;
data_offset
<
main_offset
;
data_offset
+=
stride
)
{
VectorizedElementwiseKernelImpl
<
OutT
,
VectorizedElementwiseKernelImpl
<
OutT
,
Functor
,
Functor
,
...
@@ -785,7 +800,7 @@ __global__ void VectorizedElementwiseKernel(
...
@@ -785,7 +800,7 @@ __global__ void VectorizedElementwiseKernel(
NumOuts
,
NumOuts
,
VecSize
,
VecSize
,
false
>
(
false
>
(
ins
,
outs
,
VecSize
*
BLOCK_NUM_X
,
data_offset
,
func
);
ins
,
outs
,
read_lens
*
BLOCK_NUM_X
,
data_offset
,
read_lens
,
func
);
}
}
int
num
=
size
-
data_offset
;
int
num
=
size
-
data_offset
;
...
@@ -795,7 +810,8 @@ __global__ void VectorizedElementwiseKernel(
...
@@ -795,7 +810,8 @@ __global__ void VectorizedElementwiseKernel(
Arity
,
Arity
,
NumOuts
,
NumOuts
,
VecSize
,
VecSize
,
true
>
(
ins
,
outs
,
num
,
data_offset
,
func
);
true
>
(
ins
,
outs
,
num
,
data_offset
,
read_lens
,
func
);
}
}
}
}
...
@@ -803,6 +819,7 @@ template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
...
@@ -803,6 +819,7 @@ template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
void
ElementwiseCudaKernel
(
const
KPDevice
&
ctx
,
void
ElementwiseCudaKernel
(
const
KPDevice
&
ctx
,
const
std
::
vector
<
const
DenseTensor
*>
&
ins
,
const
std
::
vector
<
const
DenseTensor
*>
&
ins
,
std
::
vector
<
DenseTensor
*>
*
outs
,
std
::
vector
<
DenseTensor
*>
*
outs
,
int
read_lens
,
Functor
func
)
{
Functor
func
)
{
auto
numel
=
auto
numel
=
(
*
outs
)[
0
]
->
numel
();
// To avoid running errors when ins.size()== 0
(
*
outs
)[
0
]
->
numel
();
// To avoid running errors when ins.size()== 0
...
@@ -817,10 +834,10 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
...
@@ -817,10 +834,10 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
int
block_size
=
64
;
int
block_size
=
64
;
int
grid_size
=
8
;
int
grid_size
=
8
;
auto
stream
=
ctx
.
x_context
()
->
xpu_stream
;
auto
stream
=
ctx
.
x_context
()
->
xpu_stream
;
int
main_offset
=
(
numel
/
(
VecSize
*
block_size
))
*
VecSize
*
block_size
;
int
main_offset
=
(
numel
/
(
read_lens
*
block_size
))
*
read_lens
*
block_size
;
VectorizedElementwiseKernel
<
OutT
,
Functor
,
Arity
,
NumOuts
,
VecSize
>
VectorizedElementwiseKernel
<
OutT
,
Functor
,
Arity
,
NumOuts
,
VecSize
>
<<<
grid_size
,
block_size
,
0
,
stream
>>>
(
<<<
grid_size
,
block_size
,
0
,
stream
>>>
(
ins_data
,
outs_data
,
numel
,
main_offset
,
func
);
ins_data
,
outs_data
,
numel
,
main_offset
,
read_lens
,
func
);
#else
#else
auto
gpu_config
=
auto
gpu_config
=
phi
::
backends
::
gpu
::
GetGpuLaunchConfig1D
(
ctx
,
numel
,
VecSize
);
phi
::
backends
::
gpu
::
GetGpuLaunchConfig1D
(
ctx
,
numel
,
VecSize
);
...
@@ -829,7 +846,7 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
...
@@ -829,7 +846,7 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
auto
stream
=
ctx
.
stream
();
auto
stream
=
ctx
.
stream
();
VectorizedElementwiseKernel
<
OutT
,
Functor
,
Arity
,
NumOuts
,
VecSize
>
VectorizedElementwiseKernel
<
OutT
,
Functor
,
Arity
,
NumOuts
,
VecSize
>
<<<
gpu_config
.
block_per_grid
,
gpu_config
.
thread_per_block
,
0
,
stream
>>>
(
<<<
gpu_config
.
block_per_grid
,
gpu_config
.
thread_per_block
,
0
,
stream
>>>
(
ins_data
,
outs_data
,
numel
,
main_offset
,
func
);
ins_data
,
outs_data
,
numel
,
main_offset
,
VecSize
,
func
);
#endif
#endif
}
}
...
@@ -868,20 +885,32 @@ void ElementwiseKernel(const KPDevice &ctx,
...
@@ -868,20 +885,32 @@ void ElementwiseKernel(const KPDevice &ctx,
}
}
}
}
#ifdef PADDLE_WITH_XPU_KP
const
int
buf_size
=
256
;
int
numel
=
(
*
outs
)[
0
]
->
numel
();
int
block_size
=
64
;
int
grid_size
=
8
;
int
nthreads
=
block_size
*
grid_size
;
int
read_lens
=
std
::
min
(
buf_size
,
kps
::
details
::
RoundUpDiv
(
numel
,
32
*
nthreads
)
*
32
);
int
vec_size
=
buf_size
;
#else
// calculate the max vec_size for all ins and outs
// calculate the max vec_size for all ins and outs
int
vec_size
=
GetVectorizedSizeForTensors
<
OutT
,
Functor
>
(
ins
,
*
outs
);
int
vec_size
=
GetVectorizedSizeForTensors
<
OutT
,
Functor
>
(
ins
,
*
outs
);
int
read_lens
=
vec_size
;
#endif
switch
(
vec_size
)
{
switch
(
vec_size
)
{
case
4
:
case
VecSizeL
:
ElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
4
>
(
ElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeL
>
(
ctx
,
ins
,
outs
,
func
);
ctx
,
ins
,
outs
,
read_lens
,
func
);
break
;
break
;
case
2
:
case
VecSizeM
:
ElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
2
>
(
ElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeM
>
(
ctx
,
ins
,
outs
,
func
);
ctx
,
ins
,
outs
,
read_lens
,
func
);
break
;
break
;
case
1
:
case
VecSizeS
:
ElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
1
>
(
ElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeS
>
(
ctx
,
ins
,
outs
,
func
);
ctx
,
ins
,
outs
,
read_lens
,
func
);
break
;
break
;
default:
{
default:
{
PADDLE_THROW
(
phi
::
errors
::
Unimplemented
(
PADDLE_THROW
(
phi
::
errors
::
Unimplemented
(
...
...
paddle/phi/kernels/primitive/datamover_primitives.h
浏览文件 @
d5afc1ba
...
@@ -259,7 +259,7 @@ __device__ __forceinline__ void Init(T* dst, T init_data, int read_lens) {
...
@@ -259,7 +259,7 @@ __device__ __forceinline__ void Init(T* dst, T init_data, int read_lens) {
* it supports different data types of inputs.
* it supports different data types of inputs.
*/
*/
template
<
typename
T
,
typename
ArgsT
,
int
Index
,
int
NX
>
template
<
typename
T
,
typename
ArgsT
,
int
Index
,
int
NX
>
__device__
__forceinline__
void
Init
(
ArgsT
*
dst
,
T
init_data
)
{
__device__
__forceinline__
void
Init
(
ArgsT
*
dst
,
T
init_data
,
int
read_lens
)
{
#pragma unroll
#pragma unroll
for
(
int
i
=
0
;
i
<
NX
;
i
++
)
{
for
(
int
i
=
0
;
i
<
NX
;
i
++
)
{
std
::
get
<
Index
>
(
dst
[
i
])
=
init_data
;
std
::
get
<
Index
>
(
dst
[
i
])
=
init_data
;
...
@@ -382,7 +382,8 @@ template <typename T,
...
@@ -382,7 +382,8 @@ template <typename T,
bool
IsBoundary
=
false
>
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadData
(
ArgsT
*
dst
,
__device__
__forceinline__
void
ReadData
(
ArgsT
*
dst
,
const
T
*
__restrict__
src
,
const
T
*
__restrict__
src
,
int
num
)
{
int
num
,
int
read_lens
)
{
if
(
IsBoundary
)
{
// blockDim.x * NX > num
if
(
IsBoundary
)
{
// blockDim.x * NX > num
int
thread_offset
=
threadIdx
.
x
*
NX
;
int
thread_offset
=
threadIdx
.
x
*
NX
;
#pragma unroll
#pragma unroll
...
...
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
浏览文件 @
d5afc1ba
...
@@ -21,6 +21,8 @@ namespace phi {
...
@@ -21,6 +21,8 @@ namespace phi {
namespace
kps
{
namespace
kps
{
namespace
details
{
namespace
details
{
int
RoundUpDiv
(
int
n
,
int
k
)
{
return
(
n
+
k
-
1
)
/
k
;
}
enum
class
OptType
{
// Optimize type of calc after input shape compressed
enum
class
OptType
{
// Optimize type of calc after input shape compressed
CanNotOptimize
=
-
1
,
// can not optimize, broadcast first
CanNotOptimize
=
-
1
,
// can not optimize, broadcast first
N_1
,
// just like {1} op {100} or {100} op {1}
N_1
,
// just like {1} op {100} or {100} op {1}
...
@@ -425,9 +427,10 @@ __device__ __inline__ void Init(T* dst, T init_data, int read_lens) {
...
@@ -425,9 +427,10 @@ __device__ __inline__ void Init(T* dst, T init_data, int read_lens) {
* it supports different data types of inputs.
* it supports different data types of inputs.
*/
*/
template
<
typename
T
,
typename
ArgsT
,
int
Index
,
int
NX
>
template
<
typename
T
,
typename
ArgsT
,
int
Index
,
int
NX
>
__device__
__forceinline__
void
Init
(
ArgsT
*
dst
,
T
init_data
)
{
__device__
__forceinline__
void
Init
(
ArgsT
*
dst
,
T
init_data
,
int
read_lens
)
{
mfence
();
#pragma unroll
#pragma unroll
for
(
int
i
=
0
;
i
<
NX
;
i
++
)
{
for
(
int
i
=
0
;
i
<
read_lens
;
i
++
)
{
std
::
get
<
Index
>
(
dst
[
i
])
=
init_data
;
std
::
get
<
Index
>
(
dst
[
i
])
=
init_data
;
}
}
}
}
...
@@ -523,22 +526,24 @@ template <typename T,
...
@@ -523,22 +526,24 @@ template <typename T,
bool
IsBoundary
>
bool
IsBoundary
>
__device__
__forceinline__
void
ReadData
(
ArgsT
*
dst
,
__device__
__forceinline__
void
ReadData
(
ArgsT
*
dst
,
const
T
_global_ptr_
*
src
,
const
T
_global_ptr_
*
src
,
int
num
)
{
int
num
,
int
thread_offset
=
core_id
()
*
NX
;
int
read_lens
)
{
int
thread_offset
=
core_id
()
*
read_lens
;
__local__
T
in_temp
[
1
];
__local__
T
in_temp
[
1
];
__local__
T
in_vec
[
NX
];
__local__
T
in_vec
[
NX
];
if
(
IsBoundary
)
{
// core_num() *
NX
> num
if
(
IsBoundary
)
{
// core_num() *
read_lens
> num
#pragma unroll
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
;
++
idx
)
{
for
(
int
idx
=
0
;
idx
<
read_lens
;
++
idx
)
{
if
(
idx
+
thread_offset
<
num
)
{
if
(
idx
+
thread_offset
<
num
)
{
GM2LM
(
src
+
thread_offset
+
idx
,
in_temp
,
sizeof
(
T
));
GM2LM
(
src
+
thread_offset
+
idx
,
in_temp
,
sizeof
(
T
));
std
::
get
<
Index
>
(
dst
[
idx
])
=
in_temp
[
0
];
std
::
get
<
Index
>
(
dst
[
idx
])
=
in_temp
[
0
];
mfence
();
}
}
}
}
}
else
{
// core_num() *
NX
< num
}
else
{
// core_num() *
read_lens
< num
GM2LM
(
src
+
thread_offset
,
in_vec
,
NX
*
sizeof
(
T
));
GM2LM
(
src
+
thread_offset
,
in_vec
,
read_lens
*
sizeof
(
T
));
#pragma unroll
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
;
++
idx
)
{
for
(
int
idx
=
0
;
idx
<
read_lens
;
++
idx
)
{
std
::
get
<
Index
>
(
dst
[
idx
])
=
in_vec
[
idx
];
std
::
get
<
Index
>
(
dst
[
idx
])
=
in_vec
[
idx
];
}
}
}
}
...
@@ -727,10 +732,12 @@ __device__ void WriteData(T _global_ptr_* dst,
...
@@ -727,10 +732,12 @@ __device__ void WriteData(T _global_ptr_* dst,
for
(
int
idx
=
0
;
idx
<
read_lens
;
++
idx
)
{
for
(
int
idx
=
0
;
idx
<
read_lens
;
++
idx
)
{
if
(
idx
+
thread_offset
<
num
)
{
if
(
idx
+
thread_offset
<
num
)
{
in_temp
[
0
]
=
src
[
idx
];
in_temp
[
0
]
=
src
[
idx
];
mfence
();
LM2GM
(
in_temp
,
dst
+
idx
+
thread_offset
,
sizeof
(
T
));
LM2GM
(
in_temp
,
dst
+
idx
+
thread_offset
,
sizeof
(
T
));
}
}
}
}
}
else
{
// core_num() * read_lens < num
}
else
{
// core_num() * read_lens < num
mfence
();
LM2GM
(
src
,
dst
+
thread_offset
,
read_lens
*
sizeof
(
T
));
LM2GM
(
src
,
dst
+
thread_offset
,
read_lens
*
sizeof
(
T
));
}
}
}
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录