magicwindyyd / mindspore (forked from MindSpore / mindspore)
Commit 3b09299b
gpu codex warning fix

Author: wilfChen
Authored: Jun 12, 2020
Parent: 4df861cb
Showing 6 changed files with 28 additions and 31 deletions (+28 -31)
mindspore/ccsrc/kernel/gpu/cuda_impl/layer_norm_impl.cu        +16 -16
mindspore/ccsrc/kernel/gpu/data/dataset_iterator_kernel.cc      +2  -1
mindspore/ccsrc/kernel/gpu/math/broadcast_gpu_kernel.h          +4  -4
mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.h     +4  -4
mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.cc             +1  -3
mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.cc            +1  -3
mindspore/ccsrc/kernel/gpu/cuda_impl/layer_norm_impl.cu

```diff
@@ -23,7 +23,7 @@ constexpr int NUM_PER_THREAD_REDUCE = 4;
 constexpr int WARP_SIZE = 32;
 
 template <typename T>
-inline __device__ void MeanAndVarAccumulation(T* mean, T* var, T* num, const T& val) {
+inline __device__ void MeanAndVarAccumulation(T *mean, T *var, T *num, const T &val) {
   // Welford Algorithm:
   // \mu_k = \mu_{k-1} + (x_k - \mu_{k-1})/k
   // \sigma_k^2 = \sigma_{k-1}^2 + (x_k - \mu_{k-1}) * (x_k - \mu_k)
@@ -34,7 +34,7 @@ inline __device__ void MeanAndVarAccumulation(T* mean, T* var, T* num, const T&
 }
 
 template <typename T>
-inline __device__ void MeanAndVarMerge(T* m1, T* v1, T* n1, const T& m2, const T& v2, const T& n2) {
+inline __device__ void MeanAndVarMerge(T *m1, T *v1, T *n1, const T &m2, const T &v2, const T &n2) {
   if (n2 == 0) {
     return;
   }
@@ -46,7 +46,7 @@ inline __device__ void MeanAndVarMerge(T* m1, T* v1, T* n1, const T& m2, const T
 }
 
 template <typename T>
-inline __device__ void ThreadReduce(const int& col_dim, const T* block_addr, T* mean, T* var, T* num) {
+inline __device__ void ThreadReduce(const int &col_dim, const T *block_addr, T *mean, T *var, T *num) {
   int loop_num = (col_dim + NUM_PER_THREAD_REDUCE - 1) / NUM_PER_THREAD_REDUCE;
   for (int i = threadIdx.x; i < loop_num; i += blockDim.x) {
     for (int j = 0; j < NUM_PER_THREAD_REDUCE; j++) {
@@ -60,7 +60,7 @@ inline __device__ void ThreadReduce(const int& col_dim, const T* block_addr, T*
 }
 
 template <typename T>
-inline __device__ void WarpReduce(T* mean, T* var, T* num) {
+inline __device__ void WarpReduce(T *mean, T *var, T *num) {
   for (int delta = (WARP_SIZE >> 1); delta > 0; delta >>= 1) {
     T mean_other = __shfl_down_sync(0xffffffff, mean[0], delta);
     T var_other = __shfl_down_sync(0xffffffff, var[0], delta);
@@ -70,8 +70,8 @@ inline __device__ void WarpReduce(T* mean, T* var, T* num) {
 }
 
 template <typename T>
-inline __device__ void BlockReduce(const int& col_dim, T* mean, T* var, T* num, T* mean_addr, T* var_addr,
-                                   T* share_mem) {
+inline __device__ void BlockReduce(const int &col_dim, T *mean, T *var, T *num, T *mean_addr, T *var_addr,
+                                   T *share_mem) {
   if (threadIdx.x >= col_dim) {
     return;
   }
@@ -96,15 +96,15 @@ inline __device__ void BlockReduce(const int& col_dim, T* mean, T* var, T* num,
   __syncthreads();
 
   if (threadIdx.x == 0) {
-    mean_addr[blockIdx.x] = share_mem[0];  // todo: blockDim.x < row
+    mean_addr[blockIdx.x] = share_mem[0];
     share_mem[1] /= col_dim;
     var_addr[blockIdx.x] = share_mem[1];
   }
 }
 
 template <typename T>
-inline __device__ void LayerNorm(const int& row, const int& col_dim, const int& param_dim, const T* x,
-                                 const T* share_mem, const T* gamma, const T* beta, const T epsilon, T* y) {
+inline __device__ void LayerNorm(const int &row, const int &col_dim, const int &param_dim, const T *x,
+                                 const T *share_mem, const T *gamma, const T *beta, const T epsilon, T *y) {
   for (int col = threadIdx.x; col < col_dim; col += blockDim.x) {
     int pos = row * col_dim + col;
     int i = pos % param_dim;
@@ -113,13 +113,13 @@ inline __device__ void LayerNorm(const int& row, const int& col_dim, const int&
 }
 
 template <typename T>
-__global__ void LayerNormKernel(const int row_dim, const int col_dim, const int param_dim, const T epsilon, const T* x,
-                                const T* gamma, const T* beta, T* y, T* mean_addr, T* var_addr) {
+__global__ void LayerNormKernel(const int row_dim, const int col_dim, const int param_dim, const T epsilon, const T *x,
+                                const T *gamma, const T *beta, T *y, T *mean_addr, T *var_addr) {
   for (auto row = blockIdx.x; row < row_dim; row += gridDim.x) {
     T mean = 0;
     T var = 0;
     T num = 0;
-    const T* block_addr = x + row * col_dim;
+    const T *block_addr = x + row * col_dim;
     extern __shared__ T share_mem[];
     ThreadReduce(col_dim, block_addr, &mean, &var, &num);
@@ -132,8 +132,8 @@ __global__ void LayerNormKernel(const int row_dim, const int col_dim, const int
 }
 
 template <typename T>
-void LayerNorm(const int& row_dim, const int& col_dim, const int& param_dim, const T& epsilon, const T* x,
-               const T* gamma, const T* beta, T* y, T* mean, T* var, cudaStream_t stream) {
+void LayerNorm(const int &row_dim, const int &col_dim, const int &param_dim, const T &epsilon, const T *x,
+               const T *gamma, const T *beta, T *y, T *mean, T *var, cudaStream_t stream) {
   const dim3 block(row_dim);
   const dim3 thread(256);
   // keep the mean/var/num after warp reduce
@@ -143,6 +143,6 @@ void LayerNorm(const int& row_dim, const int& col_dim, const int& param_dim, con
                var);
 }
 
-template void LayerNorm(const int& row_dim, const int& col_dim, const int& param_dim, const float& epsilon,
-                        const float* x, const float* gamma, const float* beta, float* y, float* mean, float* var,
+template void LayerNorm(const int &row_dim, const int &col_dim, const int &param_dim, const float &epsilon,
+                        const float *x, const float *gamma, const float *beta, float *y, float *mean, float *var,
                         cudaStream_t stream);
```
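Aside from dropping the todo comment, the changes in this file appear to be style-only signature rewrites (pointer/reference spacing), so the reduction algorithm itself is unchanged. The comments the file keeps cite the Welford recurrence that MeanAndVarAccumulation implements per element and MeanAndVarMerge combines across threads. Below is a minimal host-side C++ sketch of that accumulate/merge scheme for reference; the names are illustrative, the merge follows the usual Chan et al. formulation, and it is not the kernel's own code.

```cpp
// Host-side sketch of the Welford accumulation cited in the comments above.
// "m2" is the running sum of squared deviations; like the kernel's share_mem[1],
// it only becomes a variance after dividing by the element count at the end.
struct WelfordState {
  double mean = 0.0;
  double m2 = 0.0;
  double count = 0.0;
};

inline void Accumulate(WelfordState *s, double val) {
  s->count += 1.0;
  double delta = val - s->mean;
  s->mean += delta / s->count;       // mu_k = mu_{k-1} + (x_k - mu_{k-1}) / k
  s->m2 += delta * (val - s->mean);  // M2_k = M2_{k-1} + (x_k - mu_{k-1}) * (x_k - mu_k)
}

// Pairwise merge of two partial results (Chan et al.), the role MeanAndVarMerge
// plays when partial statistics from different threads are combined.
inline void Merge(WelfordState *a, const WelfordState &b) {
  if (b.count == 0.0) {
    return;
  }
  double count = a->count + b.count;
  double delta = b.mean - a->mean;
  a->m2 += b.m2 + delta * delta * a->count * b.count / count;
  a->mean += delta * b.count / count;
  a->count = count;
}
```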
mindspore/ccsrc/kernel/gpu/data/dataset_iterator_kernel.cc

```diff
@@ -96,7 +96,8 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v
   }
 
   for (size_t i = 0; i < output_size_list_.size(); i++) {
-    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(outputs[i]->addr, addr, output_size_list_[i], cudaMemcpyDeviceToDevice,
+    void *output_addr = GetDeviceAddress<void>(outputs, i);
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(output_addr, addr, output_size_list_[i], cudaMemcpyDeviceToDevice,
                                                reinterpret_cast<cudaStream_t>(stream)),
                                "Cuda Memcpy Failed");
     addr = reinterpret_cast<unsigned char *>(addr) + output_size_list_[i];
```
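The one functional change here swaps the inline outputs[i]->addr dereference for a GetDeviceAddress<void>(outputs, i) call before the cudaMemcpyAsync. As a rough illustration of what such an accessor usually does; the real MindSpore helper and its Address/AddressPtr types live in the kernel headers and may validate and report errors differently, so treat everything below as an assumption.

```cpp
#include <cstddef>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

// Illustrative stand-ins for the kernel address types; the real definitions
// live in MindSpore's kernel headers and may differ.
struct Address {
  void *addr = nullptr;
  size_t size = 0;
};
using AddressPtr = std::shared_ptr<Address>;

// Sketch of a GetDeviceAddress-style accessor: validate the index and pointers
// once, then hand a typed raw address to the caller (e.g. cudaMemcpyAsync),
// instead of repeating outputs[i]->addr at every call site.
template <typename T>
T *GetDeviceAddressSketch(const std::vector<AddressPtr> &addr_list, size_t index) {
  if (index >= addr_list.size() || addr_list[index] == nullptr || addr_list[index]->addr == nullptr) {
    throw std::runtime_error("invalid device address at index " + std::to_string(index));
  }
  return static_cast<T *>(addr_list[index]->addr);
}
```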
mindspore/ccsrc/kernel/gpu/math/broadcast_gpu_kernel.h

```diff
@@ -68,14 +68,14 @@ class BroadcastOpGpuKernel : public GpuKernel {
       output_shape_[i] = shape3[i];
       output_num_ *= shape3[i];
     }
-    int offset = shape3.size() - shape1.size();
+    int lhs_offset = shape3.size() - shape1.size();
     for (size_t j = 0; j < shape1.size(); j++) {
-      lhs_shape_[j + offset] = shape1[j];
+      lhs_shape_[j + lhs_offset] = shape1[j];
       input1_num_ *= shape1[j];
     }
 
-    offset = shape3.size() - shape2.size();
+    int rhs_offset = shape3.size() - shape2.size();
     for (size_t k = 0; k < shape2.size(); k++) {
-      rhs_shape_[k + offset] = shape2[k];
+      rhs_shape_[k + rhs_offset] = shape2[k];
       input2_num_ *= shape2[k];
     }
```
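The warning being fixed is the reuse of a single offset variable for two different alignments; the replacement gives each input its own lhs_offset/rhs_offset (and x1_offset/x2_offset in broadcast_grad_gpu_kernel.h below). The arithmetic itself right-aligns an input shape against the output shape, NumPy-style, padding the missing leading dimensions with 1. A small stand-alone illustration with hypothetical names follows; the kernel writes into fixed-size lhs_shape_/rhs_shape_ arrays rather than returning a vector.

```cpp
#include <cstddef>
#include <vector>

// Right-align an input shape against the output (broadcast) shape by padding
// leading dimensions with 1 -- the same indexing as
// lhs_shape_[j + lhs_offset] = shape1[j] in the kernel above.
std::vector<size_t> AlignShape(const std::vector<size_t> &input_shape,
                               const std::vector<size_t> &output_shape) {
  std::vector<size_t> aligned(output_shape.size(), 1);
  size_t offset = output_shape.size() - input_shape.size();
  for (size_t j = 0; j < input_shape.size(); j++) {
    aligned[j + offset] = input_shape[j];
  }
  return aligned;
}

// Example: AlignShape({4, 5}, {3, 4, 5}) yields {1, 4, 5}.
```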
mindspore/ccsrc/kernel/gpu/math/broadcast_grad_gpu_kernel.h

```diff
@@ -74,14 +74,14 @@ class BroadcastOpGradGpuKernel : public GpuKernel {
       dy_shape_[i] = shape3[i];
       output_num_ *= shape3[i];
     }
-    int offset = shape3.size() - shape1.size();
+    int x1_offset = shape3.size() - shape1.size();
     for (size_t i = 0; i < shape1.size(); i++) {
-      x1_shape_[i + offset] = shape1[i];
+      x1_shape_[i + x1_offset] = shape1[i];
       input1_num_ *= shape1[i];
     }
 
-    offset = shape3.size() - shape2.size();
+    int x2_offset = shape3.size() - shape2.size();
     for (size_t i = 0; i < shape2.size(); i++) {
-      x2_shape_[i + offset] = shape2[i];
+      x2_shape_[i + x2_offset] = shape2[i];
       input2_num_ *= shape2[i];
     }
```
mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.cc

```diff
@@ -68,14 +68,12 @@ void DropoutGpuFwdKernel::DestroyResource() noexcept {}
 void DropoutGpuFwdKernel::InitSizeLists() {
   size_t input_size = num_count_ * sizeof(float);
-  size_t workspace_size = 0;
   input_size_list_.push_back(input_size);
   output_size_list_.push_back(input_size);  // output size: the same with input size
   output_size_list_.push_back(input_size);  // mask size: the same with input size
-  workspace_size_list_.push_back(workspace_size);
 }
 
-bool DropoutGpuFwdKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+bool DropoutGpuFwdKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
                                  const std::vector<AddressPtr> &outputs, void *stream_ptr) {
   if (is_null_input_) {
     return true;
```
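This file and dropout_grad_kernel.cc below get the same two fixes: the always-zero workspace entry is dropped from InitSizeLists(), and the unused workspace parameter of Launch() loses its name, which keeps the override signature intact while silencing the unused-parameter warning. A minimal sketch of that unnamed-parameter idiom, using placeholder types rather than the real GpuKernel/AddressPtr interface:

```cpp
#include <vector>

// Placeholder base class; the real kernels override a Launch() declared on
// GpuKernel with std::vector<AddressPtr> parameters.
struct KernelBase {
  virtual ~KernelBase() = default;
  virtual bool Launch(const std::vector<int> &inputs, const std::vector<int> &workspace,
                      const std::vector<int> &outputs) = 0;
};

struct NoWorkspaceKernel : KernelBase {
  // The workspace argument is accepted (the signature must match the base
  // class) but intentionally ignored, so it is left unnamed.
  bool Launch(const std::vector<int> &inputs, const std::vector<int> & /* workspace */,
              const std::vector<int> &outputs) override {
    return !inputs.empty() && !outputs.empty();
  }
};
```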
mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.cc

```diff
@@ -66,15 +66,13 @@ void DropoutGradGpuFwdKernel::InitSizeLists() {
   size_t dy_size = num_count_ * sizeof(float);
   size_t mask_size = dy_size;
   size_t dx_size = dy_size;
-  size_t workspace_size = 0;
   input_size_list_.push_back(dy_size);
   input_size_list_.push_back(mask_size);
   output_size_list_.push_back(dx_size);
-  workspace_size_list_.push_back(workspace_size);
 }
 
-bool DropoutGradGpuFwdKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+bool DropoutGradGpuFwdKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
                                      const std::vector<AddressPtr> &outputs, void *stream_ptr) {
   if (is_null_input_) {
     return true;
```