magicwindyyd / mindspore (forked from MindSpore / mindspore)

Commit 779c668a
Authored Aug 06, 2020 by mindspore-ci-bot; committed by Gitee on Aug 06, 2020

!3991 support updatethorgradient for resnet50 thor GPU

Merge pull request !3991 from mamba_ni/master

Parents: 235378d5, c1dbc5a0

Showing 6 changed files with 452 additions and 4 deletions (+452 -4)
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/convert_gradient_impl.cu  +100 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/convert_gradient_impl.cuh  +34 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/math/update_thor_gradient.cc  +28 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/math/update_thor_gradient.h  +241 -0
mindspore/ops/operations/__init__.py  +4 -4
mindspore/ops/operations/_thor_ops.py  +45 -0
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/convert_gradient_impl.cu (new file, mode 100644)
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "convert_gradient_impl.cuh"
template <typename T>
__global__ void ConvertGradientKernel(const size_t size, const size_t height_h, const size_t height_w,
                                      const size_t batchwidth, const size_t width, T *input_addr, T *output_addr) {
  for (size_t pointIdx = blockIdx.x * blockDim.x + threadIdx.x; pointIdx < (size);
       pointIdx += blockDim.x * gridDim.x) {
    size_t dst_batchIdx = pointIdx / (height_h * height_w);
    size_t dst_batchIdxX = dst_batchIdx / batchwidth;
    size_t dst_batchIdxY = dst_batchIdx % batchwidth;
    size_t dst_x = (pointIdx - dst_batchIdx * height_h * height_w) / height_w;
    size_t dst_y = (pointIdx - dst_batchIdx * height_h * height_w) % height_w;
    size_t src_coordinate = dst_batchIdxX * height_h * width + dst_x * width + dst_batchIdxY * height_w + dst_y;
    output_addr[pointIdx] = input_addr[src_coordinate];
  }
}

template <typename T>
__global__ void ConvertGradientBackKernel(const size_t size, const size_t height_h, const size_t height_w,
                                          const size_t batchwidth, const size_t width, T *input_addr,
                                          T *output_addr) {
  for (size_t pointIdx = blockIdx.x * blockDim.x + threadIdx.x; pointIdx < (size);
       pointIdx += blockDim.x * gridDim.x) {
    size_t dst_batchIdx = pointIdx / (height_h * height_w);
    size_t dst_batchIdxX = dst_batchIdx / batchwidth;
    size_t dst_batchIdxY = dst_batchIdx % batchwidth;
    size_t dst_x = (pointIdx - dst_batchIdx * height_h * height_w) / height_w;
    size_t dst_y = (pointIdx - dst_batchIdx * height_h * height_w) % height_w;
    size_t src_coordinate = dst_batchIdxX * height_h * width + dst_x * width + dst_batchIdxY * height_w + dst_y;
    output_addr[src_coordinate] = input_addr[pointIdx];
  }
}

template <typename T>
__global__ void ConvertGradientBackKernel(const size_t size, const size_t height_h, const size_t height_w,
                                          const size_t ori_h, const size_t ori_w, const size_t batchwidth,
                                          const size_t width, T *input_addr, T *output_addr) {
  for (size_t pointIdx = blockIdx.x * blockDim.x + threadIdx.x; pointIdx < (size);
       pointIdx += blockDim.x * gridDim.x) {
    size_t dst_batchIdx = pointIdx / (height_h * height_w);
    size_t dst_batchIdxX = dst_batchIdx / batchwidth;
    size_t dst_batchIdxY = dst_batchIdx % batchwidth;
    size_t dst_x = (pointIdx - dst_batchIdx * height_h * height_w) / height_w;
    size_t dst_y = (pointIdx - dst_batchIdx * height_h * height_w) % height_w;
    size_t src_x = dst_batchIdxX * height_h + dst_x;
    size_t src_y = dst_batchIdxY * height_w + dst_y;
    if (src_x < ori_h && src_y < ori_w) {
      size_t src_coordinate = src_x * ori_w + src_y;
      output_addr[src_coordinate] = input_addr[pointIdx];
    }
  }
}

template <typename T>
void ConvertGradient(const size_t size, const size_t height_h, const size_t height_w, const size_t batchwidth,
                     const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream) {
  ConvertGradientKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, height_h, height_w, batchwidth,
                                                                           width, input_addr, output_addr);
}

template <typename T>
void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, const size_t batchwidth,
                         const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream) {
  ConvertGradientBackKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, height_h, height_w, batchwidth,
                                                                               width, input_addr, output_addr);
}

template <typename T>
void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, const size_t ori_h,
                         const size_t ori_w, const size_t batchwidth, const size_t width, T *input_addr,
                         T *output_addr, cudaStream_t cuda_stream) {
  ConvertGradientBackKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, height_h, height_w, ori_h,
                                                                               ori_w, batchwidth, width, input_addr,
                                                                               output_addr);
}

template void ConvertGradient<float>(const size_t size, const size_t height_h, const size_t height_w,
                                     const size_t batchwidth, const size_t width, float *input_addr,
                                     float *output_addr, cudaStream_t cuda_stream);

template void ConvertGradientBack<float>(const size_t size, const size_t height_h, const size_t height_w,
                                         const size_t batchwidth, const size_t width, float *input_addr,
                                         float *output_addr, cudaStream_t cuda_stream);

template void ConvertGradientBack<float>(const size_t size, const size_t height_h, const size_t height_w,
                                         const size_t ori_h, const size_t ori_w, const size_t batchwidth,
                                         const size_t width, float *input_addr, float *output_addr,
                                         cudaStream_t cuda_stream);
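The two kernels above implement a pure layout transform: ConvertGradient repacks a flat (batch_h*h) x (batch_w*w) row-major matrix into batch_h*batch_w contiguous h x w tiles so that a strided-batched GEMM can treat each tile as one batch element, and ConvertGradientBack undoes the repacking (its second overload also crops away the padded rows and columns). As a quick cross-check of the index arithmetic, here is a minimal NumPy sketch of the same mapping; the function names and the reshape/transpose formulation are illustrative only, not part of the commit.

import numpy as np

def convert_gradient(flat, batch_h, batch_w, h, w):
    # (batch_h*h, batch_w*w) row-major matrix -> (batch_h*batch_w, h, w) contiguous tiles.
    return flat.reshape(batch_h, h, batch_w, w).transpose(0, 2, 1, 3).reshape(batch_h * batch_w, h, w)

def convert_gradient_back(tiles, batch_h, batch_w, h, w, ori_h=None, ori_w=None):
    # Inverse repacking; with (ori_h, ori_w) given, also crop the zero padding.
    flat = tiles.reshape(batch_h, batch_w, h, w).transpose(0, 2, 1, 3).reshape(batch_h * h, batch_w * w)
    if ori_h is not None and ori_w is not None:
        flat = flat[:ori_h, :ori_w]
    return flat

# Round trip on a 2048 x 1024 gradient split into 128 x 128 tiles (a 16 x 8 tile grid).
g = np.arange(2048 * 1024, dtype=np.float32).reshape(2048, 1024)
tiles = convert_gradient(g, 16, 8, 128, 128)
assert np.array_equal(convert_gradient_back(tiles, 16, 8, 128, 128), g)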
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/convert_gradient_impl.cuh (new file, mode 100644)
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CONVERTGRADIENT_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CONVERTGRADIENT_H_
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
void ConvertGradient(const size_t size, const size_t height_h, const size_t height_w, const size_t batchwidth,
                     const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream);

template <typename T>
void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, const size_t batchwidth,
                         const size_t width, T *input_addr, T *output_addr, cudaStream_t cuda_stream);

template <typename T>
void ConvertGradientBack(const size_t size, const size_t height_h, const size_t height_w, const size_t ori_h,
                         const size_t ori_w, const size_t batchwidth, const size_t width, T *input_addr,
                         T *output_addr, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_CONVERTGRADIENT_H_
mindspore/ccsrc/backend/kernel_compiler/gpu/math/update_thor_gradient.cc (new file, mode 100644)
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/math/update_thor_gradient.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(UpdateThorGradient,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      UpdateThorGradientGpuKernel, float)
}  // namespace kernel
}  // namespace mindspore
mindspore/ccsrc/backend/kernel_compiler/gpu/math/update_thor_gradient.h (new file, mode 100644)
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_UPDATE_THOR_GRADIENT_GPU_KERNEL_H
#define MINDSPORE_UPDATE_THOR_GRADIENT_GPU_KERNEL_H
#include <cublas_v2.h>
#include <cuda_runtime_api.h>
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/kernel_constants.h"
#include "backend/kernel_compiler/gpu/cuda_impl/convert_gradient_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/pad_impl.cuh"
#include "utils/convert_utils.h"
namespace mindspore {
namespace kernel {
struct GradientSize {
  size_t batch_h;
  size_t batch_w;
  size_t h;
  size_t w;
  size_t ori_h;
  size_t ori_w;
  size_t pad_h;
  size_t pad_w;
  bool need_convert;
  cudaDataType_t dtype;
};

template <typename T>
class UpdateThorGradientGpuKernel : public GpuKernel {
 public:
  UpdateThorGradientGpuKernel() : split_dim(128) {}
  ~UpdateThorGradientGpuKernel() = default;
  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto input1_addr = GetDeviceAddress<T>(inputs, 0);
    auto input2_addr = GetDeviceAddress<T>(inputs, 1);
    auto input3_addr = GetDeviceAddress<T>(inputs, 2);
    auto workspace1_addr = GetDeviceAddress<T>(workspace, 0);
    T *workspace2_addr = nullptr;
    T *workspace3_addr = nullptr;
    if (gradient_size.need_convert) {
      workspace2_addr = GetDeviceAddress<T>(workspace, 1);
      workspace3_addr = GetDeviceAddress<T>(workspace, 2);
    }
    T *workspace4_addr = nullptr;
    auto output_addr = GetDeviceAddress<T>(outputs, 0);
    if (gradient_size.pad_h != 0 || gradient_size.pad_w != 0) {
      workspace4_addr = GetDeviceAddress<T>(workspace, 3);
      const size_t size = (gradient_size.ori_h + gradient_size.pad_h) * (gradient_size.ori_w + gradient_size.pad_w);
      CalPad(size, input2_addr, 1, 1, gradient_size.ori_h, gradient_size.ori_w,
             gradient_size.ori_h + gradient_size.pad_h, gradient_size.ori_w + gradient_size.pad_w, 0, 0, 0.0,
             workspace4_addr, reinterpret_cast<cudaStream_t>(stream_ptr));
      cudaMemsetAsync(workspace1_addr, 0,
                      gradient_size.w * gradient_size.h * gradient_size.batch_w * gradient_size.batch_h * sizeof(T),
                      reinterpret_cast<cudaStream_t>(stream_ptr));
      input2_addr = workspace4_addr;
    }
    const float alpha = 1;
    const float beta = 0;
    const int lda = SizeToInt(gradient_size.h);
    const int ldb = SizeToInt(gradient_size.ori_w + gradient_size.pad_w);
    const int ldc = SizeToInt(gradient_size.ori_w + gradient_size.pad_w);
    auto stride_a = SizeToInt(gradient_size.h * gradient_size.h);
    auto stride_b = SizeToInt(gradient_size.h * (gradient_size.ori_w + gradient_size.pad_w));
    auto stride_c = SizeToInt(gradient_size.h * (gradient_size.ori_w + gradient_size.pad_w));
    try {
      CHECK_CUBLAS_RET_WITH_EXCEPT(
        cublasGemmStridedBatchedEx(handle_, CUBLAS_OP_N, CUBLAS_OP_N, SizeToInt(gradient_size.ori_w),
                                   SizeToInt(gradient_size.h), SizeToInt(gradient_size.h), &alpha, input2_addr,
                                   gradient_size.dtype, ldb, stride_b, input1_addr, gradient_size.dtype, lda,
                                   stride_a, &beta, workspace1_addr, gradient_size.dtype, ldc, stride_c,
                                   gradient_size.batch_h, CUDA_R_32F, algo_),
        "cublasSgemm Call Fail");
    } catch (const std::exception &e) {
      MS_LOG(EXCEPTION) << "Encountered an exception: " << e.what() << " when invoking cublasGemmStridedBatchedEx";
    }
    auto r_input_addr = workspace1_addr;
    if (gradient_size.need_convert) {
      size_t size = gradient_size.batch_w * gradient_size.batch_h * gradient_size.w * gradient_size.h;
      ConvertGradient(size, gradient_size.h, gradient_size.w, gradient_size.batch_w,
                      gradient_size.batch_w * gradient_size.w, workspace1_addr, workspace2_addr,
                      reinterpret_cast<cudaStream_t>(stream_ptr));
      r_input_addr = workspace2_addr;
    }

    const int lda_r = SizeToInt(gradient_size.w);
    const int ldb_r = SizeToInt(gradient_size.w);
    const int ldc_r = SizeToInt(gradient_size.w);
    stride_a = SizeToInt(gradient_size.h * gradient_size.w);
    stride_b = SizeToInt(gradient_size.w * gradient_size.w);
    stride_c = SizeToInt(gradient_size.h * gradient_size.w);
    auto r_output_addr = output_addr;
    if (gradient_size.need_convert) {
      r_output_addr = workspace3_addr;
    }
    CHECK_CUBLAS_RET_WITH_EXCEPT(
      cublasGemmStridedBatchedEx(handle_, CUBLAS_OP_N, CUBLAS_OP_N, SizeToInt(gradient_size.w),
                                 SizeToInt(gradient_size.h), SizeToInt(gradient_size.w), &alpha, input3_addr,
                                 gradient_size.dtype, ldb_r, stride_b, r_input_addr, gradient_size.dtype, lda_r,
                                 stride_a, &beta, r_output_addr, gradient_size.dtype, ldc_r, stride_c,
                                 gradient_size.batch_h * gradient_size.batch_w, CUDA_R_32F, algo_),
      "cublasSgemm Call Fail");
    if (gradient_size.need_convert) {
      size_t size = gradient_size.batch_w * gradient_size.batch_h * gradient_size.w * gradient_size.h;
      if (gradient_size.pad_h == 0 && gradient_size.pad_w == 0) {
        ConvertGradientBack(size, gradient_size.h, gradient_size.w, gradient_size.batch_w,
                            gradient_size.batch_w * gradient_size.w, r_output_addr, output_addr,
                            reinterpret_cast<cudaStream_t>(stream_ptr));
      } else {
        ConvertGradientBack(size, gradient_size.h, gradient_size.w, gradient_size.ori_h, gradient_size.ori_w,
                            gradient_size.batch_w, gradient_size.ori_w, r_output_addr, output_addr,
                            reinterpret_cast<cudaStream_t>(stream_ptr));
      }
    }
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
    handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
    SetProperty(kernel_node);
    InitSizeLists();
    return true;
  }
 protected:
  void InitSizeLists() override {
    size_t unit_size = sizeof(T);
    size_t input_size_ = gradient_size.h * gradient_size.h * gradient_size.batch_h * unit_size;
    input_size_list_.push_back(input_size_);
    input_size_ = gradient_size.ori_h * gradient_size.ori_w * unit_size;
    input_size_list_.push_back(input_size_);
    input_size_ = gradient_size.w * gradient_size.w * gradient_size.batch_w * unit_size;
    input_size_list_.push_back(input_size_);
    size_t output_size = gradient_size.ori_h * gradient_size.ori_w * unit_size;
    output_size_list_.push_back(output_size);

    size_t workspace_size_ = 0;
    workspace_size_ = gradient_size.w * gradient_size.h * gradient_size.batch_w * gradient_size.batch_h * unit_size;
    workspace_size_list_.push_back(workspace_size_);
    if (gradient_size.need_convert) {
      workspace_size_ = gradient_size.w * gradient_size.h * gradient_size.batch_w * gradient_size.batch_h * unit_size;
      workspace_size_list_.push_back(workspace_size_);
      workspace_size_ = gradient_size.w * gradient_size.h * gradient_size.batch_w * gradient_size.batch_h * unit_size;
      workspace_size_list_.push_back(workspace_size_);
    }
    if (gradient_size.pad_h != 0 || gradient_size.pad_w != 0) {
      workspace_size_ =
        (gradient_size.ori_w + gradient_size.pad_w) * (gradient_size.ori_h + gradient_size.pad_h) * unit_size;
      workspace_size_list_.push_back(workspace_size_);
    }
  }
 private:
  void SetProperty(const CNodePtr &kernel_node) {
    auto matrix_a_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto gradient_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto matrix_g_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    split_dim = size_t(GetAttr<int>(kernel_node, "split_dim"));
    gradient_size.batch_h = gradient_shape[0] / split_dim;
    gradient_size.batch_w = gradient_shape[1] / split_dim;
    if (gradient_size.batch_h * split_dim != gradient_shape[0]) {
      gradient_size.batch_h += 1;
      if (gradient_shape[0] > split_dim) {
        gradient_size.h = split_dim;
        gradient_size.pad_h = gradient_size.batch_h * split_dim - gradient_shape[0];
      } else {
        gradient_size.h = gradient_shape[0];
        gradient_size.pad_h = 0;
      }
    } else {
      gradient_size.h = split_dim;
      gradient_size.pad_h = 0;
    }

    if (gradient_size.batch_w * split_dim != gradient_shape[1]) {
      gradient_size.batch_w += 1;
      if (gradient_shape[1] > split_dim) {
        gradient_size.w = split_dim;
        gradient_size.pad_w = gradient_size.batch_w * split_dim - gradient_shape[1];
      } else {
        gradient_size.w = gradient_shape[1];
        gradient_size.pad_w = 0;
      }
    } else {
      gradient_size.w = split_dim;
      gradient_size.pad_w = 0;
    }

    if (gradient_size.batch_w * gradient_size.w <= split_dim) {
      gradient_size.need_convert = false;
    } else {
      gradient_size.need_convert = true;
    }
    gradient_size.ori_w = gradient_shape[1];
    gradient_size.ori_h = gradient_shape[0];
    gradient_size.dtype = GetCudaDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 1)));
  }
  size_t split_dim;
  struct GradientSize gradient_size;
  cublasHandle_t handle_;
  cublasGemmAlgo_t algo_ = CUBLAS_GEMM_DEFAULT;
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
};
}  // namespace kernel
}  // namespace mindspore
#endif
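Reading the Launch path above as a whole: the gradient is zero-padded to a multiple of split_dim if needed, the first strided-batched GEMM multiplies each 128-row slab by the matching block of input 1 (the A-factor blocks), ConvertGradient splits the result into 128x128 tiles so the second strided-batched GEMM can right-multiply each tile by its block of input 3 (the G-factor blocks), and ConvertGradientBack stitches the tiles back together and crops the padding. The NumPy sketch below is one reading of those cuBLAS calls, written only as an illustration; the shapes follow the UpdateThorGradient docstring further down, and none of these helper names exist in the commit.

import numpy as np

def update_thor_gradient_reference(x1, grad, x3, split_dim=128):
    # x1: (batch_h, d, d) blocks of the A factor; grad: (H, W) 1st-order gradient;
    # x3: (batch_h, batch_w, d, d) blocks of the G factor; d == split_dim.
    batch_h, batch_w, d = x1.shape[0], x3.shape[1], split_dim
    pad_h, pad_w = batch_h * d - grad.shape[0], batch_w * d - grad.shape[1]
    g = np.pad(grad, ((0, pad_h), (0, pad_w)))  # CalPad: zero-pad to the full tile grid
    # First batched GEMM: left-multiply each d-row slab by its A block.
    left = np.vstack([x1[i] @ g[i * d:(i + 1) * d, :] for i in range(batch_h)])
    # ConvertGradient + second batched GEMM: right-multiply each d x d tile by its G block;
    # ConvertGradientBack then stitches the tiles together and crops the padding.
    out = np.empty_like(g)
    for i in range(batch_h):
        for j in range(batch_w):
            out[i * d:(i + 1) * d, j * d:(j + 1) * d] = left[i * d:(i + 1) * d, j * d:(j + 1) * d] @ x3[i, j]
    return out[:grad.shape[0], :grad.shape[1]]

# Shapes from the docstring example: 2048 x 1024 gradient, 16 A blocks, 16 x 8 G blocks.
x1 = np.random.rand(16, 128, 128).astype(np.float32)
x2 = np.random.rand(2048, 1024).astype(np.float32)
x3 = np.random.rand(16, 8, 128, 128).astype(np.float32)
print(update_thor_gradient_reference(x1, x2, x3).shape)  # (2048, 1024)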
mindspore/ops/operations/__init__.py
...
...
@@ -83,10 +83,10 @@ from . import _quant_ops
 from ._quant_ops import *
 from .other_ops import (Assign, IOU, BoundingBoxDecode, BoundingBoxEncode, PopulationCount, CheckValid, MakeRefKey,
                         Partial, Depend, CheckBprop, Push, Pull)
-from .thor_ops import (CusBatchMatMul, CusCholeskyTrsm, CusFusedAbsMax1, CusImg2Col, CusMatMulCubeDenseLeft,
-                       CusMatMulCubeFraczRightMul, CusMatMulCube, CusMatrixCombine, CusTranspose02314,
-                       CusMatMulCubeDenseRight, CusMatMulCubeFraczLeftCast, Im2Col)
+from ._thor_ops import (CusBatchMatMul, CusCholeskyTrsm, CusFusedAbsMax1, CusImg2Col, CusMatMulCubeDenseLeft,
+                        CusMatMulCubeFraczRightMul, CusMatMulCube, CusMatrixCombine, CusTranspose02314,
+                        CusMatMulCubeDenseRight, CusMatMulCubeFraczLeftCast, Im2Col, UpdateThorGradient)
 from .sparse_ops import SparseToDense
 __all__ = [
...
...
mindspore/ops/operations/thor_ops.py → mindspore/ops/operations/_thor_ops.py (renamed)
...
...
@@ -562,3 +562,48 @@ class Im2Col(PrimitiveWithInfer):
        valid_types = [mstype.float16, mstype.float32]
        validator.check_tensor_type_same(args, valid_types, self.name)
        return x_dtype


class UpdateThorGradient(PrimitiveWithInfer):
    """
    Update Thor Gradient with Approximate Fisher info matrix (for GPU backend).

    The rank of input_x1 must be `3`, which indicates the A matrix.
    The rank of input_x2 must be `2`, which indicates the 1st-order gradient.
    The rank of input_x3 must be `4`, which indicates the G matrix.

    Inputs:
        - **input_x1** (Tensor) - The first input is the diag part of the cov matrix of feature map.
          Supported dtype [float32].
        - **input_x2** (Tensor) - The second input is the corresponding 1st-order grad. Supported dtype [float32].
        - **input_x3** (Tensor) - The third input is the diag part of the cov matrix of dout. Supported dtype [float32].

    Outputs:
        Tensor, the shape is the same as the shape of input_x2, it will be used to update the weights.

    Examples:
        >>> input_x1 = Tensor(np.random.rand(16, 128, 128).astype(np.float32))
        >>> input_x2 = Tensor(np.random.rand(2048, 1024).astype(np.float32))
        >>> temp_x3 = np.random.rand(8, 128, 128).astype(np.float32)
        >>> input_x3 = np.zeros((16, 8, 128, 128)).astype(np.float32)
        >>> for i in range(16):
        >>>     input_x3[i, :, :, :] = temp_x3
        >>> input_x3 = Tensor(input_x3)
        >>> update_thor_gradient = P.UpdateThorGradient(split_dim=128)
        >>> output = update_thor_gradient(input_x1, input_x2, input_x3)
    """

    @prim_attr_register
    def __init__(self, split_dim=0):
        """init UpdateThorGradient"""
        self.init_prim_io_names(inputs=['x1', 'x2', 'x3'], outputs=['y'])
        self.split_dim = split_dim
        self.add_prim_attr('split_dim', self.split_dim)

    def infer_shape(self, x1_shape, x2_shape, x3_shape):
        return x2_shape

    def infer_dtype(self, x1_dtype, x2_dtype, x3_dtype):
        validator.check_tensor_type_same({'x1_dtype': x1_dtype, 'x2_dtype': x2_dtype, 'x3_dtype': x3_dtype},
                                         [mstype.float32], self.name)
        return x2_dtype
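For completeness, here is the docstring example made self-contained (imports and context setup added). It assumes a GPU build of MindSpore that contains this commit; the primitive is invoked directly as in the docstring, which ordinarily requires PyNative mode, and none of this is part of the diff itself.

import numpy as np
from mindspore import Tensor, context
from mindspore.ops import operations as P

context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")

input_x1 = Tensor(np.random.rand(16, 128, 128).astype(np.float32))    # A-factor blocks
input_x2 = Tensor(np.random.rand(2048, 1024).astype(np.float32))      # 1st-order gradient
input_x3 = Tensor(np.tile(np.random.rand(8, 128, 128).astype(np.float32), (16, 1, 1, 1)))  # G-factor blocks
update_thor_gradient = P.UpdateThorGradient(split_dim=128)
output = update_thor_gradient(input_x1, input_x2, input_x3)           # same shape as input_x2: (2048, 1024)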