Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Oneflow-Inc
oneflow
提交
7a4151e5
O
oneflow
项目概览
Oneflow-Inc
/
oneflow
上一次同步 接近 3 年
通知
13
Star
2733
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
oneflow
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
7a4151e5
编写于
11月 01, 2021
作者:
X
Xiaoyu Xu
提交者:
GitHub
11月 01, 2021
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'master' into fea/graph_op_debug
上级
3c6d7c99
f88c979a
变更
7
隐藏空白更改
内联
并排
Showing 7 changed files with 179 additions and 5 deletions
+179
-5
ci/test/test_speed_multi_client.sh
ci/test/test_speed_multi_client.sh
+4
-4
oneflow/core/autograd/gradient_funcs/partial_fc_sample.cpp
oneflow/core/autograd/gradient_funcs/partial_fc_sample.cpp
+78
-0
oneflow/core/functional/functional_api.yaml
oneflow/core/functional/functional_api.yaml
+11
-0
oneflow/core/functional/impl/nn_functor.cpp
oneflow/core/functional/impl/nn_functor.cpp
+44
-0
oneflow/user/kernels/partial_fc_sample_kernel.cu
oneflow/user/kernels/partial_fc_sample_kernel.cu
+4
-1
python/oneflow/__init__.py
python/oneflow/__init__.py
+1
-0
python/oneflow/test/modules/test_parital_fc.py
python/oneflow/test/modules/test_parital_fc.py
+37
-0
未找到文件。
ci/test/test_speed_multi_client.sh
浏览文件 @
7a4151e5
...
...
@@ -18,13 +18,13 @@ function write_to_file_and_print {
python3 scripts/compare_speed_with_pytorch.py resnet50/models/resnet50.py resnet50 16x3x224x224
--no-show-memory
--times
100 | check_relative_speed 1.01 | write_to_file_and_print
python3 scripts/compare_speed_with_pytorch.py resnet50/models/resnet50.py resnet50 8x3x224x224
--no-show-memory
--times
100 | check_relative_speed 1.05 | write_to_file_and_print
python3 scripts/compare_speed_with_pytorch.py resnet50/models/resnet50.py resnet50 4x3x224x224
--no-show-memory
--times
200 | check_relative_speed 1.0
5
| write_to_file_and_print
python3 scripts/compare_speed_with_pytorch.py resnet50/models/resnet50.py resnet50 2x3x224x224
--no-show-memory
--times
200 | check_relative_speed 1.0
9
| write_to_file_and_print
python3 scripts/compare_speed_with_pytorch.py resnet50/models/resnet50.py resnet50 1x3x224x224
--no-show-memory
--times
200 | check_relative_speed 0.9
5
| write_to_file_and_print
python3 scripts/compare_speed_with_pytorch.py resnet50/models/resnet50.py resnet50 4x3x224x224
--no-show-memory
--times
200 | check_relative_speed 1.0
1
| write_to_file_and_print
python3 scripts/compare_speed_with_pytorch.py resnet50/models/resnet50.py resnet50 2x3x224x224
--no-show-memory
--times
200 | check_relative_speed 1.0
6
| write_to_file_and_print
python3 scripts/compare_speed_with_pytorch.py resnet50/models/resnet50.py resnet50 1x3x224x224
--no-show-memory
--times
200 | check_relative_speed 0.9
4
| write_to_file_and_print
python3
-m
oneflow.distributed.launch
--nproc_per_node
2 scripts/compare_speed_with_pytorch.py resnet50/models/resnet50.py resnet50 16x3x224x224
--no-show-memory
--times
100
--ddp
| check_relative_speed 0.99 | write_to_file_and_print
python3
-m
oneflow.distributed.launch
--nproc_per_node
2 scripts/compare_speed_with_pytorch.py resnet50/models/resnet50.py resnet50 8x3x224x224
--no-show-memory
--times
100
--ddp
| check_relative_speed 0.99 | write_to_file_and_print
python3
-m
oneflow.distributed.launch
--nproc_per_node
2 scripts/compare_speed_with_pytorch.py resnet50/models/resnet50.py resnet50 4x3x224x224
--no-show-memory
--times
200
--ddp
| check_relative_speed 0.9
3
| write_to_file_and_print
python3
-m
oneflow.distributed.launch
--nproc_per_node
2 scripts/compare_speed_with_pytorch.py resnet50/models/resnet50.py resnet50 4x3x224x224
--no-show-memory
--times
200
--ddp
| check_relative_speed 0.9
1
| write_to_file_and_print
python3
-m
oneflow.distributed.launch
--nproc_per_node
2 scripts/compare_speed_with_pytorch.py resnet50/models/resnet50.py resnet50 2x3x224x224
--no-show-memory
--times
200
--ddp
| check_relative_speed 0.83 | write_to_file_and_print
python3
-m
oneflow.distributed.launch
--nproc_per_node
2 scripts/compare_speed_with_pytorch.py resnet50/models/resnet50.py resnet50 1x3x224x224
--no-show-memory
--times
200
--ddp
| check_relative_speed 0.82 | write_to_file_and_print
...
...
oneflow/core/autograd/gradient_funcs/partial_fc_sample.cpp
0 → 100644
浏览文件 @
7a4151e5
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/attr_map.h"
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/functional/functional.h"
namespace
oneflow
{
namespace
one
{
// State captured during the forward pass of "distributed_partial_fc_sample"
// and consumed by PartialFCSample::Apply during the backward pass.
struct PartialFCSampleState : public AutoGradCaptureState {
  // True when the first forward input requires a gradient; set in Capture.
  bool requires_grad{false};
  // Position of the saved sampled_label tensor, as returned by
  // SaveTensorForBackward; stays -1 until Capture stores the tensor.
  int32_t index_sampled_label{-1};
  // Position of the saved weight tensor; stays -1 until Capture stores it.
  int32_t index_weight{-1};
};
// Gradient function for the "distributed_partial_fc_sample" user op.
class PartialFCSample : public OpExprGradFunction<PartialFCSampleState> {
 public:
  // Reads the base attributes from the forward op expression.
  Maybe<void> Init(const OpExpr& op) override;

  // Records in `ctx` whether a gradient is needed and, if so, saves the
  // tensors the backward pass reads.
  Maybe<void> Capture(PartialFCSampleState* ctx, const TensorTuple& inputs,
                      const TensorTuple& outputs, const AttrMap& attrs) const override;

  // Computes the input gradients from `out_grads` and the state in `ctx`.
  Maybe<void> Apply(const PartialFCSampleState* ctx, const TensorTuple& out_grads,
                    TensorTuple* in_grads) const override;

 private:
  // Attributes taken from the forward op's proto in Init.
  AttrMap base_attrs_;
};
// Caches the forward op's attributes.
// Returns an error when `op` is not a UserOpExpr.
Maybe<void> PartialFCSample::Init(const OpExpr& op) {
  const auto* user_op_expr = dynamic_cast<const UserOpExpr*>(&op);
  CHECK_NOTNULL_OR_RETURN(user_op_expr);
  base_attrs_ = MakeAttrMapFromUserOpConf(user_op_expr->proto());
  return Maybe<void>::Ok();
}
// Saves what the backward pass needs.
//
// `inputs` are the forward inputs (weight, label) and `outputs` are the
// forward outputs (mapped_label, sampled_label, sampled_weight). When the
// weight does not require a gradient nothing is saved and the indices in
// `ctx` keep their -1 defaults.
Maybe<void> PartialFCSample::Capture(PartialFCSampleState* ctx, const TensorTuple& inputs,
                                     const TensorTuple& outputs, const AttrMap& attrs) const {
  // Validate arity up front so a malformed call is reported through Maybe
  // rather than as an out-of-range exception from the .at() calls below.
  CHECK_EQ_OR_RETURN(inputs.size(), 2);
  CHECK_EQ_OR_RETURN(outputs.size(), 3);

  ctx->requires_grad = inputs.at(0)->requires_grad();
  if (!ctx->requires_grad) { return Maybe<void>::Ok(); }

  ctx->index_sampled_label = ctx->SaveTensorForBackward(outputs.at(1));  // sampled_label
  ctx->index_weight = ctx->SaveTensorForBackward(inputs.at(0));          // weight
  return Maybe<void>::Ok();
}
// Computes the gradient of the weight input.
//
// `out_grads` must hold one entry per forward output (mapped_label,
// sampled_label, sampled_weight); only the sampled_weight gradient is read.
// `in_grads` receives one slot per forward input (weight, label); the label
// slot is left empty.
Maybe<void> PartialFCSample::Apply(const PartialFCSampleState* ctx, const TensorTuple& out_grads,
                                   TensorTuple* in_grads) const {
  CHECK_EQ_OR_RETURN(out_grads.size(), 3);
  // The forward op has two inputs, so expose two gradient slots. Sizing this
  // to 1 would leave no slot for `label` if the caller indexes in_grads by
  // input position.
  in_grads->resize(2);
  if (!ctx->requires_grad) { return Maybe<void>::Ok(); }

  const auto& diff_sampled_weight = out_grads.at(2);  // diff of sampled_weight
  const auto& saved_tensors = ctx->SavedTensors();
  const auto& sampled_label = saved_tensors.at(ctx->index_sampled_label);
  const auto& weight = saved_tensors.at(ctx->index_weight);

  // Pass the sampled-weight gradient and sampled labels through the
  // disable-boxing op; it returns (sampled_weight_diff, sampled_label).
  const auto& boxing_disabled = JUST(
      functional::DistributedPariticalFCSampleDisableBoxing(diff_sampled_weight, sampled_label));

  // Accumulate the sampled rows into a tensor shaped like `weight`,
  // segmenting along axis 0 by the sampled labels.
  const auto& weight_grad = JUST(functional::UnsortedSegmentSumLike(
      boxing_disabled->at(0), boxing_disabled->at(1), weight, 0));

  in_grads->at(0) = weight_grad;
  return Maybe<void>::Ok();
}
// Registers PartialFCSample as the gradient function for the op type name
// "distributed_partial_fc_sample".
REGISTER_OP_EXPR_GRAD_FUNCTION
(
"distributed_partial_fc_sample"
,
PartialFCSample
);
}
// namespace one
}
// namespace oneflow
oneflow/core/functional/functional_api.yaml
浏览文件 @
7a4151e5
...
...
@@ -1511,6 +1511,17 @@
signature
:
"
TensorTuple
(Tensor
log_probs,
Tensor
input_lengths,
Bool
merge_repeated=True)
=>
CtcGreedyDecoder"
bind_python
:
True
# Functional API entry for partial-FC sampling: takes (weight, label, num_sample),
# returns a TensorTuple, and is exposed to Python (bind_python: True).
# The functor name after "=>" must match the name the functor is registered under.
-
name
:
"
distributed_partial_fc_sample"
signature
:
"
TensorTuple
(Tensor
weight,
Tensor
label,
Int64
num_sample)
=>
DistributedPariticalFCSample"
bind_python
:
True
# Functional API entry taking (sampled_weight_diff, sampled_label) and returning
# a TensorTuple; not exposed to Python (bind_python: False).
# The functor name after "=>" must match the name the functor is registered under.
-
name
:
"
distributed_partial_fc_sample_disable_boxing"
signature
:
"
TensorTuple
(Tensor
sampled_weight_diff,
Tensor
sampled_label)
=>
DistributedPariticalFCSampleDisableBoxing"
bind_python
:
False
-
name
:
"
meshgrid"
signature
:
"
TensorTuple
(TensorTuple
tensors)
=>
Meshgrid"
bind_python
:
True
oneflow/core/functional/impl/nn_functor.cpp
浏览文件 @
7a4151e5
...
...
@@ -1872,6 +1872,48 @@ class CtcGreedyDecoderFunctor {
std
::
shared_ptr
<
OpExpr
>
op_
;
};
// Functor that dispatches the "distributed_partial_fc_sample" user op.
class PartialFCSampleFunctor {
 public:
  // Builds the op expression once: inputs (weight, label), outputs
  // (mapped_label, sampled_label, sampled_weight).
  PartialFCSampleFunctor() {
    op_ = CHECK_JUST(one::OpBuilder("distributed_partial_fc_sample")
                         .Input("weight")
                         .Input("label")
                         .Output("mapped_label")
                         .Output("sampled_label")
                         .Output("sampled_weight")
                         .Build());
  }

  // Runs the op on (weight, label) with the "num_sample" attribute set to
  // `num_sample` and returns the op's outputs as a TensorTuple.
  Maybe<TensorTuple> operator()(const std::shared_ptr<one::Tensor>& weight,
                                const std::shared_ptr<one::Tensor>& label,
                                const int64_t& num_sample) const {
    MutableAttrMap op_attrs;
    JUST(op_attrs.SetAttr<int64_t>("num_sample", num_sample));
    return OpInterpUtil::Dispatch<TensorTuple>(*op_, {weight, label}, op_attrs);
  }

 private:
  std::shared_ptr<OpExpr> op_;
};
// Functor that dispatches the "distributed_partial_fc_sample_disable_boxing"
// user op. The class name keeps its existing spelling because the functor
// registration refers to it by this name.
class PariticalFCSampleDisableBoxing {
 public:
  // Builds the op expression once: inputs (sampled_weight_diff,
  // sampled_label), outputs (boxing_disabled_sampled_weight_diff,
  // boxing_disabled_sampled_label).
  PariticalFCSampleDisableBoxing() {
    op_ = CHECK_JUST(one::OpBuilder("distributed_partial_fc_sample_disable_boxing")
                         .Input("sampled_weight_diff")
                         .Input("sampled_label")
                         .Output("boxing_disabled_sampled_weight_diff")
                         .Output("boxing_disabled_sampled_label")
                         .Build());
  }

  // Runs the op on the two tensors and returns its outputs as a TensorTuple.
  Maybe<TensorTuple> operator()(const std::shared_ptr<one::Tensor>& sampled_weight_diff,
                                const std::shared_ptr<one::Tensor>& sampled_label) const {
    return OpInterpUtil::Dispatch<TensorTuple>(*op_, {sampled_weight_diff, sampled_label});
  }

 private:
  std::shared_ptr<OpExpr> op_;
};
}
// namespace impl
ONEFLOW_FUNCTION_LIBRARY
(
m
)
{
...
...
@@ -1932,6 +1974,8 @@ ONEFLOW_FUNCTION_LIBRARY(m) {
m
.
add_functor
<
impl
::
FusedBiasAddDropoutFunctor
>
(
"FusedBiasAddDropout"
);
m
.
add_functor
<
impl
::
FusedScaleTrilFunctor
>
(
"FusedScaleTril"
);
m
.
add_functor
<
impl
::
CtcGreedyDecoderFunctor
>
(
"CtcGreedyDecoder"
);
m
.
add_functor
<
impl
::
PartialFCSampleFunctor
>
(
"DistributedPariticalFCSample"
);
m
.
add_functor
<
impl
::
PariticalFCSampleDisableBoxing
>
(
"DistributedPariticalFCSampleDisableBoxing"
);
};
}
// namespace functional
...
...
oneflow/user/kernels/partial_fc_sample_kernel.cu
浏览文件 @
7a4151e5
...
...
@@ -152,7 +152,10 @@ class DistributedPartialFcSampleOpKernelState final : public user_op::OpKernelSt
SetupKernel
<<<
BlocksNum4ThreadsNum
(
num_classes
),
kCudaThreadsNumPerBlock
,
0
,
ctx
->
cuda_stream
()
>>>
(
seed
,
curand_states_
);
}
// Frees curand_states_ and passes the cudaFree status to OF_CUDA_CHECK.
// NOTE(review): this destructor appears twice in this view; the hunk header
// (-152,7 +152,10) suggests this form is the pre-change side of the diff,
// superseded by the definition that follows - confirm against the real file.
~
DistributedPartialFcSampleOpKernelState
()
{
OF_CUDA_CHECK
(
cudaFree
(
curand_states_
));
};
// Frees curand_states_. A cudaErrorCudartUnloading status from cudaFree is
// ignored; any other status is passed to OF_CUDA_CHECK.
// NOTE(review): presumably this tolerates destruction during process
// shutdown, after the CUDA runtime has begun unloading - confirm.
~
DistributedPartialFcSampleOpKernelState
()
{
cudaError_t
ret
=
cudaFree
(
curand_states_
);
if
(
ret
!=
cudaErrorCudartUnloading
)
{
OF_CUDA_CHECK
(
ret
);
}
};
// Returns the stored lower_ value.
int64_t
lower
()
const
{
return
lower_
;
}
// Returns the stored upper_ value.
int64_t
upper
()
const
{
return
upper_
;
}
...
...
python/oneflow/__init__.py
浏览文件 @
7a4151e5
...
...
@@ -128,6 +128,7 @@ from oneflow._C import softplus
from
oneflow._C
import
tril
from
oneflow._C
import
triu
from
oneflow._C
import
pad
from
oneflow._C
import
distributed_partial_fc_sample
from
oneflow._C
import
transpose
from
oneflow._C
import
relu
from
oneflow._C
import
softmax
...
...
python/oneflow/test/modules/test_parital_fc.py
0 → 100644
浏览文件 @
7a4151e5
"""
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import os
import unittest

from oneflow.test_utils.automated_test_util import *

import oneflow as flow
import oneflow.unittest
@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
class TestParitalFC(flow.unittest.TestCase):
    """Shape checks for ``flow.distributed_partial_fc_sample`` on CUDA placements."""

    def test_parital_fc(test_case):
        """Sample from a broadcast weight and check the three output shapes."""
        num_classes = 50000
        feature_dim = 128
        batch_size = 512
        num_sample = 5000

        p = flow.env.all_device_placement("cuda")
        w = flow.randn(
            num_classes, feature_dim, placement=p, sbp=flow.sbp.broadcast
        )
        label = flow.randint(
            0, num_classes, (batch_size,), placement=p, sbp=flow.sbp.broadcast
        )

        out = flow.distributed_partial_fc_sample(w, label, num_sample)

        # assertEqual reports both shapes on failure, whereas
        # assertTrue(a == b) only reports "False is not true".
        test_case.assertEqual(out[0].shape, flow.Size([batch_size]))
        test_case.assertEqual(out[1].shape, flow.Size([num_sample]))
        test_case.assertEqual(out[2].shape, flow.Size([num_sample, feature_dim]))
# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录