Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Oneflow-Inc
oneflow
提交
03295996
O
oneflow
项目概览
Oneflow-Inc
/
oneflow
上一次同步 2 年多
通知
13
Star
2733
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
oneflow
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
03295996
编写于
12月 09, 2017
作者:
W
willzhang4a58
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
blasaxpy for int
Former-commit-id:
2d8c7f9c
上级
a3ce4658
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
96 addition
and
134 deletion
+96
-134
oneflow/core/kernel/boxing_kernel.cpp
oneflow/core/kernel/boxing_kernel.cpp
+85
-90
oneflow/core/kernel/boxing_kernel.h
oneflow/core/kernel/boxing_kernel.h
+1
-32
oneflow/core/kernel/kernel_util.cpp
oneflow/core/kernel/kernel_util.cpp
+9
-11
oneflow/core/kernel/kernel_util.cu
oneflow/core/kernel/kernel_util.cu
+1
-1
未找到文件。
oneflow/core/kernel/boxing_kernel.cpp
浏览文件 @
03295996
...
...
@@ -5,28 +5,27 @@
namespace
oneflow
{
template
<
typename
T
>
void
BoxingKernel
<
T
>::
GetSumFromSrcBlobsToDstBlob
(
const
KernelCtx
&
ctx
,
std
::
function
<
Blob
*
(
const
std
::
string
&
)
>
BnInOp2Blob
,
const
std
::
vector
<
std
::
string
>&
src_bns
,
const
std
::
string
&
dst_bn
)
const
{
void
CalcSumOfBlobs
(
DeviceCtx
*
ctx
,
std
::
function
<
Blob
*
(
const
std
::
string
&
)
>
BnInOp2Blob
,
const
PbRpf
<
std
::
string
>&
src_bns
,
const
std
::
string
&
dst_bn
)
{
Blob
*
dst_blob
=
BnInOp2Blob
(
dst_bn
);
const
Blob
*
src_blob_0
=
BnInOp2Blob
(
src_bns
[
0
]);
Memcpy
<
DeviceType
::
kCPU
>
(
ctx
.
device_ctx
,
dst_blob
->
mut_memory_ptr
(),
src_blob_0
->
memory_ptr
(),
ctx
,
dst_blob
->
mut_memory_ptr
(),
src_blob_0
->
memory_ptr
(),
src_blob_0
->
TotalByteSize
(),
cudaMemcpyKind
::
cudaMemcpyHostToHost
);
FOR_RANGE
(
size_t
,
i
,
1
,
src_bns
.
size
())
{
Blob
*
src_blob_i
=
BnInOp2Blob
(
"in_"
+
std
::
to_string
(
i
));
KernelUtil
<
DeviceType
::
kCPU
,
T
>::
BlasAxpy
(
ctx
.
device_ctx
,
dst_blob
->
shape
().
elem_cnt
(),
1.0
,
src_blob_i
->
dptr
<
T
>
(),
1
,
dst_blob
->
mut_dptr
<
T
>
(),
1
);
KernelUtil
<
DeviceType
::
kCPU
,
T
>::
Axpy
(
ctx
,
dst_blob
->
shape
().
elem_cnt
(),
1.0
,
src_blob_i
->
dptr
<
T
>
(),
1
,
dst_blob
->
mut_dptr
<
T
>
(),
1
);
}
}
template
<
typename
T
>
void
BoxingKernel
<
T
>::
CopyDataId
(
const
KernelCtx
&
ctx
,
std
::
vector
<
Blob
*>&
src_blobs
,
std
::
vector
<
Blob
*>&
dst_blobs
,
const
int32_t
src_axis
,
const
int32_t
dst_axis
)
const
{
void
CopyDataId
(
DeviceCtx
*
ctx
,
PbRpf
<
Blob
*>&
src_blobs
,
PbRpf
<
Blob
*>&
dst_blobs
,
const
int32_t
src_axis
,
const
int32_t
dst_axis
)
{
size_t
data_id_bytesize
=
JobDesc
::
Singleton
()
->
SizeOfOneDataId
();
int64_t
src_idx
=
0
;
int64_t
dst_idx
=
0
;
...
...
@@ -38,8 +37,7 @@ void BoxingKernel<T>::CopyDataId(const KernelCtx& ctx,
int64_t
copy_num
=
std
::
min
(
src_dim
-
src_offset
,
dst_dim
-
dst_offset
);
Memcpy
<
DeviceType
::
kCPU
>
(
ctx
.
device_ctx
,
dst_blobs
[
dst_idx
]
->
mut_data_id
()
+
dst_offset
*
data_id_bytesize
,
ctx
,
dst_blobs
[
dst_idx
]
->
mut_data_id
()
+
dst_offset
*
data_id_bytesize
,
src_blobs
[
src_idx
]
->
data_id
()
+
src_offset
*
data_id_bytesize
,
copy_num
*
data_id_bytesize
,
cudaMemcpyKind
::
cudaMemcpyHostToHost
);
...
...
@@ -61,7 +59,7 @@ void BoxingKernel<T>::CopyDataId(const KernelCtx& ctx,
if
(
dst_axis
>
0
)
{
CHECK_EQ
(
dst_idx
,
1
);
FOR_RANGE
(
size_t
,
i
,
1
,
dst_blobs
.
size
())
{
Memcpy
<
DeviceType
::
kCPU
>
(
ctx
.
device_ctx
,
dst_blobs
[
i
]
->
mut_data_id
(),
Memcpy
<
DeviceType
::
kCPU
>
(
ctx
,
dst_blobs
[
i
]
->
mut_data_id
(),
dst_blobs
[
0
]
->
data_id
(),
dst_blobs
[
0
]
->
ByteSizeOfDataIdField
(),
cudaMemcpyKind
::
cudaMemcpyHostToHost
);
...
...
@@ -70,30 +68,28 @@ void BoxingKernel<T>::CopyDataId(const KernelCtx& ctx,
}
template
<
typename
T
>
void
BoxingKernel
<
T
>::
DoUnequalAxisCopy
(
const
KernelCtx
&
ctx
,
std
::
vector
<
Blob
*>&
src_blobs
,
std
::
vector
<
Blob
*>&
dst_blobs
,
const
BoxingInfo
&
src_info
,
const
BoxingInfo
&
dst_info
,
bool
need_swap
)
const
{
std
::
vector
<
int64_t
>
dst_blob_offset
(
dst_blobs
.
size
(),
0
);
void
DoUnequalAxisCopy
(
DeviceCtx
*
ctx
,
PbRpf
<
Blob
*>&
src_blobs
,
PbRpf
<
Blob
*>&
dst_blobs
,
const
BoxingInfo
&
src_info
,
const
BoxingInfo
&
dst_info
,
bool
need_swap
)
{
PbRpf
<
int64_t
>
dst_blob_offset
;
// (dst_blobs.size(), static_cast<int64_t>(0));
FOR_RANGE
(
size_t
,
src_idx
,
0
,
src_blobs
.
size
())
{
int64_t
dst_segs_in_src_seg
=
src_info
.
size_of_subseg
(
src_idx
)
/
dst_info
.
size_of_per_seg
();
src_info
.
size_of_subseg
(
src_idx
)
/
dst_info
.
one_seg_size
();
int64_t
src_seg_offset
=
0
;
FOR_RANGE
(
size_t
,
dst_seg_idx
,
0
,
dst_segs_in_src_seg
)
{
FOR_RANGE
(
size_t
,
dst_idx
,
0
,
dst_blobs
.
size
())
{
size_t
copy_bytesize
=
dst_info
.
size_of_subseg
(
dst_idx
)
*
sizeof
(
T
);
if
(
need_swap
)
{
Memcpy
<
DeviceType
::
kCPU
>
(
ctx
.
device_ctx
,
ctx
,
src_blobs
[
src_idx
]
->
mut_dptr
<
char
>
()
+
src_seg_offset
*
sizeof
(
T
),
dst_blobs
[
dst_idx
]
->
dptr
<
char
>
()
+
dst_blob_offset
[
dst_idx
]
*
sizeof
(
T
),
copy_bytesize
,
cudaMemcpyKind
::
cudaMemcpyHostToHost
);
}
else
{
Memcpy
<
DeviceType
::
kCPU
>
(
ctx
.
device_ctx
,
ctx
,
dst_blobs
[
dst_idx
]
->
mut_dptr
<
char
>
()
+
dst_blob_offset
[
dst_idx
]
*
sizeof
(
T
),
src_blobs
[
src_idx
]
->
dptr
<
char
>
()
+
src_seg_offset
*
sizeof
(
T
),
...
...
@@ -107,29 +103,25 @@ void BoxingKernel<T>::DoUnequalAxisCopy(const KernelCtx& ctx,
}
template
<
typename
T
>
void
BoxingKernel
<
T
>::
BoxingCopyForUnequalAxis
(
const
KernelCtx
&
ctx
,
std
::
vector
<
Blob
*>&
src_blobs
,
std
::
vector
<
Blob
*>&
dst_blobs
,
const
int32_t
src_axis
,
const
int32_t
dst_axis
)
const
{
const
BoxingKernelConf
&
kernel_conf
=
this
->
kernel_conf
().
boxing_conf
();
void
BoxingCopyForUnequalAxis
(
DeviceCtx
*
ctx
,
PbRpf
<
Blob
*>&
src_blobs
,
PbRpf
<
Blob
*>&
dst_blobs
,
const
int32_t
src_axis
,
const
int32_t
dst_axis
)
{
BoxingKernelConf
kernel_conf
;
//= this->kernel_conf().boxing_conf();
const
BoxingInfo
&
in_info
=
kernel_conf
.
in_info
();
const
BoxingInfo
&
out_info
=
kernel_conf
.
out_info
();
if
(
src_axis
>
dst_axis
)
{
CHECK_EQ
(
dst_axis
,
0
);
DoUnequalAxisCopy
(
ctx
,
dst_blobs
,
src_blobs
,
out_info
,
in_info
,
true
);
DoUnequalAxisCopy
<
T
>
(
ctx
,
dst_blobs
,
src_blobs
,
out_info
,
in_info
,
true
);
}
else
{
CHECK_EQ
(
src_axis
,
0
);
DoUnequalAxisCopy
(
ctx
,
src_blobs
,
dst_blobs
,
in_info
,
out_info
,
false
);
DoUnequalAxisCopy
<
T
>
(
ctx
,
src_blobs
,
dst_blobs
,
in_info
,
out_info
,
false
);
}
}
template
<
typename
T
>
void
BoxingKernel
<
T
>::
BoxingCopyForEqualAxis
(
const
KernelCtx
&
ctx
,
std
::
vector
<
Blob
*>&
src_blobs
,
std
::
vector
<
Blob
*>&
dst_blobs
,
const
int32_t
axis
)
const
{
const
BoxingKernelConf
&
kernel_conf
=
this
->
kernel_conf
().
boxing_conf
();
void
BoxingCopyForEqualAxis
(
DeviceCtx
*
ctx
,
PbRpf
<
Blob
*>&
src_blobs
,
PbRpf
<
Blob
*>&
dst_blobs
,
const
int32_t
axis
)
{
BoxingKernelConf
kernel_conf
;
// = this->kernel_conf().boxing_conf();
const
BoxingInfo
&
in_info
=
kernel_conf
.
in_info
();
const
BoxingInfo
&
out_info
=
kernel_conf
.
out_info
();
int64_t
src_offset
=
0
;
...
...
@@ -142,8 +134,7 @@ void BoxingKernel<T>::BoxingCopyForEqualAxis(const KernelCtx& ctx,
out_info
.
size_of_subseg
(
dst_idx
)
-
dst_offset
);
Memcpy
<
DeviceType
::
kCPU
>
(
ctx
.
device_ctx
,
dst_blobs
[
dst_idx
]
->
mut_dptr
<
char
>
()
+
dst_offset
*
sizeof
(
T
),
ctx
,
dst_blobs
[
dst_idx
]
->
mut_dptr
<
char
>
()
+
dst_offset
*
sizeof
(
T
),
src_blobs
[
src_idx
]
->
dptr
<
char
>
()
+
src_offset
*
sizeof
(
T
),
copy_num
*
sizeof
(
T
),
cudaMemcpyKind
::
cudaMemcpyHostToHost
);
src_offset
+=
copy_num
;
...
...
@@ -164,74 +155,89 @@ void BoxingKernel<T>::BoxingCopyForEqualAxis(const KernelCtx& ctx,
}
template
<
typename
T
>
void
BoxingKernel
<
T
>::
CopyFromSrcBlobs2DstBlobs
(
const
KernelCtx
&
ctx
,
std
::
function
<
Blob
*
(
const
std
::
string
&
)
>
BnInOp2Blob
,
const
std
::
vector
<
std
::
string
>&
src_bns
,
const
std
::
vector
<
std
::
string
>&
dst_bns
,
const
int32_t
src_axis
,
const
int32_t
dst_axis
)
const
{
std
::
vector
<
Blob
*>
src_blobs
;
std
::
vector
<
Blob
*>
dst_blobs
;
void
ConcatSplitBlobs
(
DeviceCtx
*
ctx
,
std
::
function
<
Blob
*
(
const
std
::
string
&
)
>
BnInOp2Blob
,
const
PbRpf
<
std
::
string
>&
src_bns
,
const
PbRpf
<
std
::
string
>&
dst_bns
,
const
BoxConcatConf
&
,
const
BoxSplitConf
&
)
{
PbRpf
<
Blob
*>
src_blobs
;
PbRpf
<
Blob
*>
dst_blobs
;
const
int32_t
src_axis
=
-
1
;
const
int32_t
dst_axis
=
-
1
;
for
(
const
std
::
string
&
bn
:
src_bns
)
{
src_blobs
.
emplace
_back
(
BnInOp2Blob
(
bn
));
// src_blobs.push
_back(BnInOp2Blob(bn));
}
for
(
const
std
::
string
&
bn
:
dst_bns
)
{
dst_blobs
.
emplace
_back
(
BnInOp2Blob
(
bn
));
// dst_blobs.push
_back(BnInOp2Blob(bn));
}
if
(
src_blobs
[
0
]
->
has_data_id
())
{
CopyDataId
(
ctx
,
src_blobs
,
dst_blobs
,
src_axis
,
dst_axis
);
CopyDataId
<
T
>
(
ctx
,
src_blobs
,
dst_blobs
,
src_axis
,
dst_axis
);
}
if
(
src_axis
==
dst_axis
)
{
BoxingCopyForEqualAxis
(
ctx
,
src_blobs
,
dst_blobs
,
src_axis
);
BoxingCopyForEqualAxis
<
T
>
(
ctx
,
src_blobs
,
dst_blobs
,
src_axis
);
}
else
{
BoxingCopyForUnequalAxis
(
ctx
,
src_blobs
,
dst_blobs
,
src_axis
,
dst_axis
);
BoxingCopyForUnequalAxis
<
T
>
(
ctx
,
src_blobs
,
dst_blobs
,
src_axis
,
dst_axis
);
}
}
template
<
typename
T
>
void
BoxingKernel
<
T
>::
CopyFromFirstBlob2
OtherBlobs
(
const
KernelCtx
&
ctx
,
std
::
function
<
Blob
*
(
const
std
::
string
&
)
>
BnInOp2Blob
,
const
std
::
vector
<
std
::
string
>&
obns
)
const
{
void
CopyFromFirstTo
OtherBlobs
(
DeviceCtx
*
ctx
,
std
::
function
<
Blob
*
(
const
std
::
string
&
)
>
BnInOp2Blob
,
const
PbRpf
<
std
::
string
>&
obns
)
{
const
Blob
*
out_blob_0
=
BnInOp2Blob
(
obns
[
0
]);
FOR_RANGE
(
size_t
,
i
,
1
,
obns
.
size
())
{
Memcpy
<
DeviceType
::
kCPU
>
(
ctx
.
device_ctx
,
BnInOp2Blob
(
obns
[
i
])
->
mut_memory_ptr
(),
out_blob_0
->
memory_ptr
(),
out_blob_0
->
TotalByteSize
(),
cudaMemcpyKind
::
cudaMemcpyHostToHost
);
ctx
,
BnInOp2Blob
(
obns
[
i
])
->
mut_memory_ptr
(),
out_blob_0
->
memory_ptr
(),
out_blob_0
->
TotalByteSize
(),
cudaMemcpyKind
::
cudaMemcpyHostToHost
);
}
}
PbRpf
<
std
::
string
>
ConstructPbRpf
(
const
std
::
string
&
s
)
{
PbRpf
<
std
::
string
>
ret
;
ret
.
Reserve
(
1
);
ret
.
Add
()
->
assign
(
s
);
return
ret
;
}
template
<
typename
T
>
void
BoxingKernel
<
T
>::
Forward
(
const
KernelCtx
&
ctx
,
std
::
function
<
Blob
*
(
const
std
::
string
&
)
>
BnInOp2Blob
)
const
{
auto
boxing_conf
=
op_conf
().
boxing_conf
();
const
KernelConf
&
kernel_conf
=
this
->
kernel_conf
(
);
const
BoxingOpConf
&
boxing_conf
=
op_conf
().
boxing_conf
();
const
std
::
string
&
obn_0
=
kernel_conf
().
output_bns
(
0
);
if
(
boxing_conf
.
in_box_case
()
==
BoxingOpConf
::
kConcatBox
)
{
// concat-box copy rules: copy directly from input to output
int32_t
concat_axis
=
boxing_conf
.
concat_box
().
axis
();
if
(
boxing_conf
.
out_box_case
()
==
BoxingOpConf
::
kSplitBox
)
{
Co
pyFromSrcBlobs2DstBlobs
(
ctx
,
BnInOp2Blob
,
kernel_conf
.
input_bns
()
,
kernel_conf
.
output_bns
(),
concat_axis
,
boxing_conf
.
split_box
().
axis
());
Co
ncatSplitBlobs
<
T
>
(
ctx
.
device_ctx
,
BnInOp2Blob
,
kernel_conf
().
input_bns
(),
kernel_conf
().
output_bns
()
,
boxing_conf
.
concat_box
(),
boxing_conf
.
split_box
());
}
else
if
(
boxing_conf
.
out_box_case
()
==
BoxingOpConf
::
kCloneBox
)
{
CopyFromSrcBlobs2DstBlobs
(
ctx
,
BnInOp2Blob
,
kernel_conf
.
input_bns
(),
{
"out_0"
},
concat_axis
,
0
);
CopyFromFirstBlob2OtherBlobs
(
ctx
,
BnInOp2Blob
,
kernel_conf
.
output_bns
());
const
Blob
*
ob_0
=
BnInOp2Blob
(
obn_0
);
BoxSplitConf
split_box
;
split_box
.
set_axis
(
0
);
split_box
.
add_part_num
(
ob_0
->
shape
().
At
(
0
));
ConcatSplitBlobs
<
T
>
(
ctx
.
device_ctx
,
BnInOp2Blob
,
kernel_conf
().
input_bns
(),
ConstructPbRpf
(
obn_0
),
boxing_conf
.
concat_box
(),
split_box
);
CopyFromFirstToOtherBlobs
<
T
>
(
ctx
.
device_ctx
,
BnInOp2Blob
,
kernel_conf
().
output_bns
());
}
else
{
UNEXPECTED_RUN
();
}
}
else
if
(
boxing_conf
.
in_box_case
()
==
BoxingOpConf
::
kAddBox
)
{
if
(
boxing_conf
.
out_box_case
()
==
BoxingOpConf
::
kSplitBox
)
{
GetSumFromSrcBlobsToDstBlob
(
ctx
,
BnInOp2Blob
,
kernel_conf
.
input_bns
(),
{
"middle"
});
CopyFromSrcBlobs2DstBlobs
(
ctx
,
BnInOp2Blob
,
{
"middle"
},
kernel_conf
.
output_bns
(),
0
,
boxing_conf
.
split_box
().
axis
());
}
else
if
(
boxing_conf
.
in_box_case
()
==
BoxingOpConf
::
kCloneBox
)
{
GetSumFromSrcBlobsToDstBlob
(
ctx
,
BnInOp2Blob
,
kernel_conf
.
input_bns
(),
{
kernel_conf
.
output_bns
(
0
)});
CopyFromFirstBlob2OtherBlobs
(
ctx
,
BnInOp2Blob
,
kernel_conf
.
output_bns
());
CalcSumOfBlobs
<
T
>
(
ctx
.
device_ctx
,
BnInOp2Blob
,
kernel_conf
().
input_bns
(),
"middle"
);
BoxConcatConf
concat_box
;
concat_box
.
set_axis
(
0
);
ConcatSplitBlobs
<
T
>
(
ctx
.
device_ctx
,
BnInOp2Blob
,
ConstructPbRpf
(
"middle"
),
kernel_conf
().
output_bns
(),
concat_box
,
boxing_conf
.
split_box
());
}
else
if
(
boxing_conf
.
out_box_case
()
==
BoxingOpConf
::
kCloneBox
)
{
CalcSumOfBlobs
<
T
>
(
ctx
.
device_ctx
,
BnInOp2Blob
,
kernel_conf
().
input_bns
(),
obn_0
);
CopyFromFirstToOtherBlobs
<
T
>
(
ctx
.
device_ctx
,
BnInOp2Blob
,
kernel_conf
().
output_bns
());
}
else
{
UNEXPECTED_RUN
();
}
...
...
@@ -240,18 +246,7 @@ void BoxingKernel<T>::Forward(
}
}
namespace
{
Kernel
*
CreateBoxingKernel
(
const
KernelConf
&
kernel_conf
)
{
static
const
HashMap
<
std
::
string
,
std
::
function
<
Kernel
*
()
>>
creators
=
{
#define BOXING_KERNEL_ENTRY(data_type_pair) \
{GetHashKey(OF_PP_PAIR_SECOND(data_type_pair)), \
[]() { return new BoxingKernel<OF_PP_PAIR_FIRST(data_type_pair)>(); }},
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE
(
BOXING_KERNEL_ENTRY
,
ARITHMETIC_DATA_TYPE_SEQ
)};
return
creators
.
at
(
GetHashKey
(
JobDesc
::
Singleton
()
->
DefaultDataType
()))();
}
}
// namespace
ADD_CPU_DEFAULT_KERNEL_CREATOR
(
OperatorConf
::
kBoxingConf
,
BoxingKernel
,
ARITHMETIC_DATA_TYPE_SEQ
);
}
// namespace oneflow
oneflow/core/kernel/boxing_kernel.h
浏览文件 @
03295996
...
...
@@ -2,7 +2,6 @@
#define ONEFLOW_CORE_KERNEL_BOXING_KERNEL_H_
#include "oneflow/core/kernel/kernel.h"
#include "oneflow/core/kernel/kernel_context.h"
namespace
oneflow
{
...
...
@@ -13,39 +12,9 @@ class BoxingKernel final : public KernelIf<DeviceType::kCPU> {
BoxingKernel
()
=
default
;
~
BoxingKernel
()
=
default
;
private:
void
Forward
(
const
KernelCtx
&
,
std
::
function
<
Blob
*
(
const
std
::
string
&
)
>
)
const
override
;
private:
void
GetSumFromSrcBlobsToDstBlob
(
const
KernelCtx
&
ctx
,
std
::
function
<
Blob
*
(
const
std
::
string
&
)
>
,
const
std
::
vector
<
std
::
string
>&
src_bns
,
const
std
::
string
&
dst_bn
)
const
;
void
CopyDataId
(
const
KernelCtx
&
ctx
,
std
::
vector
<
Blob
*>&
src_blobs
,
std
::
vector
<
Blob
*>&
dst_blobs
,
const
int32_t
src_concat_axis
,
const
int32_t
dst_split_axis
)
const
;
void
DoUnequalAxisCopy
(
const
KernelCtx
&
ctx
,
std
::
vector
<
Blob
*>&
src_blobs
,
std
::
vector
<
Blob
*>&
dst_blobs
,
const
BoxingInfo
&
src_info
,
const
BoxingInfo
&
dst_info
,
bool
need_swap
)
const
;
void
BoxingCopyForUnequalAxis
(
const
KernelCtx
&
ctx
,
std
::
vector
<
Blob
*>&
src_blobs
,
std
::
vector
<
Blob
*>&
dst_blobs
,
const
int32_t
concat_axis
,
const
int32_t
split_axis
)
const
;
void
BoxingCopyForEqualAxis
(
const
KernelCtx
&
ctx
,
std
::
vector
<
Blob
*>&
src_blobs
,
std
::
vector
<
Blob
*>&
dst_blobs
,
const
int32_t
axis
)
const
;
void
CopyFromSrcBlobs2DstBlobs
(
const
KernelCtx
&
ctx
,
std
::
function
<
Blob
*
(
const
std
::
string
&
)
>
,
const
std
::
vector
<
std
::
string
>&
src_bns
,
const
std
::
vector
<
std
::
string
>&
dst_bns
,
const
int32_t
src_concat_axis
,
const
int32_t
dst_split_axis
)
const
;
void
CopyFromFirstBlob2OtherBlobs
(
const
KernelCtx
&
ctx
,
std
::
function
<
Blob
*
(
const
std
::
string
&
)
>
,
const
std
::
vector
<
std
::
string
>&
obns
)
const
;
};
}
// namespace oneflow
...
...
oneflow/core/kernel/kernel_util.cpp
浏览文件 @
03295996
...
...
@@ -163,21 +163,19 @@ struct KernelUtil<DeviceType::kCPU, T> final {
NormalPersistentInStream
in_stream
(
GlobalFS
(),
file_path
,
begin_pos
);
in_stream
.
Read
(
blob
->
mut_dptr
<
char
>
(),
blob_size
);
}
};
// namespace oneflow
};
#define INSTANTIATE_KERNEL_UTIL(type_cpp, type_proto) \
template
class
KernelUtil<DeviceType::kCPU, type_cpp>;
template
struct
KernelUtil<DeviceType::kCPU, type_cpp>;
OF_PP_FOR_EACH_TUPLE
(
INSTANTIATE_KERNEL_UTIL
,
FLOATING_DATA_TYPE_SEQ
)
#define DEFINE_INT_KERNEL_UTIL(T, type_proto) \
template<> \
struct KernelUtil<DeviceType::kCPU, T> final { \
static void Axpy(DeviceCtx* ctx, const int n, const T alpha, const T* x, \
const int incx, T* y, const int incy) { \
TODO(); \
} \
};
#define DEFINE_INT_KERNEL_UTIL(T, type_proto) \
template<> \
void KernelUtil<DeviceType::kCPU, T>::Axpy( \
DeviceCtx* ctx, const int n, const T alpha, const T* x, const int incx, \
T* y, const int incy) { \
TODO(); \
}
OF_PP_FOR_EACH_TUPLE
(
DEFINE_INT_KERNEL_UTIL
,
INT_DATA_TYPE_SEQ
);
...
...
oneflow/core/kernel/kernel_util.cu
浏览文件 @
03295996
...
...
@@ -156,7 +156,7 @@ struct KernelUtil<DeviceType::kGPU, T> final {
};
#define INSTANTIATE_KERNEL_UTIL(type_cpp, type_proto) \
template
class
KernelUtil<DeviceType::kGPU, type_cpp>;
template
struct
KernelUtil<DeviceType::kGPU, type_cpp>;
OF_PP_FOR_EACH_TUPLE
(
INSTANTIATE_KERNEL_UTIL
,
FLOATING_DATA_TYPE_SEQ
);
#define DEFINE_INT_KERNEL_UTIL(T, type_proto) \
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录