Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
0be6ca88
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
403
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
0be6ca88
编写于
11月 25, 2021
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix(src/core): fix record change ptr bug on comp node copy
GitOrigin-RevId: 0f689662113123e00862698269a0ea7aa42af825
上级
84baf3df
变更
8
显示空白变更内容
内联
并排
Showing
8 changed file
with
173 addition
and
28 deletion
+173
-28
src/core/impl/comp_node/cpu/comp_node.cpp
src/core/impl/comp_node/cpu/comp_node.cpp
+86
-0
src/core/impl/graph/var_node_mem_mgr.cpp
src/core/impl/graph/var_node_mem_mgr.cpp
+2
-3
src/core/impl/tensor.cpp
src/core/impl/tensor.cpp
+21
-11
src/core/include/megbrain/comp_node.h
src/core/include/megbrain/comp_node.h
+38
-0
src/gopt/test/network.cpp
src/gopt/test/network.cpp
+4
-0
src/gopt/test/network.h
src/gopt/test/network.h
+1
-0
src/gopt/test/no_memory_copy.cpp
src/gopt/test/no_memory_copy.cpp
+15
-11
src/opr/impl/standalone/nms_opr.cpp
src/opr/impl/standalone/nms_opr.cpp
+6
-3
未找到文件。
src/core/impl/comp_node/cpu/comp_node.cpp
浏览文件 @
0be6ca88
...
...
@@ -306,11 +306,37 @@ public:
m_env
.
cpu_env
().
dispatch
(
do_copy
);
}
/*!
 * \brief dispatch a task that copies \p size bytes from the device buffer
 *      into the host buffer
 *
 * Both RefPtr handles are copied into the closure, and get_ptr() is only
 * evaluated inside the task body, i.e. when the dispatched task runs rather
 * than when this method is called.
 */
void copy_to_host_ref(
        megdnn::RefPtr& host_ref_ptr, megdnn::RefPtr& device_ref_ptr,
        size_t size) override {
    // a plain lambda is used instead of std::bind to avoid a memory allocation
    auto copy_task = [host_ref_ptr, device_ref_ptr, size]() {
        void* const dst = host_ref_ptr.get_ptr();
        const void* const src = device_ref_ptr.get_ptr();
        std::memcpy(dst, src, size);
    };
    m_env.cpu_env().dispatch(copy_task);
}
/*!
 * \brief dispatch a task that copies \p size bytes from the host buffer
 *      into the device buffer
 *
 * Both RefPtr handles are copied into the closure, and get_ptr() is only
 * evaluated inside the task body, i.e. when the dispatched task runs rather
 * than when this method is called.
 */
void copy_to_device_ref(
        megdnn::RefPtr& device_ref_ptr, megdnn::RefPtr& host_ref_ptr,
        size_t size) override {
    // a plain lambda is used instead of std::bind to avoid a memory allocation
    auto copy_task = [device_ref_ptr, host_ref_ptr, size]() {
        void* const dst = device_ref_ptr.get_ptr();
        const void* const src = host_ref_ptr.get_ptr();
        std::memcpy(dst, src, size);
    };
    m_env.cpu_env().dispatch(copy_task);
}
/*!
 * \brief peer copy with raw pointers: delegates to copy_to_device() of the
 *      destination implementation
 */
void peer_copy_to(
        Impl* dest_impl, void* dest, const void* src, size_t size) override {
    // the destination impl performs the copy
    dest_impl->copy_to_device(dest, src, size);
}
/*!
 * \brief peer copy with RefPtr handles: delegates to copy_to_device_ref() of
 *      the destination implementation, passing the handles through unchanged
 */
void peer_copy_to_ref(
        Impl* dest_impl, megdnn::RefPtr& dest, megdnn::RefPtr& src,
        size_t size) override {
    // the destination impl performs the copy
    dest_impl->copy_to_device_ref(dest, src, size);
}
//! memory address alignment, read from the mem_alignment field of the env property
size_t get_mem_addr_alignment() override {
    const auto& prop = m_env.property();
    return prop.mem_alignment;
}
//! hand \p task over (moved) to the dispatch() of the CPU env
void dispatch(Task&& task) override {
    auto&& cpu = m_env.cpu_env();
    cpu.dispatch(std::move(task));
}
...
...
@@ -733,6 +759,24 @@ public:
CompNodeBaseImpl
::
copy_to_device
(
device_ptr
,
host_ptr
,
size
);
}
/*!
 * \brief device-to-host copy through RefPtr handles
 *
 * If a worker queue exists, its check_exception() is called first; the copy
 * itself is done by CompNodeBaseImpl::copy_to_host_ref().
 */
void copy_to_host_ref(
        megdnn::RefPtr& host_ref_ptr, megdnn::RefPtr& device_ref_ptr,
        size_t size) override {
    if (m_worker_queue)
        m_worker_queue->check_exception();

    CompNodeBaseImpl::copy_to_host_ref(host_ref_ptr, device_ref_ptr, size);
}
/*!
 * \brief host-to-device copy through RefPtr handles
 *
 * If a worker queue exists, its check_exception() is called first; the copy
 * itself is done by CompNodeBaseImpl::copy_to_device_ref().
 */
void copy_to_device_ref(
        megdnn::RefPtr& device_ref_ptr, megdnn::RefPtr& host_ref_ptr,
        size_t size) override {
    if (m_worker_queue)
        m_worker_queue->check_exception();

    CompNodeBaseImpl::copy_to_device_ref(device_ref_ptr, host_ref_ptr, size);
}
void
peer_copy_to
(
Impl
*
dest_impl
,
void
*
dest
,
const
void
*
src
,
size_t
size
)
override
{
//! copy to default_cpu
...
...
@@ -774,6 +818,48 @@ public:
dest_impl
->
copy_to_device
(
dest
,
src
,
size
);
}
/*!
 * \brief peer copy through RefPtr handles, selected by the destination type
 *
 * - destination is CompNodeDefaultImpl: use the base class implementation
 * - destination is neither default nor recorder impl:
 *   - ATLAS / CAMBRICON device: copy with raw pointers via copy_to_device()
 *     when the matching build flag is enabled, otherwise throw MegBrainError
 *   - any other device: assert that this comp node is the default cpu one,
 *     then fall through to the common path
 * - common path: copy_to_device_ref() of the destination implementation
 */
void peer_copy_to_ref(
        Impl* dest_impl, megdnn::RefPtr& dest, megdnn::RefPtr& src,
        size_t size) override {
    //! copy to default_cpu
    if (dest_impl->same_type<CpuCompNode::CompNodeDefaultImpl>()) {
        CompNodeBaseImpl::peer_copy_to_ref(dest_impl, dest, src, size);
        return;
    }

    const bool dest_is_recorder =
            dest_impl->same_type<CpuCompNode::CompNodeRecorderImpl>();
    if (!dest_is_recorder) {
        const auto dest_type = dest_impl->env().property().type;
        if (dest_type == DeviceType::ATLAS) {
#if MGB_ATLAS
            dest_impl->copy_to_device(dest.get_ptr(), src.get_ptr(), size);
            return;
#else
            mgb_throw(
                    MegBrainError,
                    "Atlas comp_node used but "
                    "ATLAS BUILD not enabled");
#endif
        } else if (dest_type == DeviceType::CAMBRICON) {
#if MGB_CAMBRICON
            dest_impl->copy_to_device(dest.get_ptr(), src.get_ptr(), size);
            return;
#else
            mgb_throw(
                    MegBrainError,
                    "Cambricon comp_node used but "
                    "CAMBRICON BUILD not enabled");
#endif
        } else {
            mgb_assert(
                    locator().device == Locator::DEVICE_CPU_DEFAULT,
                    "currently only peer copy from default cpu comp "
                    "nodes "
                    "is implemented");
        }
    }

    // common path: the destination impl copies through the RefPtr handles
    dest_impl->copy_to_device_ref(dest, src, size);
}
std
::
unique_ptr
<
Event
>
create_event
(
size_t
flags
)
override
{
if
(
m_worker_queue
)
{
m_worker_queue
->
check_exception
();
...
...
src/core/impl/graph/var_node_mem_mgr.cpp
浏览文件 @
0be6ca88
...
...
@@ -81,9 +81,8 @@ const DeviceTensorStorage& StaticDeviceMemoryManager::alloc(
void
StaticDeviceMemoryManager
::
prefault
()
{
for
(
auto
&&
i
:
m_storage
)
{
if
(
i
.
first
.
device_type
()
==
CompNode
::
DeviceType
::
CPU
)
{
auto
set
=
[
ptr
=
i
.
second
.
ptr
(),
size
=
i
.
second
.
size
()]()
{
memset
(
ptr
,
0
,
size
);
};
auto
storage
=
i
.
second
;
auto
set
=
[
storage
]()
{
memset
(
storage
.
ptr
(),
0
,
storage
.
size
());
};
CompNodeEnv
::
from_comp_node
(
i
.
first
).
cpu_env
().
dispatch
(
set
);
i
.
first
.
sync
();
}
...
...
src/core/impl/tensor.cpp
浏览文件 @
0be6ca88
...
...
@@ -379,7 +379,9 @@ MGE_WIN_DECLSPEC_FUC void TensorStorage<HostTensorStorageTrait>::copy_from(
need_sync
=
true
;
}
}
src
.
comp_node
().
copy_to_host
(
ptr
(),
src
.
ptr
(),
size
);
megdnn
::
RefPtr
src_ptr
(
src
.
get_ref_ptr
(),
src
.
offset
(),
false
);
megdnn
::
RefPtr
dst_ptr
(
get_ref_ptr
(),
offset
(),
false
);
src
.
comp_node
().
copy_to_host_ref
(
dst_ptr
,
src_ptr
,
size
);
if
(
need_sync
)
src
.
comp_node
().
sync
();
}
...
...
@@ -390,7 +392,9 @@ template <>
MGE_WIN_DECLSPEC_FUC
void
TensorStorage
<
DeviceTensorStorageTrait
>::
copy_from
(
const
TensorStorage
<
HostTensorStorageTrait
>&
src
,
size_t
size
)
const
{
mgb_assert
(
size
<=
this
->
size
()
&&
size
<=
src
.
size
());
m_comp_node
.
copy_to_device
(
ptr
(),
src
.
ptr
(),
size
);
megdnn
::
RefPtr
src_ptr
(
src
.
get_ref_ptr
(),
src
.
offset
(),
false
);
megdnn
::
RefPtr
dst_ptr
(
get_ref_ptr
(),
offset
(),
false
);
m_comp_node
.
copy_to_device_ref
(
dst_ptr
,
src_ptr
,
size
);
}
// device to device
...
...
@@ -417,9 +421,13 @@ MGE_WIN_DECLSPEC_FUC void TensorStorage<DeviceTensorStorageTrait>::copy_from(
// to pin the memory of src tensor, so it does not require synchronization
// and is more efficient
src
.
comp_node
().
sync
();
comp_node
().
copy_to_device
(
ptr
(),
src
.
ptr
(),
size
);
megdnn
::
RefPtr
src_ptr
(
src
.
get_ref_ptr
(),
src
.
offset
(),
false
);
megdnn
::
RefPtr
dst_ptr
(
get_ref_ptr
(),
offset
(),
false
);
comp_node
().
copy_to_device_ref
(
dst_ptr
,
src_ptr
,
size
);
}
else
{
src
.
comp_node
().
peer_copy_to
(
m_comp_node
,
ptr
(),
src
.
ptr
(),
size
);
megdnn
::
RefPtr
src_ptr
(
src
.
get_ref_ptr
(),
src
.
offset
(),
false
);
megdnn
::
RefPtr
dst_ptr
(
get_ref_ptr
(),
offset
(),
false
);
src
.
comp_node
().
peer_copy_to_ref
(
m_comp_node
,
dst_ptr
,
src_ptr
,
size
);
}
}
...
...
@@ -712,32 +720,34 @@ const typename TensorND<TensorStorage>::ChainReturnType& TensorND<
void
mgb
::
dev_tensor_memset
(
const
DeviceTensorND
&
tensor
,
int
val
)
{
auto
&&
env
=
CompNodeEnv
::
from_comp_node
(
tensor
.
comp_node
());
env
.
activate
();
void
*
ptr
=
tensor
.
raw_ptr
();
size_t
size
=
tensor
.
layout
().
span
().
dist_byte
();
switch
(
env
.
property
().
type
)
{
#if MGB_CUDA
case
CompNode
::
DeviceType
::
CUDA
:
MGB_CUDA_CHECK
(
cudaMemsetAsync
(
ptr
,
val
,
size
,
env
.
cuda_env
().
stream
));
MGB_CUDA_CHECK
(
cudaMemsetAsync
(
tensor
.
raw_ptr
(),
val
,
size
,
env
.
cuda_env
().
stream
));
break
;
#endif
#if MGB_ATLAS
case
CompNode
::
DeviceType
::
ATLAS
:
#if MGB_USE_ATLAS_ASYNC_API
MGB_ATLAS_CHECK
(
aclrtMemsetAsync
(
ptr
,
-
1
,
val
,
size
,
env
.
atlas_env
().
stream
));
MGB_ATLAS_CHECK
(
aclrtMemsetAsync
(
tensor
.
raw_ptr
()
,
-
1
,
val
,
size
,
env
.
atlas_env
().
stream
));
#else
MGB_ATLAS_CHECK
(
aclrtMemset
(
ptr
,
-
1
,
val
,
size
));
MGB_ATLAS_CHECK
(
aclrtMemset
(
tensor
.
raw_ptr
()
,
-
1
,
val
,
size
));
#endif
break
;
#endif
#if MGB_CAMBRICON
case
CompNode
::
DeviceType
::
CAMBRICON
:
MGB_CNRT_CHECK
(
cnrtSyncQueue
(
env
.
cnrt_env
().
queue
));
MGB_CNRT_CHECK
(
cnrtMemset
(
ptr
,
val
,
size
));
MGB_CNRT_CHECK
(
cnrtMemset
(
tensor
.
raw_ptr
()
,
val
,
size
));
break
;
#endif
case
CompNode
::
DeviceType
::
CPU
:
{
auto
fill
=
[
ptr
,
size
,
val
]()
{
std
::
memset
(
ptr
,
val
,
size
);
};
auto
fill
=
[
tensor
,
size
,
val
]()
{
std
::
memset
(
tensor
.
as_megdnn
().
raw_ptr
(),
val
,
size
);
};
env
.
cpu_env
().
dispatch
(
fill
);
}
break
;
default:
...
...
src/core/include/megbrain/comp_node.h
浏览文件 @
0be6ca88
...
...
@@ -242,6 +242,20 @@ public:
return
m_impl
->
copy_to_device
(
device_ptr
,
host_ptr
,
size
);
}
/*!
 * \brief copy from underlying device to host, with both buffers given as
 *      RefPtr handles; forwards to the implementation object
 */
void copy_to_host_ref(
        megdnn::RefPtr& host_ref_ptr, megdnn::RefPtr& device_ref_ptr,
        size_t size) const {
    m_impl->copy_to_host_ref(host_ref_ptr, device_ref_ptr, size);
}
/*!
 * \brief copy from host to underlying device, with both buffers given as
 *      RefPtr handles; forwards to the implementation object
 */
void copy_to_device_ref(
        megdnn::RefPtr& device_ref_ptr, megdnn::RefPtr& host_ref_ptr,
        size_t size) const {
    m_impl->copy_to_device_ref(device_ref_ptr, host_ref_ptr, size);
}
/*!
* \brief copy from this device to another device; would use the
* computing resource on dest_node
...
...
@@ -253,6 +267,14 @@ public:
reinterpret_cast
<
Impl
*>
(
dest_node
.
m_impl
),
dest
,
src
,
size
);
}
/*!
 * \brief RefPtr variant of the peer copy from this comp node to \p dest_node
 *
 * The implementation pointer of \p dest_node is reinterpreted as Impl* and
 * passed, together with the two handles, to peer_copy_to_ref() of this
 * node's implementation.
 */
void peer_copy_to_ref(
        CompNode dest_node, megdnn::RefPtr& dst_ref_ptr,
        megdnn::RefPtr& src_ref_ptr, size_t size) const {
    Impl* const dest_impl = reinterpret_cast<Impl*>(dest_node.m_impl);
    m_impl->peer_copy_to_ref(dest_impl, dst_ref_ptr, src_ref_ptr, size);
}
//! get alignment requirement in bytes; guaranteed to be power of 2
size_t get_mem_addr_alignment() const {
    const size_t alignment = m_impl->get_mem_addr_alignment();
    return alignment;
}
...
...
@@ -517,9 +539,25 @@ protected:
void
*
host_ptr
,
const
void
*
device_ptr
,
size_t
size
)
=
0
;
virtual
void
copy_to_device
(
void
*
device_ptr
,
const
void
*
host_ptr
,
size_t
size
)
=
0
;
/*!
 * \brief default RefPtr variant of copy_to_host()
 *
 * Resolves both handles with get_ptr() at call time and forwards the raw
 * pointers to copy_to_host().
 */
virtual void copy_to_host_ref(
        megdnn::RefPtr& host_ref_ptr, megdnn::RefPtr& device_ref_ptr,
        size_t size) {
    void* const host = host_ref_ptr.get_ptr();
    const void* const device = device_ref_ptr.get_ptr();
    copy_to_host(host, device, size);
}
/*!
 * \brief default RefPtr variant of copy_to_device()
 *
 * Resolves both handles with get_ptr() at call time and forwards the raw
 * pointers to copy_to_device().
 */
virtual void copy_to_device_ref(
        megdnn::RefPtr& device_ref_ptr, megdnn::RefPtr& host_ref_ptr,
        size_t size) {
    void* const device = device_ref_ptr.get_ptr();
    const void* const host = host_ref_ptr.get_ptr();
    copy_to_device(device, host, size);
}
virtual
void
peer_copy_to
(
Impl
*
dest_impl
,
void
*
dest
,
const
void
*
src
,
size_t
size
)
=
0
;
/*!
 * \brief default RefPtr variant of peer_copy_to()
 *
 * Resolves both handles with get_ptr() at call time and forwards the raw
 * pointers to peer_copy_to().
 */
virtual void peer_copy_to_ref(
        Impl* dest_impl, megdnn::RefPtr& dest, megdnn::RefPtr& src,
        size_t size) {
    void* const dst_raw = dest.get_ptr();
    const void* const src_raw = src.get_ptr();
    peer_copy_to(dest_impl, dst_raw, src_raw, size);
}
virtual
size_t
get_mem_addr_alignment
()
=
0
;
virtual
size_t
get_mem_padding
();
...
...
src/gopt/test/network.cpp
浏览文件 @
0be6ca88
...
...
@@ -100,6 +100,10 @@ SymbolVar Network::add_type_cvt(SymbolVar f, DType out_dtype) {
return
opr
::
TypeCvt
::
make
(
f
,
out_dtype
);
}
//! build a Concat operator over the two vars \p f and \p g along \p axis
SymbolVar Network::add_concat(SymbolVar f, SymbolVar g, int axis) {
    SymbolVar joined = opr::Concat::make({f, g}, axis);
    return joined;
}
SymbolVar
mgb
::
create_block
(
Network
&
network
,
SymbolVar
f_in
,
size_t
stride
,
size_t
num_outputs1
,
bool
has_proj
,
DType
out_dtype
)
{
...
...
src/gopt/test/network.h
浏览文件 @
0be6ca88
...
...
@@ -60,6 +60,7 @@ public:
Padding
padding
=
{
0
,
0
},
opr
::
Pooling
::
Param
::
Mode
mode
=
opr
::
Pooling
::
Param
::
Mode
::
MAX
);
SymbolVar
add_type_cvt
(
SymbolVar
f
,
DType
out_dtype
=
dtype
::
Float32
());
SymbolVar
add_concat
(
SymbolVar
f
,
SymbolVar
g
,
int
axis
=
0
);
};
SymbolVar
create_block
(
...
...
src/gopt/test/no_memory_copy.cpp
浏览文件 @
0be6ca88
...
...
@@ -41,7 +41,8 @@ struct TestGraph {
f
=
m_network
->
add_elemwise
(
{
f
},
dtype
::
Float32
(),
opr
::
Elemwise
::
Param
::
Mode
::
EXP
);
f
=
m_network
->
add_conv
(
f
,
8
,
{
3
,
3
},
dtype
::
Float32
(),
true
,
{
1
,
1
},
{
1
,
1
});
m_out_var
=
m_network
->
add_pooling
(
f
,
{
2
,
2
},
{
2
,
2
});
f
=
m_network
->
add_pooling
(
f
,
{
2
,
2
},
{
2
,
2
});
m_out_var
=
m_network
->
add_concat
(
f
,
-
f
);
}
void
create_graph_with_subtensor_forward
()
{
...
...
@@ -63,7 +64,8 @@ struct TestGraph {
f
=
m_network
->
add_elemwise
(
{
f
},
dtype
::
Float32
(),
opr
::
Elemwise
::
Param
::
Mode
::
EXP
);
f
=
m_network
->
add_conv
(
f
,
8
,
{
3
,
3
},
dtype
::
Float32
(),
true
,
{
1
,
1
},
{
1
,
1
});
m_out_var
=
m_network
->
add_pooling
(
f
,
{
2
,
2
},
{
2
,
2
});
f
=
m_network
->
add_pooling
(
f
,
{
2
,
2
},
{
2
,
2
});
m_out_var
=
m_network
->
add_concat
(
f
,
-
f
);
}
void
create_graph_with_subtensor_relayout
()
{
...
...
@@ -86,7 +88,8 @@ struct TestGraph {
f
=
m_network
->
add_elemwise
(
{
f
},
dtype
::
Float32
(),
opr
::
Elemwise
::
Param
::
Mode
::
EXP
);
f
=
m_network
->
add_conv
(
f
,
8
,
{
3
,
3
},
dtype
::
Float32
(),
true
,
{
1
,
1
},
{
1
,
1
});
m_out_var
=
m_network
->
add_pooling
(
f
,
{
2
,
2
},
{
2
,
2
});
f
=
m_network
->
add_pooling
(
f
,
{
2
,
2
},
{
2
,
2
});
m_out_var
=
m_network
->
add_concat
(
f
,
-
f
);
}
void
create_graph_with_setsubtensor
()
{
...
...
@@ -113,7 +116,8 @@ struct TestGraph {
f
=
m_network
->
add_elemwise
(
{
f
},
dtype
::
Float32
(),
opr
::
Elemwise
::
Param
::
Mode
::
EXP
);
f
=
m_network
->
add_conv
(
f
,
8
,
{
3
,
3
},
dtype
::
Float32
(),
true
,
{
1
,
1
},
{
1
,
1
});
m_out_var
=
m_network
->
add_pooling
(
f
,
{
2
,
2
},
{
2
,
2
});
f
=
m_network
->
add_pooling
(
f
,
{
2
,
2
},
{
2
,
2
});
m_out_var
=
m_network
->
add_concat
(
f
,
-
f
);
}
std
::
unique_ptr
<
cg
::
AsyncExecutable
>
compile_without_copy
()
{
...
...
@@ -173,8 +177,8 @@ TEST(TestNoCopy, IONoCopyPtrEQ) {
test_graph
.
create_graph
();
auto
func
=
test_graph
.
compile_without_copy
();
auto
&&
outvar
=
func
->
get_output_vars
()[
0
];
DeviceTensorND
dv0
(
test_graph
.
m_cn
,
{
1
,
8
,
7
,
7
});
DeviceTensorND
dv1
(
test_graph
.
m_cn
,
{
1
,
8
,
7
,
7
});
DeviceTensorND
dv0
(
test_graph
.
m_cn
,
{
2
,
8
,
7
,
7
});
DeviceTensorND
dv1
(
test_graph
.
m_cn
,
{
2
,
8
,
7
,
7
});
size_t
times
=
10
;
for
(
size_t
i
=
0
;
i
<
times
;
i
++
)
{
auto
input_tensor
=
test_graph
.
input_tensor
;
...
...
@@ -229,7 +233,7 @@ TEST(TestNoCopy, IONoCopyCorrect) {
ptr
[
d
]
=
i
/
5
+
3
;
}
input_tensor
->
reset
(
storage
,
layout
);
DeviceTensorND
dv
(
test_graph
.
m_cn
,
{
1
,
8
,
7
,
7
});
DeviceTensorND
dv
(
test_graph
.
m_cn
,
{
2
,
8
,
7
,
7
});
outvar
->
init_mem_plan
(
&
dv
);
outvar
->
reset_dev_tensor_from_tensor
(
dv
);
...
...
@@ -258,7 +262,7 @@ TEST(TestNoCopy, IONoCopyRecord) {
HostTensorND
truth
;
auto
func
=
test_graph
.
compile_without_copy
();
auto
&&
outvar
=
func
->
get_output_vars
()[
0
];
DeviceTensorND
tmp
(
test_graph
.
m_cn
,
{
1
,
8
,
7
,
7
});
DeviceTensorND
tmp
(
test_graph
.
m_cn
,
{
2
,
8
,
7
,
7
});
outvar
->
init_mem_plan
(
&
tmp
);
size_t
times
=
10
;
for
(
size_t
i
=
0
;
i
<
times
;
i
++
)
{
...
...
@@ -272,7 +276,7 @@ TEST(TestNoCopy, IONoCopyRecord) {
ptr
[
d
]
=
i
/
5
+
3
;
}
input_tensor
->
only_reset_raw_storage
(
storage
);
DeviceTensorND
dv
(
test_graph
.
m_cn
,
{
1
,
8
,
7
,
7
});
DeviceTensorND
dv
(
test_graph
.
m_cn
,
{
2
,
8
,
7
,
7
});
dv
.
raw_ptr
();
auto
&
dev_tensor
=
outvar
->
mutable_dev_tensor
();
...
...
@@ -306,7 +310,7 @@ void test_subtensor_record(int level) {
HostTensorND
truth
;
auto
func
=
test_graph
.
compile_without_copy
();
auto
&&
outvar
=
func
->
get_output_vars
()[
0
];
DeviceTensorND
tmp
(
test_graph
.
m_cn
,
{
1
,
8
,
7
,
7
});
DeviceTensorND
tmp
(
test_graph
.
m_cn
,
{
2
,
8
,
7
,
7
});
outvar
->
init_mem_plan
(
&
tmp
);
size_t
times
=
10
;
for
(
size_t
i
=
0
;
i
<
times
;
i
++
)
{
...
...
@@ -320,7 +324,7 @@ void test_subtensor_record(int level) {
ptr
[
d
]
=
i
/
5
+
3
;
}
input_tensor
->
only_reset_raw_storage
(
storage
);
DeviceTensorND
dv
(
test_graph
.
m_cn
,
{
1
,
8
,
7
,
7
});
DeviceTensorND
dv
(
test_graph
.
m_cn
,
{
2
,
8
,
7
,
7
});
dv
.
raw_ptr
();
auto
&
dev_tensor
=
outvar
->
mutable_dev_tensor
();
...
...
src/opr/impl/standalone/nms_opr.cpp
浏览文件 @
0be6ca88
...
...
@@ -139,11 +139,9 @@ void NMSKeep::CPUKern::exec(
// See CUDAKern::exec for more explanation on output comp nodes.
CompNode
comp_node
=
out_idx
.
comp_node
();
auto
inp_ptr
=
inp
.
ptr
<
float
>
();
auto
out_idx_ptr
=
reinterpret_cast
<
uint32_t
*>
(
out_idx
.
ptr
<
int32_t
>
()),
out_size_ptr
=
reinterpret_cast
<
uint32_t
*>
(
out_size
.
ptr
<
int32_t
>
());
size_t
batch
=
inp
.
shape
(
0
),
nr_boxes
=
inp
.
shape
(
1
);
if
(
nr_boxes
==
0
)
{
auto
out_size_ptr
=
reinterpret_cast
<
uint32_t
*>
(
out_size
.
ptr
<
int32_t
>
());
for
(
size_t
i
=
0
;
i
<
batch
;
++
i
)
{
*
(
out_size_ptr
+
i
)
=
0
;
}
...
...
@@ -157,6 +155,11 @@ void NMSKeep::CPUKern::exec(
// be dispatched on a different thread
auto
kern
=
[
=
]()
{
for
(
size_t
i
=
0
;
i
<
batch
;
++
i
)
{
auto
inp_ptr
=
inp
.
as_megdnn
().
ptr
<
float
>
();
auto
out_idx_ptr
=
reinterpret_cast
<
uint32_t
*>
(
out_idx
.
as_megdnn
().
ptr
<
int32_t
>
());
auto
out_size_ptr
=
reinterpret_cast
<
uint32_t
*>
(
out_size
.
as_megdnn
().
ptr
<
int32_t
>
());
nms
::
cpu_kern
(
nr_boxes
,
param
.
max_output
,
param
.
iou_thresh
,
inp_ptr
+
i
*
nr_boxes
*
4
,
out_idx_ptr
+
i
*
param
.
max_output
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录