Repository: 机器未来/Paddle (forked from PaddlePaddle/Paddle)

Unverified commit 751a7942, authored Aug 25, 2021 by liutiexing, committed via GitHub on Aug 25, 2021

high-performance SingleThreadedWorkQueue (#35086)

Parent: ff96a7d5
Showing 8 changed files, with 1230 additions and 0 deletions (+1230 −0):

paddle/fluid/framework/new_executor/CMakeLists.txt (+2 −0)
paddle/fluid/framework/new_executor/event_count.h (+272 −0)
paddle/fluid/framework/new_executor/nonblocking_threadpool.h (+516 −0)
paddle/fluid/framework/new_executor/run_queue.h (+267 −0)
paddle/fluid/framework/new_executor/thread_environment.h (+42 −0)
paddle/fluid/framework/new_executor/workqueue.cc (+41 −0)
paddle/fluid/framework/new_executor/workqueue.h (+45 −0)
paddle/fluid/framework/new_executor/workqueue_test.cc (+45 −0)
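Taken together, the new files add a WorkQueue abstraction backed by an Eigen-style work-stealing thread pool. As a quick orientation before the per-file diffs, a minimal usage sketch based only on the workqueue.h interface and the unit test at the end of this commit; the Example function and the lambda body are illustrative, not part of the commit.

// Minimal sketch (illustrative): submitting work to the new work queue.
#include <atomic>
#include "paddle/fluid/framework/new_executor/workqueue.h"

void Example() {
  using paddle::framework::CreateSingleThreadedWorkQueue;
  using paddle::framework::WorkQueue;
  std::unique_ptr<WorkQueue> queue = CreateSingleThreadedWorkQueue();
  std::atomic<int> sum{0};
  for (int i = 0; i < 10; ++i) {
    queue->AddTask([&sum, i]() { sum += i; });  // enqueue asynchronously
  }
  queue->WaitQueueEmpty();  // blocks until every submitted task has finished
  // sum.load() == 45 at this point.
}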
paddle/fluid/framework/new_executor/CMakeLists.txt (+2 −0)

cc_library(interpretercore SRCS interpretercore.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${PYBIND_DEPS} profiler)
cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${PYBIND_DEPS} profiler)
cc_library(workqueue SRCS workqueue.cc)
cc_test(workqueue_test SRCS workqueue_test.cc DEPS workqueue)
# cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
paddle/fluid/framework/new_executor/event_count.h (new file, mode 100644, +272 −0)
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// EventCount allows waiting for arbitrary predicates in non-blocking
// algorithms. Think of condition variable, but wait predicate does not need to
// be protected by a mutex. Usage:
// Waiting thread does:
//
// if (predicate)
// return act();
// EventCount::Waiter& w = waiters[my_index];
// ec.Prewait(&w);
// if (predicate) {
// ec.CancelWait(&w);
// return act();
// }
// ec.CommitWait(&w);
//
// Notifying thread does:
//
// predicate = true;
// ec.Notify(true);
//
// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
// cheap, but they are executed only if the preceding predicate check has
// failed.
//
// Algorithm outline:
// There are two main variables: predicate (managed by user) and state_.
// Operation closely resembles Dekker's mutual exclusion algorithm:
// https://en.wikipedia.org/wiki/Dekker%27s_algorithm
// Waiting thread sets state_ then checks predicate, Notifying thread sets
// predicate then checks state_. Due to seq_cst fences in between these
// operations it is guaranteed that either the waiter will see the predicate
// change and won't block, or the notifying thread will see the state_ change
// and will unblock the waiter, or both. But it can't happen that both threads
// don't see each other's changes, which would lead to deadlock.
#pragma once

#include <atomic>
#include <cassert>
#include <condition_variable>
#include <cstdlib>
#include <mutex>
#include <vector>

namespace paddle {
namespace framework {

class EventCount {
 public:
  class Waiter;

  explicit EventCount(size_t waiter_num) : state_(kStackMask) {
    assert(waiter_num < (1 << kWaiterBits) - 1);
    void* buffer = malloc(sizeof(Waiter) * waiter_num);
    if (buffer == nullptr) {
      return;
    }
    waiters_ = reinterpret_cast<Waiter*>(buffer);
    waiter_num_ = waiter_num;
    for (size_t i = 0; i < waiter_num_; ++i) {
      new (&waiters_[i]) Waiter;
    }
  }

  EventCount(const EventCount&) = delete;
  void operator=(const EventCount&) = delete;

  ~EventCount() {
    // Ensure there are no waiters.
    assert(state_.load() == kStackMask);
    free(waiters_);
  }

  Waiter* GetWaiter(size_t waiter_index) {
    assert(waiter_index < waiter_num_);
    return &waiters_[waiter_index];
  }

  // Prewait prepares for waiting.
  // After calling Prewait, the thread must re-check the wait predicate
  // and then call either CancelWait or CommitWait.
  void Prewait() {
    uint64_t state = state_.load(std::memory_order_relaxed);
    for (;;) {
      CheckState(state);
      uint64_t newstate = state + kWaiterInc;
      CheckState(newstate);
      if (state_.compare_exchange_weak(state, newstate,
                                       std::memory_order_seq_cst))
        return;
    }
  }

  // CommitWait commits waiting after Prewait.
  void CommitWait(Waiter* w) {
    assert((w->epoch & ~kEpochMask) == 0);
    w->state = Waiter::kNotSignaled;
    const uint64_t me = (w - &waiters_[0]) | w->epoch;
    uint64_t state = state_.load(std::memory_order_seq_cst);
    for (;;) {
      CheckState(state, true);
      uint64_t newstate;
      if ((state & kSignalMask) != 0) {
        // Consume the signal and return immediately.
        newstate = state - kWaiterInc - kSignalInc;
      } else {
        // Remove this thread from pre-wait counter and add to the waiter stack.
        newstate = ((state & kWaiterMask) - kWaiterInc) | me;
        w->next.store(state & (kStackMask | kEpochMask),
                      std::memory_order_relaxed);
      }
      CheckState(newstate);
      if (state_.compare_exchange_weak(state, newstate,
                                       std::memory_order_acq_rel)) {
        if ((state & kSignalMask) == 0) {
          w->epoch += kEpochInc;
          Park(w);
        }
        return;
      }
    }
  }

  // CancelWait cancels effects of the previous Prewait call.
  void CancelWait() {
    uint64_t state = state_.load(std::memory_order_relaxed);
    for (;;) {
      CheckState(state, true);
      uint64_t newstate = state - kWaiterInc;
      // We don't know if the thread was also notified or not,
      // so we should not consume a signal unconditionally.
      // Only if number of waiters is equal to number of signals,
      // we know that the thread was notified and we must take away the signal.
      if (((state & kWaiterMask) >> kWaiterShift) ==
          ((state & kSignalMask) >> kSignalShift))
        newstate -= kSignalInc;
      CheckState(newstate);
      if (state_.compare_exchange_weak(state, newstate,
                                       std::memory_order_acq_rel))
        return;
    }
  }

  // Notify wakes one or all waiting threads.
  // Must be called after changing the associated wait predicate.
  void Notify(bool notify_all) {
    std::atomic_thread_fence(std::memory_order_seq_cst);
    uint64_t state = state_.load(std::memory_order_acquire);
    for (;;) {
      CheckState(state);
      const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
      const uint64_t signals = (state & kSignalMask) >> kSignalShift;
      // Easy case: no waiters.
      if ((state & kStackMask) == kStackMask && waiters == signals) return;
      uint64_t newstate;
      if (notify_all) {
        // Empty wait stack and set signal to number of pre-wait threads.
        newstate =
            (state & kWaiterMask) | (waiters << kSignalShift) | kStackMask;
      } else if (signals < waiters) {
        // There is a thread in pre-wait state, unblock it.
        newstate = state + kSignalInc;
      } else {
        // Pop a waiter from list and unpark it.
        Waiter* w = &waiters_[state & kStackMask];
        uint64_t next = w->next.load(std::memory_order_relaxed);
        newstate = (state & (kWaiterMask | kSignalMask)) | next;
      }
      CheckState(newstate);
      if (state_.compare_exchange_weak(state, newstate,
                                       std::memory_order_acq_rel)) {
        if (!notify_all && (signals < waiters))
          return;  // unblocked pre-wait thread
        if ((state & kStackMask) == kStackMask) return;
        Waiter* w = &waiters_[state & kStackMask];
        if (!notify_all) w->next.store(kStackMask, std::memory_order_relaxed);
        Unpark(w);
        return;
      }
    }
  }

  class Waiter {
    friend class EventCount;
    // Align to 128 byte boundary to prevent false sharing with other Waiter
    // objects in the same vector.
    alignas(128) std::atomic<uint64_t> next;
    std::mutex mu;
    std::condition_variable cv;
    uint64_t epoch = 0;
    unsigned state = kNotSignaled;
    enum {
      kNotSignaled,
      kWaiting,
      kSignaled,
    };
  };

 private:
  // State_ layout:
  // - low kWaiterBits is a stack of waiters committed wait
  //   (indexes in waiters_ array are used as stack elements,
  //   kStackMask means empty stack).
  // - next kWaiterBits is count of waiters in prewait state.
  // - next kWaiterBits is count of pending signals.
  // - remaining bits are ABA counter for the stack.
  //   (stored in Waiter node and incremented on push).
  static const uint64_t kWaiterBits = 14;
  static const uint64_t kStackMask = (1ull << kWaiterBits) - 1;
  static const uint64_t kWaiterShift = kWaiterBits;
  static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1)
                                      << kWaiterShift;
  static const uint64_t kWaiterInc = 1ull << kWaiterShift;
  static const uint64_t kSignalShift = 2 * kWaiterBits;
  static const uint64_t kSignalMask = ((1ull << kWaiterBits) - 1)
                                      << kSignalShift;
  static const uint64_t kSignalInc = 1ull << kSignalShift;
  static const uint64_t kEpochShift = 3 * kWaiterBits;
  static const uint64_t kEpochBits = 64 - kEpochShift;
  static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
  static const uint64_t kEpochInc = 1ull << kEpochShift;
  std::atomic<uint64_t> state_;
  Waiter* waiters_{nullptr};
  size_t waiter_num_{0};

  static void CheckState(uint64_t state, bool waiter = false) {
    static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem");
    const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
    const uint64_t signals = (state & kSignalMask) >> kSignalShift;
    assert(waiters >= signals);
    assert(waiters < (1 << kWaiterBits) - 1);
    assert(!waiter || waiters > 0);
    (void)waiters;
    (void)signals;
  }

  void Park(Waiter* w) {
    std::unique_lock<std::mutex> lock(w->mu);
    while (w->state != Waiter::kSignaled) {
      w->state = Waiter::kWaiting;
      w->cv.wait(lock);
    }
  }

  void Unpark(Waiter* w) {
    for (Waiter* next; w; w = next) {
      uint64_t wnext = w->next.load(std::memory_order_relaxed) & kStackMask;
      next = wnext == kStackMask ? nullptr : &waiters_[wnext];
      unsigned state;
      {
        std::unique_lock<std::mutex> lock(w->mu);
        state = w->state;
        w->state = Waiter::kSignaled;
      }
      // Avoid notifying if it wasn't waiting.
      if (state == Waiter::kWaiting) w->cv.notify_one();
    }
  }
};

}  // namespace framework
}  // namespace paddle
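The Prewait/CommitWait/CancelWait/Notify protocol documented at the top of event_count.h can be easier to follow as a compilable sketch. This is only an illustration of the usage pattern from that comment, assuming an EventCount constructed with one waiter slot; the predicate flag and function names are placeholders, not part of the commit.

// Illustrative sketch of the waiter/notifier protocol described above.
#include <atomic>
#include "paddle/fluid/framework/new_executor/event_count.h"

namespace {
std::atomic<bool> predicate{false};

void WaiterSide(paddle::framework::EventCount* ec) {
  paddle::framework::EventCount::Waiter* w = ec->GetWaiter(0);
  if (predicate.load()) return;  // fast path, nothing to wait for
  ec->Prewait();                 // announce the intent to block
  if (predicate.load()) {
    ec->CancelWait();            // predicate changed meanwhile, back out
    return;
  }
  ec->CommitWait(w);             // park until a notifier wakes us
}

void NotifierSide(paddle::framework::EventCount* ec) {
  predicate.store(true);  // change the predicate first...
  ec->Notify(false);      // ...then wake at most one waiter
}
}  // namespace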
paddle/fluid/framework/new_executor/nonblocking_threadpool.h (new file, mode 100644, +516 −0)
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#pragma once

#include <atomic>
#include <cstdlib>
#include <vector>
#include "paddle/fluid/framework/new_executor/event_count.h"
#include "paddle/fluid/framework/new_executor/run_queue.h"
#include "paddle/fluid/framework/new_executor/thread_environment.h"

namespace paddle {
namespace framework {

class CounterTracker {
 public:
  explicit CounterTracker(std::atomic<uint64_t>* counter, EventCount* ec)
      : counter_(counter), ec_(ec) {
    counter_->fetch_add(1, std::memory_order_relaxed);
  }

  ~CounterTracker() {
    if (counter_ != nullptr) {
      if (1 == counter_->fetch_sub(1, std::memory_order_relaxed)) {
        ec_->Notify(true);
      }
    }
  }

  CounterTracker(CounterTracker&& other)
      : counter_(other.counter_), ec_(other.ec_) {
    other.counter_ = nullptr;
    other.ec_ = nullptr;
  }

  CounterTracker& operator=(CounterTracker&& other) {
    counter_ = other.counter_;
    ec_ = other.ec_;
    other.counter_ = nullptr;
    other.ec_ = nullptr;
    return *this;
  }

  CounterTracker(const CounterTracker& other)
      : counter_(other.counter_), ec_(other.ec_) {
    counter_->fetch_add(1, std::memory_order_relaxed);
  }

  CounterTracker& operator=(const CounterTracker&) = delete;

 private:
  std::atomic<uint64_t>* counter_{nullptr};
  EventCount* ec_{nullptr};
};

template <typename Environment>
class ThreadPoolTempl {
 public:
  typedef typename Environment::Task Task;
  typedef RunQueue<Task, 1024> Queue;

  explicit ThreadPoolTempl(int num_threads, Environment env = Environment())
      : ThreadPoolTempl(num_threads, true, env) {}

  ThreadPoolTempl(int num_threads, bool allow_spinning,
                  Environment env = Environment())
      : env_(env),
        num_threads_(num_threads),
        allow_spinning_(allow_spinning),
        thread_data_(num_threads),
        global_steal_partition_(EncodePartition(0, num_threads_)),
        blocked_(0),
        spinning_(0),
        done_(false),
        cancelled_(false),
        ec_(num_threads_),
        wait_empty_(false),
        wait_empty_ec_(1),
        num_tasks_(0) {
    // Calculate coprimes of all numbers [1, num_threads].
    // Coprimes are used for random walks over all threads in Steal
    // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
    // a random starting thread index t and calculate num_threads - 1 subsequent
    // indices as (t + coprime) % num_threads, we will cover all threads without
    // repetitions (effectively getting a pseudo-random permutation of thread
    // indices).
    assert(num_threads_ >= 1 && num_threads_ < kMaxThreads);
    all_coprimes_.reserve(num_threads_);
    for (int i = 1; i <= num_threads_; ++i) {
      all_coprimes_.emplace_back();
      all_coprimes_.back().push_back(i);
      ComputeCoprimes(i, &(all_coprimes_.back()));
    }
    for (int i = 0; i < num_threads_; i++) {
      SetStealPartition(i, EncodePartition(0, num_threads_));
      thread_data_[i].thread.reset(
          env_.CreateThread([this, i]() { WorkerLoop(i); }));
    }
  }

  ~ThreadPoolTempl() {
    done_ = true;
    // Now if all threads block without work, they will start exiting.
    // But note that threads can continue to work arbitrarily long,
    // block, submit new work, unblock and otherwise live full life.
    if (!cancelled_) {
      ec_.Notify(true);
    } else {
      // Since we were cancelled, there might be entries in the queues.
      // Empty them to prevent their destructor from asserting.
      for (size_t i = 0; i < thread_data_.size(); i++) {
        thread_data_[i].queue.Flush();
      }
    }
    // Join threads explicitly (by destroying) to avoid destruction order within
    // this class.
    for (size_t i = 0; i < thread_data_.size(); ++i) {
      thread_data_[i].thread.reset();
    }
  }

  void SetStealPartitions(
      const std::vector<std::pair<unsigned, unsigned>>& partitions) {
    assert(partitions.size() == static_cast<std::size_t>(num_threads_));
    // Pass this information to each thread queue.
    for (int i = 0; i < num_threads_; i++) {
      const auto& pair = partitions[i];
      unsigned start = pair.first, end = pair.second;
      AssertBounds(start, end);
      unsigned val = EncodePartition(start, end);
      SetStealPartition(i, val);
    }
  }

  void AddTask(std::function<void()> fn) {
    AddTaskWithHint(std::move(fn), 0, num_threads_);
  }

  void AddTaskWithHint(std::function<void()> fn, int start, int limit) {
    Task t = env_.CreateTask(
        [task = std::move(fn),
         raii = CounterTracker(&num_tasks_, &wait_empty_ec_)]() mutable {
          task();
        });
    PerThread* pt = GetPerThread();
    if (pt->pool == this) {
      // Worker thread of this pool, push onto the thread's queue.
      Queue& q = thread_data_[pt->thread_id].queue;
      t = q.PushFront(std::move(t));
    } else if (wait_empty_.load() == false) {
      // A free-standing thread (or worker of another pool), push onto a random
      // queue.
      assert(start < limit);
      assert(limit <= num_threads_);
      int num_queues = limit - start;
      int rnd = Rand(&pt->rand) % num_queues;
      assert(start + rnd < limit);
      Queue& q = thread_data_[start + rnd].queue;
      t = q.PushBack(std::move(t));
    }
    // Note: below we touch this after making w available to worker threads.
    // Strictly speaking, this can lead to a racy-use-after-free. Consider that
    // Schedule is called from a thread that is neither main thread nor a worker
    // thread of this pool. Then, execution of w directly or indirectly
    // completes overall computations, which in turn leads to destruction of
    // this. We expect that such a scenario is prevented by the program, that is,
    // this is kept alive while any threads can potentially be in Schedule.
    if (!t.f) {
      ec_.Notify(false);
    } else {
      env_.ExecuteTask(t);  // Push failed, execute directly.
    }
  }

  void WaitQueueEmpty() {
    bool waiting = wait_empty_.load();
    assert(waiting == false);
    if (waiting ||
        !wait_empty_.compare_exchange_strong(waiting, true,
                                             std::memory_order_acquire)) {
      abort();
    }
    EventCount::Waiter* w = wait_empty_ec_.GetWaiter(0);
    wait_empty_ec_.Prewait();
    if (num_tasks_.load() == 0) {
      wait_empty_ec_.CancelWait();
    } else {
      wait_empty_ec_.CommitWait(w);
    }
    waiting = true;
    if (!waiting ||
        !wait_empty_.compare_exchange_strong(waiting, false,
                                             std::memory_order_acquire)) {
      abort();
    }
  }

  void Cancel() {
    cancelled_ = true;
    done_ = true;
    // Wake up the threads without work to let them exit on their own.
    ec_.Notify(true);
  }

  size_t NumThreads() const { return num_threads_; }

  int CurrentThreadId() const {
    const PerThread* pt = const_cast<ThreadPoolTempl*>(this)->GetPerThread();
    if (pt->pool == this) {
      return pt->thread_id;
    } else {
      return -1;
    }
  }

 private:
  // Create a single atomic<int> that encodes start and limit information for
  // each thread.
  // We expect num_threads_ < 65536, so we can store them in a single
  // std::atomic<unsigned>.
  // Exposed publicly as static functions so that external callers can reuse
  // this encode/decode logic for maintaining their own thread-safe copies of
  // scheduling and steal domain(s).
  static const int kMaxPartitionBits = 16;
  static const int kMaxThreads = 1 << kMaxPartitionBits;

  inline unsigned EncodePartition(unsigned start, unsigned limit) {
    return (start << kMaxPartitionBits) | limit;
  }

  inline void DecodePartition(unsigned val, unsigned* start, unsigned* limit) {
    *limit = val & (kMaxThreads - 1);
    val >>= kMaxPartitionBits;
    *start = val;
  }

  void AssertBounds(int start, int end) {
    assert(start >= 0);
    assert(start < end);  // non-zero sized partition
    assert(end <= num_threads_);
  }

  inline void SetStealPartition(size_t i, unsigned val) {
    thread_data_[i].steal_partition.store(val, std::memory_order_relaxed);
  }

  inline unsigned GetStealPartition(int i) {
    return thread_data_[i].steal_partition.load(std::memory_order_relaxed);
  }

  inline void ComputeCoprimes(int n, std::vector<unsigned>* coprimes) {
    for (int i = 1; i <= n; i++) {
      unsigned a = i;
      unsigned b = n;
      // If GCD(a, b) == 1, then a and b are coprimes.
      while (b != 0) {
        unsigned tmp = a;
        a = b;
        b = tmp % b;
      }
      if (a == 1) {
        coprimes->push_back(i);
      }
    }
  }

  typedef typename Environment::EnvThread Thread;

  struct PerThread {
    constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) {}
    ThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
    uint64_t rand;          // Random generator state.
    int thread_id;          // Worker thread index in pool.
  };

  struct ThreadData {
    constexpr ThreadData() : thread(), steal_partition(0), queue() {}
    std::unique_ptr<Thread> thread;
    std::atomic<unsigned> steal_partition;
    Queue queue;
  };

  Environment env_;
  const int num_threads_;
  const bool allow_spinning_;
  std::vector<ThreadData> thread_data_;
  std::vector<std::vector<unsigned>> all_coprimes_;
  unsigned global_steal_partition_;
  std::atomic<unsigned> blocked_;
  std::atomic<bool> spinning_;
  std::atomic<bool> done_;
  std::atomic<bool> cancelled_;
  EventCount ec_;
  std::atomic<bool> wait_empty_;
  EventCount wait_empty_ec_;
  std::atomic<uint64_t> num_tasks_;

  // Main worker thread loop.
  void WorkerLoop(int thread_id) {
    PerThread* pt = GetPerThread();
    pt->pool = this;
    pt->rand = GlobalThreadIdHash();
    pt->thread_id = thread_id;
    Queue& q = thread_data_[thread_id].queue;
    EventCount::Waiter* waiter = ec_.GetWaiter(thread_id);
    // TODO(dvyukov,rmlarsen): The time spent in NonEmptyQueueIndex() is
    // proportional to num_threads_ and we assume that new work is scheduled at
    // a constant rate, so we set spin_count to 5000 / num_threads_. The
    // constant was picked based on a fair dice roll, tune it.
    const int spin_count =
        allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0;
    if (num_threads_ == 1) {
      // For num_threads_ == 1 there is no point in going through the expensive
      // steal loop. Moreover, since NonEmptyQueueIndex() calls PopBack() on the
      // victim queues it might reverse the order in which ops are executed
      // compared to the order in which they are added, which tends to be
      // counter-productive for the types of I/O workloads the single thread
      // pools tend to be used for.
      while (!cancelled_) {
        Task t = q.PopFront();
        for (int i = 0; i < spin_count && !t.f; i++) {
          if (!cancelled_.load(std::memory_order_relaxed)) {
            t = q.PopFront();
          }
        }
        if (!t.f) {
          if (!WaitForWork(waiter, &t)) {
            return;
          }
        }
        if (t.f) {
          env_.ExecuteTask(t);
        }
      }
    } else {
      while (!cancelled_) {
        Task t = q.PopFront();
        if (!t.f) {
          t = LocalSteal();
          if (!t.f) {
            t = GlobalSteal();
            if (!t.f) {
              // Leave one thread spinning. This reduces latency.
              if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
                for (int i = 0; i < spin_count && !t.f; i++) {
                  if (!cancelled_.load(std::memory_order_relaxed)) {
                    t = GlobalSteal();
                  } else {
                    return;
                  }
                }
                spinning_ = false;
              }
              if (!t.f) {
                if (!WaitForWork(waiter, &t)) {
                  return;
                }
              }
            }
          }
        }
        if (t.f) {
          env_.ExecuteTask(t);
        }
      }
    }
  }

  // Steal tries to steal work from other worker threads in the range [start,
  // limit) in best-effort manner.
  Task Steal(unsigned start, unsigned limit) {
    PerThread* pt = GetPerThread();
    const size_t size = limit - start;
    unsigned r = Rand(&pt->rand);
    // Reduce r into [0, size) range, this utilizes trick from
    // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
    assert(all_coprimes_[size - 1].size() < (1 << 30));
    unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32;
    unsigned index =
        ((uint64_t)all_coprimes_[size - 1].size() * (uint64_t)r) >> 32;
    unsigned inc = all_coprimes_[size - 1][index];
    for (unsigned i = 0; i < size; i++) {
      assert(start + victim < limit);
      Task t = thread_data_[start + victim].queue.PopBack();
      if (t.f) {
        return t;
      }
      victim += inc;
      if (victim >= size) {
        victim -= size;
      }
    }
    return Task();
  }

  // Steals work within threads belonging to the partition.
  Task LocalSteal() {
    PerThread* pt = GetPerThread();
    unsigned partition = GetStealPartition(pt->thread_id);
    // If thread steal partition is the same as global partition, there is no
    // need to go through the steal loop twice.
    if (global_steal_partition_ == partition) return Task();
    unsigned start, limit;
    DecodePartition(partition, &start, &limit);
    AssertBounds(start, limit);
    return Steal(start, limit);
  }

  // Steals work from any other thread in the pool.
  Task GlobalSteal() { return Steal(0, num_threads_); }

  // WaitForWork blocks until new work is available (returns true), or if it is
  // time to exit (returns false). Can optionally return a task to execute in t
  // (in such case t.f != nullptr on return).
  bool WaitForWork(EventCount::Waiter* waiter, Task* t) {
    assert(t != nullptr && !t->f);
    // We already did best-effort emptiness check in Steal, so prepare for
    // blocking.
    ec_.Prewait();
    // Now do a reliable emptiness check.
    int victim = NonEmptyQueueIndex();
    if (victim != -1) {
      ec_.CancelWait();
      if (cancelled_) {
        return false;
      } else {
        *t = thread_data_[victim].queue.PopBack();
        return true;
      }
    }
    // Number of blocked threads is used as termination condition.
    // If we are shutting down and all worker threads blocked without work,
    // then we are done.
    blocked_++;
    if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) {
      ec_.CancelWait();
      // Almost done, but need to re-check queues.
      // Consider that all queues are empty and all worker threads are preempted
      // right after incrementing blocked_ above. Now a free-standing thread
      // submits work and calls destructor (which sets done_). If we don't
      // re-check queues, we will exit leaving the work unexecuted.
      if (NonEmptyQueueIndex() != -1) {
        // Note: we must not pop from queues before we decrement blocked_,
        // otherwise the following scenario is possible. Consider that instead
        // of checking for emptiness we popped the only element from queues.
        // Now other worker threads can start exiting, which is bad if the
        // work item submits other work. So we just check emptiness here,
        // which ensures that all worker threads exit at the same time.
        blocked_--;
        return true;
      }
      // Reached stable termination state.
      ec_.Notify(true);
      return false;
    }
    ec_.CommitWait(waiter);
    blocked_--;
    return true;
  }

  int NonEmptyQueueIndex() {
    PerThread* pt = GetPerThread();
    // We intentionally design NonEmptyQueueIndex to steal work from
    // anywhere in the queue so threads don't block in WaitForWork() forever
    // when all threads in their partition go to sleep. Steal is still local.
    const size_t size = thread_data_.size();
    unsigned r = Rand(&pt->rand);
    unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()];
    unsigned victim = r % size;
    for (unsigned i = 0; i < size; i++) {
      if (!thread_data_[victim].queue.Empty()) {
        return victim;
      }
      victim += inc;
      if (victim >= size) {
        victim -= size;
      }
    }
    return -1;
  }

  static inline uint64_t GlobalThreadIdHash() {
    return std::hash<std::thread::id>()(std::this_thread::get_id());
  }

  inline PerThread* GetPerThread() {
    static thread_local PerThread per_thread_;
    PerThread* pt = &per_thread_;
    return pt;
  }

  static inline unsigned Rand(uint64_t* state) {
    uint64_t current = *state;
    // Update the internal state
    *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
    // Generate the random output (using the PCG-XSH-RS scheme)
    return static_cast<unsigned>((current ^ (current >> 22)) >>
                                 (22 + (current >> 61)));
  }
};

using NonblockingThreadPool = ThreadPoolTempl<StlThreadEnvironment>;

}  // namespace framework
}  // namespace paddle
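Elsewhere in this commit the pool is only reached through the WorkQueue wrapper in workqueue.cc, but the class above can also be driven directly. A minimal sketch under that assumption; the function name and task body are illustrative.

// Illustrative sketch: using NonblockingThreadPool directly.
#include <atomic>
#include "paddle/fluid/framework/new_executor/nonblocking_threadpool.h"

void PoolExample() {
  using paddle::framework::NonblockingThreadPool;
  NonblockingThreadPool pool(/*num_threads=*/2);
  std::atomic<int> done{0};
  for (int i = 0; i < 4; ++i) {
    pool.AddTask([&done]() { done.fetch_add(1, std::memory_order_relaxed); });
  }
  pool.WaitQueueEmpty();  // returns once the internal task counter drops to zero
  // done.load() == 4 here; the destructor joins the worker threads.
}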
paddle/fluid/framework/new_executor/run_queue.h (new file, mode 100644, +267 −0)
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// RunQueue is a fixed-size, partially non-blocking deque of Work items.
// Operations on front of the queue must be done by a single thread (owner),
// operations on back of the queue can be done by multiple threads concurrently.
//
// Algorithm outline:
// All remote threads operating on the queue back are serialized by a mutex.
// This ensures that at most two threads access state: owner and one remote
// thread (Size aside). The algorithm ensures that the occupied region of the
// underlying array is logically continuous (can wraparound, but no stray
// occupied elements). Owner operates on one end of this region, remote thread
// operates on the other end. Synchronization between these threads
// (potential consumption of the last element and take up of the last empty
// element) happens by means of state variable in each element. States are:
// empty, busy (in process of insertion or removal) and ready. Threads claim
// elements (empty->busy and ready->busy transitions) by means of a CAS
// operation. The finishing transitions (busy->empty and busy->ready) are done
// with plain store as the element is exclusively owned by the current thread.
//
// Note: we could permit only pointers as elements, then we would not need
// separate state variable as null/non-null pointer value would serve as state,
// but that would require malloc/free per operation for large, complex values
// (and this is designed to store std::function<()>).
#pragma once

#include <atomic>
#include <cassert>
#include <cstdint>
#include <mutex>
#include <vector>

namespace paddle {
namespace framework {

template <typename Work, unsigned kSize>
class RunQueue {
 public:
  RunQueue() : front_(0), back_(0) {
    // require power-of-two for fast masking
    static_assert((kSize & (kSize - 1)) == 0,
                  "need to be a power of two for fast masking");
    static_assert(
        kSize > 2,
        "need to be in [4, 65536] range to leave enough space for counter");
    static_assert(
        kSize <= (64 << 10),
        "need to be in [4, 65536] range to leave enough space for counter");
    for (unsigned i = 0; i < kSize; i++)
      array_[i].state.store(kEmpty, std::memory_order_relaxed);
  }

  RunQueue(const RunQueue&) = delete;
  void operator=(const RunQueue&) = delete;

  ~RunQueue() { assert(Size() == 0); }

  // PushFront inserts w at the beginning of the queue.
  // If queue is full returns w, otherwise returns default-constructed Work.
  Work PushFront(Work w) {
    unsigned front = front_.load(std::memory_order_relaxed);
    Elem* e = &array_[front & kMask];
    uint8_t s = e->state.load(std::memory_order_relaxed);
    if (s != kEmpty || !e->state.compare_exchange_strong(
                           s, kBusy, std::memory_order_acquire)) {
      return w;
    }
    front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed);
    e->w = std::move(w);
    e->state.store(kReady, std::memory_order_release);
    return Work();
  }

  // PopFront removes and returns the first element in the queue.
  // If the queue was empty returns default-constructed Work.
  Work PopFront() {
    unsigned front = front_.load(std::memory_order_relaxed);
    Elem* e = &array_[(front - 1) & kMask];
    uint8_t s = e->state.load(std::memory_order_relaxed);
    if (s != kReady || !e->state.compare_exchange_strong(
                           s, kBusy, std::memory_order_acquire)) {
      return Work();
    }
    Work w = std::move(e->w);
    e->state.store(kEmpty, std::memory_order_release);
    front = ((front - 1) & kMask2) | (front & ~kMask2);
    front_.store(front, std::memory_order_relaxed);
    return w;
  }

  // PushBack adds w at the end of the queue.
  // If queue is full returns w, otherwise returns default-constructed Work.
  Work PushBack(Work w) {
    std::unique_lock<std::mutex> lock(mutex_);
    unsigned back = back_.load(std::memory_order_relaxed);
    Elem* e = &array_[(back - 1) & kMask];
    uint8_t s = e->state.load(std::memory_order_relaxed);
    if (s != kEmpty || !e->state.compare_exchange_strong(
                           s, kBusy, std::memory_order_acquire)) {
      return w;
    }
    back = ((back - 1) & kMask2) | (back & ~kMask2);
    back_.store(back, std::memory_order_relaxed);
    e->w = std::move(w);
    e->state.store(kReady, std::memory_order_release);
    return Work();
  }

  // PopBack removes and returns the last element in the queue.
  Work PopBack() {
    if (Empty()) {
      return Work();
    }
    std::unique_lock<std::mutex> lock(mutex_);
    unsigned back = back_.load(std::memory_order_relaxed);
    Elem* e = &array_[back & kMask];
    uint8_t s = e->state.load(std::memory_order_relaxed);
    if (s != kReady || !e->state.compare_exchange_strong(
                           s, kBusy, std::memory_order_acquire)) {
      return Work();
    }
    Work w = std::move(e->w);
    e->state.store(kEmpty, std::memory_order_release);
    back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
    return w;
  }

  // PopBackHalf removes and returns up to half of the last elements in the
  // queue. Returns the number of elements removed.
  unsigned PopBackHalf(std::vector<Work>* result) {
    if (Empty()) {
      return 0;
    }
    std::unique_lock<std::mutex> lock(mutex_);
    unsigned back = back_.load(std::memory_order_relaxed);
    unsigned size = Size();
    unsigned mid = back;
    if (size > 1) mid = back + (size - 1) / 2;
    unsigned n = 0;
    unsigned start = 0;
    for (; static_cast<int>(mid - back) >= 0; mid--) {
      Elem* e = &array_[mid & kMask];
      uint8_t s = e->state.load(std::memory_order_relaxed);
      if (n == 0) {
        if (s != kReady || !e->state.compare_exchange_strong(
                               s, kBusy, std::memory_order_acquire))
          continue;
        start = mid;
      } else {
        // Note: no need to store temporal kBusy, we exclusively own these
        // elements.
        assert(s == kReady);
      }
      result->push_back(std::move(e->w));
      e->state.store(kEmpty, std::memory_order_release);
      n++;
    }
    if (n != 0) {
      back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed);
    }
    return n;
  }

  // Size returns current queue size.
  // Can be called by any thread at any time.
  unsigned Size() const { return SizeOrNotEmpty<true>(); }

  // Empty tests whether container is empty.
  // Can be called by any thread at any time.
  bool Empty() const { return SizeOrNotEmpty<false>() == 0; }

  // Delete all the elements from the queue.
  void Flush() {
    while (!Empty()) {
      PopFront();
    }
  }

 private:
  static const unsigned kMask = kSize - 1;
  static const unsigned kMask2 = (kSize << 1) - 1;

  struct Elem {
    std::atomic<uint8_t> state;
    Work w;
  };

  enum {
    kEmpty,
    kBusy,
    kReady,
  };

  std::mutex mutex_;
  // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
  // front/back, respectively. The remaining bits contain modification counters
  // that are incremented on Push operations. This allows us to (1) distinguish
  // between empty and full conditions (if we would use log(kSize) bits for
  // position, these conditions would be indistinguishable); (2) obtain
  // consistent snapshot of front_/back_ for Size operation using the
  // modification counters.
  std::atomic<unsigned> front_;
  std::atomic<unsigned> back_;
  Elem array_[kSize];

  // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
  // only whether the size is 0 is guaranteed to be correct.
  // Can be called by any thread at any time.
  template <bool NeedSizeEstimate>
  unsigned SizeOrNotEmpty() const {
    // Emptiness plays critical role in thread pool blocking. So we go to great
    // effort to not produce false positives (claim non-empty queue as empty).
    unsigned front = front_.load(std::memory_order_acquire);
    for (;;) {
      // Capture a consistent snapshot of front/tail.
      unsigned back = back_.load(std::memory_order_acquire);
      unsigned front1 = front_.load(std::memory_order_relaxed);
      if (front != front1) {
        front = front1;
        std::atomic_thread_fence(std::memory_order_acquire);
        continue;
      }
      if (NeedSizeEstimate) {
        return CalculateSize(front, back);
      } else {
        // This value will be 0 if the queue is empty, and undefined otherwise.
        unsigned maybe_zero = ((front ^ back) & kMask2);
        // Queue size estimate must agree with maybe zero check on the queue
        // empty/non-empty state.
        assert((CalculateSize(front, back) == 0) == (maybe_zero == 0));
        return maybe_zero;
      }
    }
  }

  inline unsigned CalculateSize(unsigned front, unsigned back) const {
    int size = (front & kMask2) - (back & kMask2);
    // Fix overflow.
    if (size < 0) {
      size += 2 * kSize;
    }
    // Order of modification in push/pop is crafted to make the queue look
    // larger than it is during concurrent modifications. E.g. push can
    // increment size before the corresponding pop has decremented it.
    // So the computed size can be up to kSize + 1, fix it.
    if (size > static_cast<int>(kSize)) {
      size = kSize;
    }
    return static_cast<unsigned>(size);
  }
};

}  // namespace framework
}  // namespace paddle
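A short sketch of the ownership contract described in the header comment: the owning worker uses PushFront/PopFront on its end of the deque, while other threads go through the mutex-serialized PushBack/PopBack. The Work type instantiation and function name below are illustrative only.

// Illustrative sketch: the owner operates on the front, thieves on the back.
#include <functional>
#include "paddle/fluid/framework/new_executor/run_queue.h"

void RunQueueExample() {
  using Work = std::function<void()>;
  paddle::framework::RunQueue<Work, 1024> q;  // kSize must be a power of two
  // Owner thread: PushFront returns a default-constructed Work on success,
  // or hands the item back if the queue is full.
  Work rejected = q.PushFront([]() { /* task body */ });
  if (rejected) rejected();  // full queue: run the task inline instead
  // A stealing thread would call q.PopBack() (or PopBackHalf) concurrently.
  Work stolen = q.PopBack();
  if (stolen) stolen();
  q.Flush();  // drain before destruction; the destructor asserts Size() == 0
}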
paddle/fluid/framework/new_executor/thread_environment.h (new file, mode 100644, +42 −0)
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#pragma once

#include <functional>
#include <thread>

namespace paddle {
namespace framework {

struct StlThreadEnvironment {
  struct Task {
    std::function<void()> f;
  };

  // EnvThread constructor must start the thread,
  // destructor must join the thread.
  class EnvThread {
   public:
    explicit EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
    ~EnvThread() { thr_.join(); }

   private:
    std::thread thr_;
  };

  EnvThread* CreateThread(std::function<void()> f) {
    return new EnvThread(std::move(f));
  }
  Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
  void ExecuteTask(const Task& t) { t.f(); }
};

}  // namespace framework
}  // namespace paddle
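For completeness, a tiny sketch of the Environment contract that ThreadPoolTempl is parameterized on, exercised through the STL-backed implementation above; the function name and lambdas are illustrative.

// Illustrative sketch of the Environment interface used by ThreadPoolTempl.
#include "paddle/fluid/framework/new_executor/thread_environment.h"

void EnvironmentExample() {
  paddle::framework::StlThreadEnvironment env;
  // Wrap a callable in the environment's Task type and run it inline.
  auto task = env.CreateTask([]() { /* work */ });
  env.ExecuteTask(task);
  // CreateThread starts the thread immediately; deleting the EnvThread joins it.
  auto* thr = env.CreateThread([]() { /* runs on a new thread */ });
  delete thr;
}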
paddle/fluid/framework/new_executor/workqueue.cc (new file, mode 100644, +41 −0)
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "paddle/fluid/framework/new_executor/workqueue.h"
#include "paddle/fluid/framework/new_executor/nonblocking_threadpool.h"
namespace
paddle
{
namespace
framework
{
class
SingleThreadedWorkQueue
:
public
WorkQueue
{
public:
SingleThreadedWorkQueue
()
:
queue_
(
1
)
{}
SingleThreadedWorkQueue
(
const
SingleThreadedWorkQueue
&
)
=
delete
;
SingleThreadedWorkQueue
&
operator
=
(
const
SingleThreadedWorkQueue
&
)
=
delete
;
virtual
~
SingleThreadedWorkQueue
()
=
default
;
void
AddTask
(
std
::
function
<
void
()
>
fn
)
override
{
queue_
.
AddTask
(
std
::
move
(
fn
));
}
void
WaitQueueEmpty
()
override
{
queue_
.
WaitQueueEmpty
();
}
size_t
NumThreads
()
override
{
return
queue_
.
NumThreads
();
}
private:
NonblockingThreadPool
queue_
;
};
std
::
unique_ptr
<
WorkQueue
>
CreateSingleThreadedWorkQueue
()
{
std
::
unique_ptr
<
WorkQueue
>
ptr
(
new
SingleThreadedWorkQueue
);
return
std
::
move
(
ptr
);
}
}
// namespace framework
}
// namespace paddle
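Note that workqueue.h (next file) also declares CreateMultiThreadedWorkQueue(int num_threads), for which this commit provides no definition. Purely as a hypothetical sketch of what such a variant could look like if it mirrored SingleThreadedWorkQueue, assumed to live alongside the code above in paddle::framework; this is not code from the commit.

// Hypothetical sketch only -- not part of this commit.
class MultiThreadedWorkQueue : public WorkQueue {
 public:
  explicit MultiThreadedWorkQueue(int num_threads) : queue_(num_threads) {}
  MultiThreadedWorkQueue(const MultiThreadedWorkQueue&) = delete;
  MultiThreadedWorkQueue& operator=(const MultiThreadedWorkQueue&) = delete;
  virtual ~MultiThreadedWorkQueue() = default;
  void AddTask(std::function<void()> fn) override {
    queue_.AddTask(std::move(fn));
  }
  void WaitQueueEmpty() override { queue_.WaitQueueEmpty(); }
  size_t NumThreads() override { return queue_.NumThreads(); }

 private:
  NonblockingThreadPool queue_;
};

std::unique_ptr<WorkQueue> CreateMultiThreadedWorkQueue(int num_threads) {
  return std::unique_ptr<WorkQueue>(new MultiThreadedWorkQueue(num_threads));
}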
paddle/fluid/framework/new_executor/workqueue.h (new file, mode 100644, +45 −0)
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include <functional>
#include <memory>

namespace paddle {
namespace framework {

class WorkQueue {
 public:
  WorkQueue() = default;

  WorkQueue(const WorkQueue&) = delete;

  WorkQueue& operator=(const WorkQueue&) = delete;

  virtual ~WorkQueue() = default;

  virtual void AddTask(std::function<void()> fn) = 0;

  virtual void WaitQueueEmpty() = 0;

  virtual size_t NumThreads() = 0;
};

std::unique_ptr<WorkQueue> CreateSingleThreadedWorkQueue();

std::unique_ptr<WorkQueue> CreateMultiThreadedWorkQueue(int num_threads);

}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/new_executor/workqueue_test.cc (new file, mode 100644, +45 −0)
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/new_executor/workqueue.h"
#include <atomic>
#include "glog/logging.h"
#include "gtest/gtest.h"
TEST
(
WorkQueue
,
TestSingleThreadedWorkQueue
)
{
VLOG
(
1
)
<<
"In Test"
;
using
paddle
::
framework
::
WorkQueue
;
using
paddle
::
framework
::
CreateSingleThreadedWorkQueue
;
std
::
atomic
<
bool
>
finished
{
false
};
std
::
atomic
<
unsigned
>
counter
{
0
};
constexpr
unsigned
kLoopNum
=
1000000
;
// CreateSingleThreadedWorkQueue
std
::
unique_ptr
<
WorkQueue
>
work_queue
=
CreateSingleThreadedWorkQueue
();
// NumThreads
EXPECT_EQ
(
work_queue
->
NumThreads
(),
1u
);
// AddTask
EXPECT_EQ
(
finished
.
load
(),
false
);
EXPECT_EQ
(
counter
.
load
(),
0u
);
work_queue
->
AddTask
([
&
counter
,
&
finished
,
kLoopNum
]()
{
for
(
unsigned
i
=
0
;
i
<
kLoopNum
;
++
i
)
{
++
counter
;
}
finished
=
true
;
});
// WaitQueueEmpty
EXPECT_EQ
(
finished
.
load
(),
false
);
work_queue
->
WaitQueueEmpty
();
EXPECT_EQ
(
finished
.
load
(),
true
);
EXPECT_EQ
(
counter
.
load
(),
kLoopNum
);
}