提交 4e91d8d2 编写于 作者: W WangZhen

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into graph_quantization

test=develop
......@@ -37,7 +37,7 @@ else()
variable_response.cc
collective_client.cc collective_server.cc
${BRPC_SRCS}
PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto
PROTO send_recv.proto
DEPS lod_tensor selected_rows memory)
# RPC_DEPS bundles sendrecvop_rpc with brpc and the libraries listed alongside
# it (ssl, crypto, protobuf, leveldb, snappystream, snappy, zlib).
# NOTE(review): presumably consumed as a dependency list by targets outside
# this hunk — confirm against the rest of the CMake file.
set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib)
......
......@@ -21,20 +21,20 @@ namespace operators {
// Flag values describing which optional inputs (scale, bias) a group-norm
// kernel is given. The two values are distinct bits (1 and 2), so combining
// them yields the 0..3 `flags` value that CHECK_CASE / UNROLL_ALL_CASES
// compare against (0: neither, 1: scale only, 2: bias only, 3: both).
enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 };
// CHECK_CASE(i, flags, kernel_name, <kernel args>): when `flags` equals `i`,
// launches kernel_name<T, i> with the `grid` / `threads` launch configuration
// on dev_ctx.stream(), forwarding the trailing arguments to the kernel.
// `T`, `grid`, `threads` and `dev_ctx` are not macro parameters; they must
// already be in scope at the point of expansion.
// NOTE(review): two spellings of the macro appear back to back here — one
// using the named variadic parameter `args...` and one using `...` with
// __VA_ARGS__. This looks like the before/after pair of a diff hunk; a real
// source file should keep only one `#define` line and one launch line.
#define CHECK_CASE(i, flags, kernel_name, args...) \
#define CHECK_CASE(i, flags, kernel_name, ...) \
if (i == flags) { \
kernel_name<T, i><<<grid, threads, 0, dev_ctx.stream()>>>(args); \
kernel_name<T, i><<<grid, threads, 0, dev_ctx.stream()>>>(__VA_ARGS__); \
}
// 0 for no scale, no bias
// 1 for has scale, no bias
// 2 for no scale, has bias
// 3 for has scale, has bias
// UNROLL_ALL_CASES(flags, kernel_name, <kernel args>): expands to one
// CHECK_CASE for each flag value 0, 1, 2 and 3, so the run-time `flags` value
// selects the kernel_name instantiation whose second template argument
// matches it. The trailing arguments are passed through to CHECK_CASE.
// NOTE(review): two spellings of the macro appear back to back here — one
// using the named variadic parameter `args...` and one using `...` with
// __VA_ARGS__. This looks like the before/after pair of a diff hunk; a real
// source file should keep only one of the two definitions.
#define UNROLL_ALL_CASES(flags, kernel_name, args...) \
CHECK_CASE(0, flags, kernel_name, args) \
CHECK_CASE(1, flags, kernel_name, args) \
CHECK_CASE(2, flags, kernel_name, args) \
CHECK_CASE(3, flags, kernel_name, args)
#define UNROLL_ALL_CASES(flags, kernel_name, ...) \
CHECK_CASE(0, flags, kernel_name, __VA_ARGS__) \
CHECK_CASE(1, flags, kernel_name, __VA_ARGS__) \
CHECK_CASE(2, flags, kernel_name, __VA_ARGS__) \
CHECK_CASE(3, flags, kernel_name, __VA_ARGS__)
template <typename T>
__device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
......
......@@ -305,7 +305,9 @@ class Executor(object):
# Constructor: records the target `place`, starts with an empty program cache,
# and marks the executor as not yet closed.
def __init__(self, place):
self.place = place
self.program_caches = dict()
# NOTE(review): the `self.executor = None` line and the three lines that
# follow it look like the removed and added sides of a diff hunk shown
# together — confirm which side is intended before reusing this as source.
self.executor = None
# Wrap self.place in a core.Place and create the core.Executor that is kept
# as self._default_executor.
p = core.Place()
p.set_place(self.place)
self._default_executor = core.Executor(p)
self._closed = False
def _get_program_cache(self, program_cache_key):
......@@ -397,12 +399,13 @@ class Executor(object):
>>> ...
>>> exe.close()
"""
if not self._closed and self.executor:
self.executor.close()
if not self._closed:
self._default_executor.close()
self._closed = True
def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name,
return_numpy):
exe = program._executor
if isinstance(feed, dict):
feed_tensor_dict = dict()
for feed_name in feed:
......@@ -414,8 +417,7 @@ class Executor(object):
feed_tensor.set(feed[feed_name], core.CPUPlace())
feed_tensor_dict[feed_name] = feed_tensor
self.executor.feed_and_split_tensor_into_local_scopes(
feed_tensor_dict)
exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict)
elif isinstance(feed, list) or isinstance(feed, tuple):
if len(feed) != len(program._places):
raise ValueError(
......@@ -436,10 +438,10 @@ class Executor(object):
tensor = tmp
res_dict[feed_name] = tensor
res.append(res_dict)
self.executor.feed_tensors_into_local_scopes(res)
exe.feed_tensors_into_local_scopes(res)
fetch_var_names = list(map(_to_name_str, fetch_list))
self.executor.run(fetch_var_names, fetch_var_name)
exe.run(fetch_var_names, fetch_var_name)
arr = scope.find_var(fetch_var_name).get_lod_tensor_array()
if return_numpy:
......@@ -511,12 +513,9 @@ class Executor(object):
compiled = isinstance(program, compiler.CompiledProgram)
# For backward compatibility, run directly.
if not compiled:
if not self.executor:
p = core.Place()
p.set_place(self.place)
self.executor = core.Executor(p)
return self._run(
program,
self._default_executor,
feed=feed,
fetch_list=fetch_list,
feed_var_name=feed_var_name,
......@@ -526,7 +525,6 @@ class Executor(object):
use_program_cache=use_program_cache)
program._compile(scope, self.place)
self.executor = program._executor
if program._is_data_parallel:
return self._run_parallel(
program,
......@@ -536,12 +534,13 @@ class Executor(object):
fetch_var_name=fetch_var_name,
return_numpy=return_numpy)
elif program._is_inference:
return self._run_inference(program, feed)
return self._run_inference(program._executor, feed)
else:
# TODO(panyx0718): Can compile program to optimize executor
# performance.
return self._run(
program._program,
self._default_executor,
feed=feed,
fetch_list=fetch_list,
feed_var_name=feed_var_name,
......@@ -550,8 +549,8 @@ class Executor(object):
return_numpy=return_numpy,
use_program_cache=use_program_cache)
def _run(self, program, feed, fetch_list, feed_var_name, fetch_var_name,
scope, return_numpy, use_program_cache):
def _run(self, program, exe, feed, fetch_list, feed_var_name,
fetch_var_name, scope, return_numpy, use_program_cache):
if feed is None:
feed = {}
......@@ -589,11 +588,11 @@ class Executor(object):
fetch_var_name=fetch_var_name)
self._feed_data(program, feed, feed_var_name, scope)
self.executor.run(program.desc, scope, 0, True, True)
exe.run(program.desc, scope, 0, True, True)
outs = self._fetch_data(fetch_list, fetch_var_name, scope)
if return_numpy:
outs = as_numpy(outs)
return outs
# Runs `feed` through an executor and returns whatever its run() call returns.
# NOTE(review): two definitions appear back to back. The first accepts a
# `program` argument it never uses and calls self.executor.run(feed); the
# second receives the executor itself as `exe` and calls exe.run(feed), which
# matches the call site that passes program._executor. This looks like the
# before/after pair of a diff hunk — keep only one definition in real source.
def _run_inference(self, program, feed):
return self.executor.run(feed)
def _run_inference(self, exe, feed):
return exe.run(feed)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册