Commit 4e91d8d2 authored by WangZhen

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into graph_quantization

test=develop
@@ -37,7 +37,7 @@ else()
         variable_response.cc
         collective_client.cc collective_server.cc
         ${BRPC_SRCS}
-        PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto
+        PROTO send_recv.proto
         DEPS lod_tensor selected_rows memory)
     set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib)
...
@@ -21,20 +21,20 @@ namespace operators {
 enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 };

-#define CHECK_CASE(i, flags, kernel_name, args...)                   \
-  if (i == flags) {                                                  \
-    kernel_name<T, i><<<grid, threads, 0, dev_ctx.stream()>>>(args); \
-  }
+#define CHECK_CASE(i, flags, kernel_name, ...)                              \
+  if (i == flags) {                                                         \
+    kernel_name<T, i><<<grid, threads, 0, dev_ctx.stream()>>>(__VA_ARGS__); \
+  }

 // 0 for no scale, no bias
 // 1 for has scale, no bias
 // 2 for no scale, has bias
 // 3 for has scale, has bias
-#define UNROLL_ALL_CASES(flags, kernel_name, args...) \
-  CHECK_CASE(0, flags, kernel_name, args)             \
-  CHECK_CASE(1, flags, kernel_name, args)             \
-  CHECK_CASE(2, flags, kernel_name, args)             \
-  CHECK_CASE(3, flags, kernel_name, args)
+#define UNROLL_ALL_CASES(flags, kernel_name, ...)  \
+  CHECK_CASE(0, flags, kernel_name, __VA_ARGS__)   \
+  CHECK_CASE(1, flags, kernel_name, __VA_ARGS__)   \
+  CHECK_CASE(2, flags, kernel_name, __VA_ARGS__)   \
+  CHECK_CASE(3, flags, kernel_name, __VA_ARGS__)

 template <typename T>
 __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
...
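Note: the `args...` named variadic macro parameter is a GNU extension, while `...` together with `__VA_ARGS__` has been standard since C99/C++11, so this rewrite makes the kernel-dispatch macros portable to compilers such as MSVC that reject the GNU form.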
@@ -305,7 +305,9 @@ class Executor(object):
     def __init__(self, place):
         self.place = place
         self.program_caches = dict()
-        self.executor = None
+        p = core.Place()
+        p.set_place(self.place)
+        self._default_executor = core.Executor(p)
         self._closed = False

     def _get_program_cache(self, program_cache_key):
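With this hunk, an `Executor` binds a C++ `core.Executor` eagerly at construction instead of lazily on the first `run()`. A minimal usage sketch, assuming the fluid 1.x API that the surrounding diff uses:

```python
import paddle.fluid as fluid

place = fluid.CPUPlace()     # or fluid.CUDAPlace(0) when built with CUDA
exe = fluid.Executor(place)  # _default_executor = core.Executor(p) runs here

# The executor is usable (and closeable) immediately, before any other call.
exe.run(fluid.default_startup_program())
```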
@@ -397,12 +399,13 @@ class Executor(object):
             >>> ...
             >>> exe.close()
         """
-        if not self._closed and self.executor:
-            self.executor.close()
+        if not self._closed:
+            self._default_executor.close()
             self._closed = True

     def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name,
                       return_numpy):
+        exe = program._executor
         if isinstance(feed, dict):
             feed_tensor_dict = dict()
             for feed_name in feed:
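Because `_default_executor` now always exists, `close()` no longer needs the `and self.executor` guard; the `_closed` flag alone prevents double-closing. A small sketch of the resulting behavior:

```python
import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())
exe.close()  # valid even though run() was never called
exe.close()  # no-op: self._closed is already True
```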
@@ -414,8 +417,7 @@ class Executor(object):
                     feed_tensor.set(feed[feed_name], core.CPUPlace())
                 feed_tensor_dict[feed_name] = feed_tensor

-            self.executor.feed_and_split_tensor_into_local_scopes(
-                feed_tensor_dict)
+            exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict)
         elif isinstance(feed, list) or isinstance(feed, tuple):
             if len(feed) != len(program._places):
                 raise ValueError(
@@ -436,10 +438,10 @@ class Executor(object):
                         tensor = tmp
                     res_dict[feed_name] = tensor
                 res.append(res_dict)
-            self.executor.feed_tensors_into_local_scopes(res)
+            exe.feed_tensors_into_local_scopes(res)

         fetch_var_names = list(map(_to_name_str, fetch_list))
-        self.executor.run(fetch_var_names, fetch_var_name)
+        exe.run(fetch_var_names, fetch_var_name)
         arr = scope.find_var(fetch_var_name).get_lod_tensor_array()

         if return_numpy:
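In the parallel path, the two feed forms map to the two helper calls above: a single dict is split across the devices in `program._places`, while a list or tuple must supply one dict per device. A hypothetical two-place example (the names `x` and `compiled_prog` are illustrative, not from the diff):

```python
import numpy as np

# One dict for the whole batch: split across local scopes by
# exe.feed_and_split_tensor_into_local_scopes().
whole_batch = {"x": np.random.rand(8, 4).astype("float32")}

# One dict per place: passed through exe.feed_tensors_into_local_scopes().
per_place = [
    {"x": np.random.rand(4, 4).astype("float32")},  # device 0
    {"x": np.random.rand(4, 4).astype("float32")},  # device 1
]

# exe.run(compiled_prog, feed=whole_batch, fetch_list=fetches)
# exe.run(compiled_prog, feed=per_place, fetch_list=fetches)
```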
@@ -511,12 +513,9 @@ class Executor(object):
         compiled = isinstance(program, compiler.CompiledProgram)
         # For backward compatibility, run directly.
         if not compiled:
-            if not self.executor:
-                p = core.Place()
-                p.set_place(self.place)
-                self.executor = core.Executor(p)
             return self._run(
                 program,
+                self._default_executor,
                 feed=feed,
                 fetch_list=fetch_list,
                 feed_var_name=feed_var_name,
@@ -526,7 +525,6 @@ class Executor(object):
                 use_program_cache=use_program_cache)

         program._compile(scope, self.place)
-        self.executor = program._executor
         if program._is_data_parallel:
             return self._run_parallel(
                 program,
@@ -536,12 +534,13 @@ class Executor(object):
                 fetch_var_name=fetch_var_name,
                 return_numpy=return_numpy)
         elif program._is_inference:
-            return self._run_inference(program, feed)
+            return self._run_inference(program._executor, feed)
         else:
             # TODO(panyx0718): Can compile program to optimize executor
             # performance.
             return self._run(
                 program._program,
+                self._default_executor,
                 feed=feed,
                 fetch_list=fetch_list,
                 feed_var_name=feed_var_name,
@@ -550,8 +549,8 @@ class Executor(object):
                 return_numpy=return_numpy,
                 use_program_cache=use_program_cache)

-    def _run(self, program, feed, fetch_list, feed_var_name, fetch_var_name,
-             scope, return_numpy, use_program_cache):
+    def _run(self, program, exe, feed, fetch_list, feed_var_name,
+             fetch_var_name, scope, return_numpy, use_program_cache):
         if feed is None:
             feed = {}
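Taken together, these hunks change `run()` from lazily creating and caching a core executor on `self.executor` to choosing one per call: plain `Program`s go through the always-present `self._default_executor`, while a `CompiledProgram` supplies its own `program._executor` to the parallel and inference branches, and `_run()` now takes the executor as an explicit `exe` parameter. A hedged sketch of the dispatch from the caller's side (`feed` and `fetches` are placeholders):

```python
import paddle.fluid as fluid
from paddle.fluid import compiler

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

# Uncompiled Program: served by self._default_executor via _run().
# out = exe.run(fluid.default_main_program(), feed=feed, fetch_list=fetches)

# CompiledProgram: program._compile() builds program._executor, which the
# data-parallel branch (_run_parallel) or inference branch then uses.
# compiled = compiler.CompiledProgram(fluid.default_main_program())
# out = exe.run(compiled, feed=feed, fetch_list=fetches)
```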
@@ -589,11 +588,11 @@ class Executor(object):
                 fetch_var_name=fetch_var_name)

         self._feed_data(program, feed, feed_var_name, scope)
-        self.executor.run(program.desc, scope, 0, True, True)
+        exe.run(program.desc, scope, 0, True, True)
         outs = self._fetch_data(fetch_list, fetch_var_name, scope)
         if return_numpy:
             outs = as_numpy(outs)
         return outs

-    def _run_inference(self, program, feed):
-        return self.executor.run(feed)
+    def _run_inference(self, exe, feed):
+        return exe.run(feed)
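The net effect is that `Executor` no longer mutates a shared `self.executor` attribute: each call receives the concrete core executor it should use, so a compiled run can no longer leave `program._executor` behind for a later plain `run()` to pick up. A hypothetical sequence that appears unsafe under the old code and becomes self-contained here (`compiled_prog` and `plain_prog` are illustrative):

```python
import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())

# Old code: this line set self.executor = compiled_prog._executor ...
# out1 = exe.run(compiled_prog, feed=feed, fetch_list=fetches)

# ... so this plain run could reuse the parallel executor by mistake.
# New code: it always runs on self._default_executor instead.
# out2 = exe.run(plain_prog, feed=feed, fetch_list=fetches)
```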