Commit 17f44384 authored by D dangqingqing

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into imdb_data

...@@ -16,8 +16,6 @@ cmake_minimum_required(VERSION 3.0)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
-SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
-SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 include(system)
...@@ -201,6 +199,10 @@ if(WITH_GOLANG)
 endif(WITH_GOLANG)
 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
+SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 add_subdirectory(paddle)
 if(WITH_PYTHON)
   add_subdirectory(python)
......
...@@ -6,8 +6,18 @@ height = 227
 width = 227
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
+gp = get_config_arg('layer_num', int, 1)
+is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer,
+    'num_samples': num_samples
+}
 define_py_data_sources2(
     "train.list", None, module="provider", obj="process", args=args)
...@@ -31,7 +41,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2)
 # conv2
 net = img_conv_layer(
-    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
+    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
 net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
 net = img_pool_layer(input=net, pool_size=3, stride=2)
...@@ -40,11 +50,11 @@ net = img_conv_layer(
     input=net, filter_size=3, num_filters=384, stride=1, padding=1)
 # conv4
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)
 # conv5
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
 net = img_pool_layer(input=net, pool_size=3, stride=2)
 net = fc_layer(
...@@ -59,6 +69,9 @@ net = fc_layer(
     layer_attr=ExtraAttr(drop_rate=0.5))
 net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
-lab = data_layer('label', num_class)
-loss = cross_entropy(input=net, label=lab)
-outputs(loss)
+if is_infer:
+    outputs(net)
+else:
+    lab = data_layer('label', num_class)
+    loss = cross_entropy(input=net, label=lab)
+    outputs(loss)
...@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
 use_gpu = get_config_arg('use_gpu', bool, True)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
 args = {
     'height': height,
     'width': width,
     'color': True,
     'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
     "train.list" if not is_infer else None,
......
...@@ -14,6 +14,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
     else:
         settings.data_size = settings.height * settings.width
     settings.is_infer = kwargs.get('is_infer', False)
+    settings.num_samples = kwargs.get('num_samples', 2560)
     if settings.is_infer:
         settings.slots = [dense_vector(settings.data_size)]
     else:
...@@ -23,7 +24,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
 @provider(
     init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_list):
-    for i in xrange(2560 if settings.is_infer else 1024):
+    for i in xrange(settings.num_samples):
         img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
         if settings.is_infer:
             yield img.astype('float32')
......
...@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg("layer_num", int, 50)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
 args = {
     'height': height,
     'width': width,
     'color': True,
     'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
     "train.list" if not is_infer else None,
......
...@@ -37,7 +37,7 @@ function infer() {
       --trainer_count=1 \
       --num_passes=1 \
       --save_dir="models/${topology}-${layer_num}" \
-      --config_args="batch_size=128,layer_num=${layer_num}" \
+      --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
       > /dev/null 2>&1
     echo "Done"
   fi
...@@ -79,8 +79,9 @@ fi
 # inference benchmark
 for use_mkldnn in True False; do
   for batchsize in 1 2 4 8 16; do
-    infer googlenet v1 $batchsize $use_mkldnn
-    infer resnet 50 $batchsize $use_mkldnn
     infer vgg 19 $batchsize $use_mkldnn
+    infer resnet 50 $batchsize $use_mkldnn
+    infer googlenet v1 $batchsize $use_mkldnn
+    infer alexnet 2 $batchsize $use_mkldnn
   done
 done
...@@ -47,5 +47,6 @@ for use_mkldnn in True False; do
     train vgg 19 $batchsize $use_mkldnn
     train resnet 50 $batchsize $use_mkldnn
     train googlenet v1 $batchsize $use_mkldnn
+    train alexnet 2 $batchsize $use_mkldnn
   done
 done
...@@ -23,24 +23,25 @@ function infer() {
     echo "./run_mkl_infer.sh to save the model first"
     exit 0
   fi
-  log_period=$((256 / bs))
+  log_period=$((32 / bs))
   paddle train --job=test \
     --config="${topology}.py" \
+    --use_mkldnn=False \
     --use_gpu=False \
     --trainer_count=$thread \
     --log_period=$log_period \
-    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
+    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
     --init_model_path=$models_in \
     2>&1 | tee ${log}
-  # calculate the last 5 logs period time of 1280 samples,
+  # calculate the last 5 logs period time of 160(=32*5) samples,
   # the time before are burning time.
   start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
   end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
   start_sec=`clock_to_seconds $start`
   end_sec=`clock_to_seconds $end`
-  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
-  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
+  fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
+  echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
   echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
 }
...@@ -56,7 +57,8 @@ fi
 # inference benchmark
 for batchsize in 1 2 4 8 16; do
-  infer googlenet v1 $batchsize
-  infer resnet 50 $batchsize
   infer vgg 19 $batchsize
+  infer resnet 50 $batchsize
+  infer googlenet v1 $batchsize
+  infer alexnet 2 $batchsize
 done
...@@ -12,10 +12,11 @@ function train() {
   config="${topology}.py"
   paddle train --job=time \
     --config=$config \
+    --use_mkldnn=False \
     --use_gpu=False \
     --trainer_count=$thread \
-    --log_period=10 \
+    --log_period=3 \
-    --test_period=100 \
+    --test_period=30 \
     --config_args=$args \
     2>&1 | tee ${log}
...@@ -36,4 +37,5 @@ for batchsize in 64 128 256; do
   train vgg 19 $batchsize
   train resnet 50 $batchsize
   train googlenet v1 $batchsize
+  train alexnet 2 $batchsize
 done
...@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg('layer_num', int, 19)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
 args = {
     'height': height,
     'width': width,
     'color': True,
     'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
     "train.list" if not is_infer else None,
......
...@@ -170,6 +170,18 @@ sequence_pool
     :noindex:
+
+sequence_first_step
+-------------------
+.. autofunction:: paddle.v2.fluid.layers.sequence_first_step
+    :noindex:
+
+sequence_last_step
+------------------
+.. autofunction:: paddle.v2.fluid.layers.sequence_last_step
+    :noindex:
+
 pool2d
 ------
 .. autofunction:: paddle.v2.fluid.layers.pool2d
...@@ -318,3 +330,9 @@ reduce_sum
 .. autofunction:: paddle.v2.fluid.layers.reduce_sum
     :noindex:
+
+reduce_mean
+-----------
+.. autofunction:: paddle.v2.fluid.layers.reduce_mean
+    :noindex:
...@@ -291,10 +291,10 @@ public: ...@@ -291,10 +291,10 @@ public:
} }
void Run(const framework::Scope& scope, void Run(const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) const override { const platform::Place& place) const override {
PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first."); PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
for (auto& op : runtime_table_.ops()) { for (auto& op : runtime_table_.ops()) {
op->Run(scope, dev_ctx); op->Run(scope, place);
} }
} }
......
...@@ -25,13 +25,14 @@ There are mainly three parts that we have to consider while integrating a new de
 ### Place and DeviceContext
+Please note that device and computing library do not correspond one to one. A device can have many computing libraries, and a computing library can also support several devices.
 #### Place
-Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent different devices and computing libraries. There are inheritance relationships between different kinds of `Place`.
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
 ```
-        | CPUPlace --> MKLDNNPlace
-Place --| CUDAPlace --> CUDNNPlace
+        | CPUPlace
+Place --| CUDAPlace
         | FPGAPlace
 ```
...@@ -43,7 +44,7 @@ typedef boost::variant<CUDAPlace, CPUPlace, FPGAPlace> Place;
 #### DeviceContext
-Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources of different hardware, such as the CUDA stream in `CUDADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
+Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources of different libraries, such as the CUDA stream in `CUDADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
 ```
...@@ -106,7 +107,7 @@ template <typename Place>
 size_t Used(Place place);
 ```
-To implementing these interfaces, we have to implement MemoryAllocator for different Devices
+To implement these interfaces, we have to implement a MemoryAllocator for the different devices.
 #### Tensor
...@@ -243,6 +244,7 @@ REGISTER_OP_CUDA_KERNEL(
 Generally, we will implement an OpKernel for every Device/Library of an Operator. We can easily train a Convolutional Neural Network on GPU. However, some OpKernels are not suitable for a specific device. For example, the crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstances, we have to switch between different Device/Library combinations.
 We will discuss how to implement an efficient OpKernel switch policy. For more details, please refer to the following docs:
-- TBD
+- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md)
+- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md)
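The two hooks this commit adds to `OperatorWithKernel` (`GetActualKernelType` and `GetExpectedKernelType`, see the `paddle/framework/operator.h` changes further below) are the concrete entry points for such a switch policy. The following is only a rough sketch of how a CPU-only operator could use them; the class name is hypothetical, and `InferShape` plus kernel registration are omitted.

```cpp
// Hypothetical sketch: pin a CPU-only operator (e.g. a crf-like op) to CPUPlace
// by overriding GetExpectedKernelType(). The data type and layout of the actual
// kernel type are kept; only the place is replaced.
#include "paddle/framework/op_kernel_type.h"
#include "paddle/framework/operator.h"

namespace paddle {
namespace framework {

class CpuOnlyOpSketch : public OperatorWithKernel {
 public:
  using OperatorWithKernel::OperatorWithKernel;

 protected:
  OpKernelType GetExpectedKernelType(
      const OpKernelType& actual_kernel_type) const override {
    return OpKernelType(actual_kernel_type.data_type_, platform::CPUPlace(),
                        actual_kernel_type.data_layout_,
                        actual_kernel_type.library_type_);
  }
  // InferShape() and the CPU kernel registration are omitted for brevity.
};

}  // namespace framework
}  // namespace paddle
```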
...@@ -70,13 +70,13 @@ Building PaddlePaddle requires the dependencies below (including but not limited to), ...
 :header: "Dependency", "Version", "Description"
 :widths: 10, 15, 30
-"CMake", ">=3.5", ""
+"CMake", ">=3.2", ""
 "GCC", "4.8.2", "devtools2 on CentOS is recommended"
 "Python", "2.7.x", "requires libpython2.7.so"
 "pip", ">=9.0", ""
 "numpy", "", ""
 "SWIG", ">=2.0", ""
 "Go", ">=1.8", "optional"
 .. _build_options:
......
...@@ -76,13 +76,13 @@ will be downloaded automatically.
 :header: "Dependency", "Version", "Description"
 :widths: 10, 15, 30
-"CMake", ">=3.5", ""
+"CMake", ">=3.2", ""
 "GCC", "4.8.2", "Recommend devtools2 for CentOS"
 "Python", "2.7.x", "Need libpython2.7.so"
 "pip", ">=9.0", ""
 "numpy", "", ""
 "SWIG", ">=2.0", ""
 "Go", ">=1.8", "Optional"
 .. _build_options:
......
...@@ -30,7 +30,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) ...@@ -30,7 +30,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
...@@ -59,5 +59,8 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry ...@@ -59,5 +59,8 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
cc_library(init SRCS init.cc DEPS gflags executor place stringpiece) cc_test(threadpool_test SRCS threadpool_test.cc)
cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece)
cc_test(init_test SRCS init_test.cc DEPS init) cc_test(init_test SRCS init_test.cc DEPS init)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context)
...@@ -90,6 +90,21 @@ OpDesc *BlockDesc::PrependOp() { ...@@ -90,6 +90,21 @@ OpDesc *BlockDesc::PrependOp() {
return ops_.front().get(); return ops_.front().get();
} }
void BlockDesc::RemoveOp(size_t s, size_t e) {
if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
return;
}
need_update_ = true;
for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) {
auto names = (*it)->InputArgumentNames();
for (auto n : names) {
// TODO(typhoonzero): delete vars if no other op use it.
VLOG(3) << "deleting var " << n;
}
}
ops_.erase(ops_.begin() + s, ops_.begin() + e);
}
std::vector<OpDesc *> BlockDesc::AllOps() const { std::vector<OpDesc *> BlockDesc::AllOps() const {
std::vector<OpDesc *> res; std::vector<OpDesc *> res;
for (const auto &op : ops_) { for (const auto &op : ops_) {
......
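As a quick illustration of the new `BlockDesc::RemoveOp` interface above, the range is half-open, so `[s, e)` erases the ops at indices `s` through `e-1`. The helper below is a hypothetical sketch, not part of this commit.

```cpp
// Sketch only: erase the first two ops of a block. Variables that were used
// only by the removed ops are not deleted yet (see the TODO in RemoveOp).
#include "paddle/framework/block_desc.h"

void DropFirstTwoOps(paddle::framework::BlockDesc* block) {
  if (block->OpSize() >= 2) {
    block->RemoveOp(0, 2);  // removes ops at indices 0 and 1
  }
}
```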
...@@ -79,6 +79,8 @@ class BlockDesc { ...@@ -79,6 +79,8 @@ class BlockDesc {
OpDesc *PrependOp(); OpDesc *PrependOp();
void RemoveOp(size_t s, size_t e);
std::vector<OpDesc *> AllOps() const; std::vector<OpDesc *> AllOps() const;
size_t OpSize() const { return ops_.size(); } size_t OpSize() const { return ops_.size(); }
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include "paddle/platform/enforce.h"
namespace paddle {
namespace framework {
enum DataLayout {
kNHWC = 0,
kNCHW = 1,
kAnyLayout = 2,
};
inline DataLayout StringToDataLayout(const std::string& str) {
if (str == "NHWC" || str == "nhwc") {
return DataLayout::kNHWC;
} else if (str == "NCHW" || str == "nchw") {
return DataLayout::kNCHW;
} else {
PADDLE_THROW("Unknown storage order string: %s", str);
}
}
inline std::string DataLayoutToString(const DataLayout& data_layout) {
switch (data_layout) {
case kNHWC:
return "NHWC";
case kNCHW:
return "NCHW";
case kAnyLayout:
return "ANY_LAYOUT";
default:
PADDLE_THROW("unknown DataLayou %d", data_layout);
}
}
inline std::ostream& operator<<(std::ostream& out, DataLayout l) {
out << DataLayoutToString(l);
return out;
}
} // namespace framework
} // namespace paddle
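A small usage sketch for the `DataLayout` helpers defined above; this example is illustrative and not taken from the commit.

```cpp
#include <iostream>
#include "paddle/framework/data_layout.h"

void DataLayoutExample() {
  using paddle::framework::DataLayout;
  // Parse an attribute string into the enum; unknown strings throw.
  DataLayout layout = paddle::framework::StringToDataLayout("NCHW");
  // operator<< uses DataLayoutToString(), so this prints "NCHW".
  std::cout << layout << std::endl;
}
```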
...@@ -33,13 +33,7 @@ namespace framework { ...@@ -33,13 +33,7 @@ namespace framework {
const std::string kFeedOpType = "feed"; const std::string kFeedOpType = "feed";
const std::string kFetchOpType = "fetch"; const std::string kFetchOpType = "fetch";
DeviceContextPool* DeviceContextPool::pool = nullptr; Executor::Executor(const platform::Place& place) : place_(place) {}
Executor::Executor(const std::vector<platform::Place>& places) {
DeviceContextPool& pool = DeviceContextPool::Get();
auto borrowed_contexts = pool.Borrow(places);
device_contexts_.swap(borrowed_contexts);
}
static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
if (var_type == proto::VarDesc::LOD_TENSOR) { if (var_type == proto::VarDesc::LOD_TENSOR) {
...@@ -65,47 +59,48 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { ...@@ -65,47 +59,48 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
} }
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope) { bool create_local_scope, bool create_vars) {
// TODO(tonyyang-svail): // TODO(tonyyang-svail):
// - only runs on the first device (i.e. no interdevice communication) // - only runs on the first device (i.e. no interdevice communication)
// - will change to use multiple blocks for RNN op and Cond Op // - will change to use multiple blocks for RNN op and Cond Op
PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), pdesc.Size()); PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), pdesc.Size());
auto& block = pdesc.Block(block_id); auto& block = pdesc.Block(block_id);
auto& device = device_contexts_[0];
Scope* local_scope = scope; Scope* local_scope = scope;
if (create_local_scope) { if (create_vars) {
local_scope = &scope->NewScope(); if (create_local_scope) {
for (auto& var : block.AllVars()) { local_scope = &scope->NewScope();
if (var->Name() == framework::kEmptyVarName) { for (auto& var : block.AllVars()) {
continue; if (var->Name() == framework::kEmptyVarName) {
continue;
}
if (var->Persistable()) {
auto* ptr = scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else {
auto* ptr = local_scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
} }
} else {
if (var->Persistable()) { for (auto& var : block.AllVars()) {
auto* ptr = scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else {
auto* ptr = local_scope->Var(var->Name()); auto* ptr = local_scope->Var(var->Name());
CreateTensor(ptr, var->GetType()); CreateTensor(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name() VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< " locally, which pointer is " << ptr; << ptr;
} }
} } // if (create_local_scope)
} else { } // if (create_vars)
for (auto& var : block.AllVars()) {
auto* ptr = local_scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
}
}
for (auto& op_desc : block.AllOps()) { for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(3) << op->DebugString(); VLOG(3) << op->DebugString();
op->Run(*local_scope, *device); op->Run(*local_scope, place_);
} }
if (create_local_scope) { if (create_local_scope) {
scope->DeleteScope(local_scope); scope->DeleteScope(local_scope);
......
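A rough driver-side sketch of the reworked `Executor` API (hypothetical code, not from this commit): the executor is now constructed from a single `Place`, the `DeviceContextPool` is populated by `InitDevices`, and `Run` takes the new `create_vars` flag in addition to `create_local_scope`.

```cpp
#include "paddle/framework/executor.h"
#include "paddle/framework/init.h"
#include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h"
#include "paddle/platform/place.h"

void RunProgramOnCpu(const paddle::framework::ProgramDesc& program) {
  // Populate the DeviceContextPool; "CPU" follows the format parsed in init.cc.
  paddle::framework::InitDevices({"CPU"});

  paddle::framework::Scope scope;
  paddle::framework::Executor executor(paddle::platform::CPUPlace());

  // Run block 0 in a fresh local scope and create the block's variables in it.
  executor.Run(program, &scope, /*block_id=*/0,
               /*create_local_scope=*/true, /*create_vars=*/true);
}
```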
...@@ -14,9 +14,6 @@ limitations under the License. */ ...@@ -14,9 +14,6 @@ limitations under the License. */
#pragma once #pragma once
#include <map>
#include <unordered_map>
#include "paddle/framework/op_info.h" #include "paddle/framework/op_info.h"
#include "paddle/framework/program_desc.h" #include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h" #include "paddle/framework/scope.h"
...@@ -26,86 +23,13 @@ limitations under the License. */ ...@@ -26,86 +23,13 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class DeviceContextPool {
public:
static DeviceContextPool& Get() {
PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
return *pool;
}
static DeviceContextPool& Create(const std::vector<platform::Place>& places) {
if (pool == nullptr) {
pool = new DeviceContextPool(places);
}
return *pool;
}
std::vector<const platform::DeviceContext*> Borrow(
const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0);
PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
std::vector<const platform::DeviceContext*> borrowed_contexts;
for (auto& place : places) {
auto range = device_contexts_.equal_range(place);
if (range.first == range.second) {
PADDLE_THROW(
"'Place' is not supported, Please re-compile with WITH_GPU "
"option");
}
// TODO(dzhwinter) : assign the first found device. Will enhanced later.
// device load balancer maybe useful here.
borrowed_contexts.emplace_back(range.first->second);
}
return borrowed_contexts;
}
explicit DeviceContextPool(const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0);
for (size_t i = 0; i < places.size(); i++) {
if (platform::is_cpu_place(places[i])) {
device_contexts_.emplace(
places[i], new platform::CPUDeviceContext(
boost::get<platform::CPUPlace>(places[i])));
} else if (platform::is_gpu_place(places[i])) {
#ifdef PADDLE_WITH_CUDA
device_contexts_.emplace(
places[i], new platform::CUDADeviceContext(
boost::get<platform::GPUPlace>(places[i])));
#else
PADDLE_THROW(
"'GPUPlace' is not supported, Please re-compile with WITH_GPU "
"option");
#endif
}
}
}
~DeviceContextPool() {}
private:
static DeviceContextPool* pool;
struct Hash {
std::hash<int> hash_;
size_t operator()(const platform::Place& place) const {
return hash_(place.which());
}
};
std::unordered_multimap<const platform::Place, const platform::DeviceContext*,
Hash>
device_contexts_;
DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
};
class Executor { class Executor {
public: public:
// TODO(dzhwinter) : Do not rely on this function, it will be removed // TODO(dzhwinter) : Do not rely on this function, it will be removed
explicit Executor(const platform::DeviceContext& device) explicit Executor(const platform::DeviceContext& device)
: Executor(std::vector<platform::Place>({device.GetPlace()})) {} : Executor(device.GetPlace()) {}
explicit Executor(const platform::Place& place)
: Executor(std::vector<platform::Place>({place})) {}
explicit Executor(const std::vector<platform::Place>& places); explicit Executor(const platform::Place& place);
/* @Brief /* @Brief
* Runtime evaluation of the given ProgramDesc under certain Scope * Runtime evaluation of the given ProgramDesc under certain Scope
...@@ -114,10 +38,11 @@ class Executor { ...@@ -114,10 +38,11 @@ class Executor {
* ProgramDesc * ProgramDesc
* Scope * Scope
*/ */
void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true); void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true,
bool create_vars = true);
private: private:
std::vector<const platform::DeviceContext*> device_contexts_; const platform::Place place_;
}; };
} // namespace framework } // namespace framework
......
...@@ -22,6 +22,14 @@ ...@@ -22,6 +22,14 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
/*
This functor class is responsible for creating the gradient ops for the given
operator fwd_op. After it is called (through operator()), the pairs of
(gradient variable, corresponding input variable of fwd_op) will be added to
grad_to_var. If an input variable of fwd_op is contained in no_grad_set, its
gradient variable will either be ignored or replaced with kEmptyVarName,
depending on the template argument DropEmptyIG in the derived classes.
*/
class GradOpDescMakerBase { class GradOpDescMakerBase {
public: public:
explicit GradOpDescMakerBase( explicit GradOpDescMakerBase(
...@@ -56,6 +64,16 @@ class GradOpDescMakerBase { ...@@ -56,6 +64,16 @@ class GradOpDescMakerBase {
if (!drop_empty_grad) { if (!drop_empty_grad) {
return ret_val; return ret_val;
} }
PADDLE_ENFORCE_LE(var_names.size(), 1UL,
"BUG from operator developer:"
" for input argument with a list of variables, "
" drop_empty_grad is not allowed because it makes"
" the correspondence bewteen a variable and its gradient"
" ambiguous. Use REGISTER_OP_EX to register the op"
" or call InputGrad(?,false) in GradOpDescMaker."
" Op type %s",
fwd_op_.Type());
std::vector<std::string> dropped_ret_val; std::vector<std::string> dropped_ret_val;
dropped_ret_val.reserve(ret_val.size()); dropped_ret_val.reserve(ret_val.size());
std::copy_if(ret_val.begin(), ret_val.end(), std::copy_if(ret_val.begin(), ret_val.end(),
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include <algorithm> #include <algorithm>
#include <string> #include <string>
#include "paddle/framework/executor.h"
#include "paddle/framework/init.h" #include "paddle/framework/init.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/place.h" #include "paddle/platform/place.h"
#include "paddle/string/piece.h" #include "paddle/string/piece.h"
...@@ -48,13 +48,13 @@ bool InitDevices(const std::vector<std::string> &devices) { ...@@ -48,13 +48,13 @@ bool InitDevices(const std::vector<std::string> &devices) {
std::vector<platform::Place> places; std::vector<platform::Place> places;
for (auto &device : devices) { for (auto &device : devices) {
auto p = string::Piece(device); auto p = string::Piece(device);
if (string::Find(p, ':', 0) == string::Piece::npos) { if (string::HasPrefix(p, "CPU")) {
places.emplace_back(platform::CPUPlace()); places.emplace_back(platform::CPUPlace());
} else if (string::HasPrefix(p, "GPU")) { } else if (string::HasPrefix(p, "GPU")) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto pos = string::RFind(p, ':', string::Piece::npos); auto pos = string::RFind(p, ':', string::Piece::npos);
auto number = device.substr(pos + 1); auto number = device.substr(pos + 1);
places.emplace_back(platform::GPUPlace(std::stoi(number))); places.emplace_back(platform::CUDAPlace(std::stoi(number)));
#else #else
LOG(WARNING) LOG(WARNING)
<< "'GPU' is not supported, Please re-compile with WITH_GPU option"; << "'GPU' is not supported, Please re-compile with WITH_GPU option";
...@@ -69,10 +69,9 @@ bool InitDevices(const std::vector<std::string> &devices) { ...@@ -69,10 +69,9 @@ bool InitDevices(const std::vector<std::string> &devices) {
return platform::is_cpu_place(place); return platform::is_cpu_place(place);
}) == places.end()) { }) == places.end()) {
places.emplace_back(platform::CPUPlace()); places.emplace_back(platform::CPUPlace());
LOG(WARNING) << "Not specified any device, use CPU by Default."; LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
} }
DeviceContextPool::Create(places); platform::DeviceContextPool::Create(places);
return true;
return true; return true;
} }
......
...@@ -23,5 +23,9 @@ TEST(Init, InitDevices) { ...@@ -23,5 +23,9 @@ TEST(Init, InitDevices) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
std::vector<std::string> ds2 = {"CPU", "GPU:0", "GPU:1"}; std::vector<std::string> ds2 = {"CPU", "GPU:0", "GPU:1"};
ASSERT_EQ(InitDevices(ds2), true); ASSERT_EQ(InitDevices(ds2), true);
// test re-init
std::vector<std::string> ds3 = {"GPU:0", "GPU:1"};
ASSERT_EQ(InitDevices(ds3), true);
#endif #endif
} }
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
namespace framework {
// For more details about the design of LibraryType, Please refer to
// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library
enum LibraryType { kPlain = 0, kMKLDNN = 1, kCUDNN = 2 };
inline std::string LibraryTypeToString(const LibraryType& library_type) {
switch (library_type) {
case kPlain:
return "PLAIN";
case kMKLDNN:
return "MKLDNN";
case kCUDNN:
return "CUDNN";
default:
PADDLE_THROW("unknown LibraryType %d", library_type);
}
}
inline std::ostream& operator<<(std::ostream& out, LibraryType l) {
out << LibraryTypeToString(l);
return out;
}
} // namespace
} // framework
...@@ -224,7 +224,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor, ...@@ -224,7 +224,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
while (size != 0) { while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size)); size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(), memory::Copy(cpu, buf.get(),
boost::get<platform::GPUPlace>(tensor.place()), boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void *>(data), size_to_write, reinterpret_cast<const void *>(data), size_to_write,
gpu_dev_ctx.stream()); gpu_dev_ctx.stream());
gpu_dev_ctx.Wait(); gpu_dev_ctx.Wait();
......
...@@ -27,7 +27,7 @@ __global__ void test(size_t* a, int size) { ...@@ -27,7 +27,7 @@ __global__ void test(size_t* a, int size) {
TEST(LoDTensor, LoDInGPU) { TEST(LoDTensor, LoDInGPU) {
paddle::framework::LoDTensor lod_tensor; paddle::framework::LoDTensor lod_tensor;
paddle::platform::GPUPlace place(0); paddle::platform::CUDAPlace place(0);
paddle::framework::LoD src_lod; paddle::framework::LoD src_lod;
src_lod.push_back(std::vector<size_t>{0, 2, 4, 6, 8, 10, 12, 14}); src_lod.push_back(std::vector<size_t>{0, 2, 4, 6, 8, 10, 12, 14});
......
...@@ -127,7 +127,9 @@ class OpDesc { ...@@ -127,7 +127,9 @@ class OpDesc {
} }
proto::OpDesc desc_; proto::OpDesc desc_;
// input arg name => output variable names
VariableNameMap inputs_; VariableNameMap inputs_;
// output arg name => output variable names
VariableNameMap outputs_; VariableNameMap outputs_;
AttributeMap attrs_; AttributeMap attrs_;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/data_layout.h"
#include "paddle/framework/data_type.h"
#include "paddle/framework/library_type.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace framework {
struct OpKernelType {
struct Hash {
size_t operator()(const OpKernelType& key) const {
int place = key.place_.which() + (1 << LEFT_SHIFT);
int data_type =
static_cast<int>(key.data_type_) + (1 << (LEFT_SHIFT + 1));
int data_layout =
static_cast<int>(key.data_layout_) + (1 << (LEFT_SHIFT + 2));
int library_type =
static_cast<int>(key.library_type_) + (1 << (LEFT_SHIFT + 3));
std::hash<int> hasher;
return hasher(place + data_type + data_layout + library_type);
}
};
// place, data_type, library_type kinds less than 2^8
constexpr static int LEFT_SHIFT = 8;
proto::DataType data_type_;
DataLayout data_layout_;
platform::Place place_;
LibraryType library_type_;
OpKernelType(proto::DataType data_type, platform::Place place,
DataLayout data_layout = DataLayout::kAnyLayout,
LibraryType library_type = LibraryType::kPlain)
: data_type_(data_type),
data_layout_(data_layout),
place_(place),
library_type_(library_type) {}
OpKernelType(proto::DataType data_type,
const platform::DeviceContext& dev_ctx,
DataLayout data_layout = DataLayout::kAnyLayout,
LibraryType library_type = LibraryType::kPlain)
: data_type_(data_type),
data_layout_(data_layout),
place_(dev_ctx.GetPlace()),
library_type_(library_type) {}
bool operator==(const OpKernelType& o) const {
return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
library_type_ == o.library_type_;
}
};
inline std::ostream& operator<<(std::ostream& os,
const OpKernelType& kernel_key) {
os << "data_type[" << kernel_key.data_type_ << "]:data_layout["
<< kernel_key.data_layout_ << "]:place[" << kernel_key.place_
<< "]:library_type[" << kernel_key.library_type_ << "]";
return os;
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_kernel_type.h"
#include <gtest/gtest.h>
#include <iostream>
TEST(OpKernelType, ToString) {
using OpKernelType = paddle::framework::OpKernelType;
using DataType = paddle::framework::proto::DataType;
using CPUPlace = paddle::platform::CPUPlace;
using DataLayout = paddle::framework::DataLayout;
using LibraryType = paddle::framework::LibraryType;
OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
LibraryType::kCUDNN);
std::ostringstream stream;
stream << op_kernel_type;
ASSERT_EQ(
stream.str(),
"data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]");
}
TEST(OpKernelType, Hash) {
using OpKernelType = paddle::framework::OpKernelType;
using DataType = paddle::framework::proto::DataType;
using CPUPlace = paddle::platform::CPUPlace;
using CUDAPlace = paddle::platform::CUDAPlace;
using DataLayout = paddle::framework::DataLayout;
using LibraryType = paddle::framework::LibraryType;
OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
LibraryType::kCUDNN);
OpKernelType op_kernel_type_2(DataType::FP32, CUDAPlace(0), DataLayout::kNCHW,
LibraryType::kCUDNN);
OpKernelType::Hash hasher;
ASSERT_NE(hasher(op_kernel_type_1), hasher(op_kernel_type_2));
}
\ No newline at end of file
...@@ -61,17 +61,6 @@ struct OperatorRegistrar : public Registrar { ...@@ -61,17 +61,6 @@ struct OperatorRegistrar : public Registrar {
class OpRegistry { class OpRegistry {
public: public:
template <typename OpType, typename ProtoMakerType, typename GradOpType>
static void RegisterOp(const std::string& op_type,
const std::string& grad_op_type) {
OperatorRegistrar<OpType, ProtoMakerType> reg(op_type.c_str());
reg.info.grad_op_type_ = grad_op_type;
// register gradient op
if (!grad_op_type.empty()) {
OperatorRegistrar<GradOpType> grad_reg(grad_op_type.c_str());
}
}
static std::unique_ptr<OperatorBase> CreateOp(const std::string& type, static std::unique_ptr<OperatorBase> CreateOp(const std::string& type,
const VariableNameMap& inputs, const VariableNameMap& inputs,
const VariableNameMap& outputs, const VariableNameMap& outputs,
...@@ -126,6 +115,14 @@ class OpKernelRegistrar : public Registrar { ...@@ -126,6 +115,14 @@ class OpKernelRegistrar : public Registrar {
__test_global_namespace_##uniq_name##__>::value, \ __test_global_namespace_##uniq_name##__>::value, \
msg) msg)
/*
The variadic arguments should be class types derived from one of the
following classes:
OpProtoAndCheckerMaker
GradOpDescMakerBase
VarTypeInference
InferShapeBase
*/
#define REGISTER_OPERATOR(op_type, op_class, ...) \ #define REGISTER_OPERATOR(op_type, op_class, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \ STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op__##op_type, \ __reg_op__##op_type, \
...@@ -144,20 +141,29 @@ class OpKernelRegistrar : public Registrar { ...@@ -144,20 +141,29 @@ class OpKernelRegistrar : public Registrar {
} }
/** /**
* Macro to register Operator. * Macro to register Operator. When the input is duplicable, you should
* use REGISTER_OP_EX with drop_empty_grad=false instead.
*/ */
#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ #define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class) \ grad_op_class) \
REGISTER_OPERATOR(grad_op_type, grad_op_class); \ REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \
class _GradOpDescMaker_##grad_op_type##_ \ grad_op_class, true)
: public ::paddle::framework::DefaultGradOpDescMaker<true> { \
using ::paddle::framework::DefaultGradOpDescMaker< \ // When an argument is duplicable, we need to use this version.
true>::DefaultGradOpDescMaker; \ // Perhaps we can omit DropEmptyIG template parameter and
\ // only have one version of REGISTER_OP.
protected: \ #define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \
virtual std::string GradOpType() const { return #grad_op_type; } \ grad_op_class, drop_empty_grad) \
}; \ REGISTER_OPERATOR(grad_op_type, grad_op_class); \
REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \ class _GradOpDescMaker_##grad_op_type##_ \
: public ::paddle::framework::DefaultGradOpDescMaker<drop_empty_grad> { \
using ::paddle::framework::DefaultGradOpDescMaker< \
drop_empty_grad>::DefaultGradOpDescMaker; \
\
protected: \
virtual std::string GradOpType() const { return #grad_op_type; } \
}; \
REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \
op_maker_class); op_maker_class);
#define REGISTER_OP_WITH_KERNEL(op_type, ...) \ #define REGISTER_OP_WITH_KERNEL(op_type, ...) \
...@@ -182,7 +188,7 @@ class OpKernelRegistrar : public Registrar { ...@@ -182,7 +188,7 @@ class OpKernelRegistrar : public Registrar {
} }
#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ #define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__) REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \ #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
......
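For reference, a hedged example of the new registration macro (the operator, maker, and gradient class names below are placeholders, not from this commit): an operator with a duplicable input registers with `drop_empty_grad=false`, while `REGISTER_OP` keeps the old behaviour by expanding to `REGISTER_OP_EX(..., true)`.

```cpp
// Sketch only: MyConcatOp / MyConcatOpMaker / MyConcatGradOp are placeholders.
REGISTER_OP_EX(my_concat, MyConcatOp, MyConcatOpMaker, my_concat_grad,
               MyConcatGradOp, false);
```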
...@@ -8,8 +8,7 @@ namespace framework { ...@@ -8,8 +8,7 @@ namespace framework {
class CosineOp : public OperatorBase { class CosineOp : public OperatorBase {
public: public:
using OperatorBase::OperatorBase; using OperatorBase::OperatorBase;
void Run(const Scope& scope, void Run(const Scope& scope, const platform::Place& place) const override {}
const platform::DeviceContext& dev_ctx) const override {}
}; };
class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
...@@ -28,8 +27,7 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { ...@@ -28,8 +27,7 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
class MyTestOp : public OperatorBase { class MyTestOp : public OperatorBase {
public: public:
using OperatorBase::OperatorBase; using OperatorBase::OperatorBase;
void Run(const Scope& scope, void Run(const Scope& scope, const platform::Place& place) const override {}
const platform::DeviceContext& dev_ctx) const override {}
}; };
class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
...@@ -76,8 +74,8 @@ TEST(OpRegistry, CreateOp) { ...@@ -76,8 +74,8 @@ TEST(OpRegistry, CreateOp) {
auto op = paddle::framework::OpRegistry::CreateOp(op_desc); auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
paddle::framework::Scope scope; paddle::framework::Scope scope;
paddle::platform::CPUDeviceContext dev_ctx; paddle::platform::CPUPlace cpu_place;
op->Run(scope, dev_ctx); op->Run(scope, cpu_place);
float scale_get = op->Attr<float>("scale"); float scale_get = op->Attr<float>("scale");
ASSERT_EQ(scale_get, scale); ASSERT_EQ(scale_get, scale);
} }
...@@ -117,8 +115,8 @@ TEST(OpRegistry, DefaultValue) { ...@@ -117,8 +115,8 @@ TEST(OpRegistry, DefaultValue) {
auto op = paddle::framework::OpRegistry::CreateOp(op_desc); auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
paddle::framework::Scope scope; paddle::framework::Scope scope;
paddle::platform::CPUDeviceContext dev_ctx; paddle::platform::CPUPlace cpu_place;
op->Run(scope, dev_ctx); op->Run(scope, cpu_place);
ASSERT_EQ(op->Attr<float>("scale"), 1.0); ASSERT_EQ(op->Attr<float>("scale"), 1.0);
} }
...@@ -167,9 +165,9 @@ TEST(OpRegistry, CustomChecker) { ...@@ -167,9 +165,9 @@ TEST(OpRegistry, CustomChecker) {
attr->set_type(paddle::framework::proto::AttrType::INT); attr->set_type(paddle::framework::proto::AttrType::INT);
attr->set_i(4); attr->set_i(4);
auto op = paddle::framework::OpRegistry::CreateOp(op_desc); auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
paddle::platform::CPUDeviceContext dev_ctx; paddle::platform::CPUPlace cpu_place;
paddle::framework::Scope scope; paddle::framework::Scope scope;
op->Run(scope, dev_ctx); op->Run(scope, cpu_place);
int test_attr = op->Attr<int>("test_attr"); int test_attr = op->Attr<int>("test_attr");
ASSERT_EQ(test_attr, 4); ASSERT_EQ(test_attr, 4);
} }
......
...@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/framework/operator.h"
#include <algorithm> #include <algorithm>
#include <atomic> #include <atomic>
#include "paddle/framework/executor.h"
#include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/operator.h"
#include "paddle/framework/shape_inference.h" #include "paddle/framework/shape_inference.h"
#include "paddle/framework/var_type.h" #include "paddle/framework/var_type.h"
...@@ -240,12 +242,6 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>( ...@@ -240,12 +242,6 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
return res; return res;
} }
std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key) {
os << "place[" << kernel_key.place_ << "]:data_type[" << kernel_key.data_type_
<< "]";
return os;
}
bool OpSupportGPU(const std::string& op_type) { bool OpSupportGPU(const std::string& op_type) {
auto& all_kernels = OperatorWithKernel::AllOpKernels(); auto& all_kernels = OperatorWithKernel::AllOpKernels();
auto it = all_kernels.find(op_type); auto it = all_kernels.find(op_type);
...@@ -388,11 +384,11 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -388,11 +384,11 @@ class RuntimeInferShapeContext : public InferShapeContext {
}; };
void OperatorWithKernel::Run(const Scope& scope, void OperatorWithKernel::Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const { const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope); RuntimeInferShapeContext infer_shape_ctx(*this, scope);
this->InferShape(&infer_shape_ctx); this->InferShape(&infer_shape_ctx);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
ExecutionContext ctx(*this, scope, dev_ctx); auto dev_ctx = pool.Borrow(place);
// check if op[type] has kernel registered. // check if op[type] has kernel registered.
auto& all_op_kernels = AllOpKernels(); auto& all_op_kernels = AllOpKernels();
...@@ -404,19 +400,30 @@ void OperatorWithKernel::Run(const Scope& scope, ...@@ -404,19 +400,30 @@ void OperatorWithKernel::Run(const Scope& scope,
// check if op[type] have kernel for kernel_key // check if op[type] have kernel for kernel_key
OpKernelMap& kernels = kernels_iter->second; OpKernelMap& kernels = kernels_iter->second;
auto kernel_key = GetKernelType(ctx);
auto kernel_iter = kernels.find(kernel_key); ExecutionContext ctx(*this, scope, *dev_ctx);
auto actual_kernel_key = GetActualKernelType(ctx);
auto expected_kernel_key = GetExpectedKernelType(actual_kernel_key);
auto kernel_iter = kernels.find(expected_kernel_key);
if (kernel_iter == kernels.end()) { if (kernel_iter == kernels.end()) {
PADDLE_THROW("The operator %s does not support %s", type_, kernel_key); PADDLE_THROW("The operator %s does not support %s", type_,
expected_kernel_key);
} }
kernel_iter->second->Compute(ctx); kernel_iter->second->Compute(ctx);
} }
OpKernelType OperatorWithKernel::GetKernelType(
OpKernelType OperatorWithKernel::GetActualKernelType(
const ExecutionContext& ctx) const { const ExecutionContext& ctx) const {
return OpKernelType(IndicateDataType(ctx), ctx.GetPlace()); return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
} }
OpKernelType OperatorWithKernel::GetExpectedKernelType(
const OpKernelType& actual_kernel_type) const {
return actual_kernel_type;
}
proto::DataType OperatorWithKernel::IndicateDataType( proto::DataType OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const { const ExecutionContext& ctx) const {
auto& scope = ctx.scope(); auto& scope = ctx.scope();
......
...@@ -23,15 +23,14 @@ limitations under the License. */ ...@@ -23,15 +23,14 @@ limitations under the License. */
#include "glog/logging.h" // For VLOG #include "glog/logging.h" // For VLOG
#include "paddle/framework/attribute.h" #include "paddle/framework/attribute.h"
#include "paddle/framework/block_desc.h" #include "paddle/framework/block_desc.h"
#include "paddle/framework/data_type.h"
#include "paddle/framework/framework.pb.h" #include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_info.h" #include "paddle/framework/op_info.h"
#include "paddle/framework/op_kernel_type.h"
#include "paddle/framework/scope.h" #include "paddle/framework/scope.h"
#include "paddle/framework/selected_rows.h" #include "paddle/framework/selected_rows.h"
#include "paddle/framework/tensor.h" #include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h" #include "paddle/platform/device_context.h"
#include "paddle/platform/place.h"
#include "paddle/platform/variant.h" #include "paddle/platform/variant.h"
#include "paddle/utils/Error.h" #include "paddle/utils/Error.h"
...@@ -53,6 +52,11 @@ constexpr char kGradVarSuffix[] = "@GRAD"; ...@@ -53,6 +52,11 @@ constexpr char kGradVarSuffix[] = "@GRAD";
/// Variables with this suffix are supposed to be filled up with zeros. /// Variables with this suffix are supposed to be filled up with zeros.
constexpr char kZeroVarSuffix[] = "@ZERO"; constexpr char kZeroVarSuffix[] = "@ZERO";
// define some kernel hint
const std::string kUseCPU = "use_cpu";
const std::string kUseCUDNN = "use_cudnn";
const std::string kUseMKLDNN = "use_mkldnn";
inline std::string GradVarName(const std::string& var_name) { inline std::string GradVarName(const std::string& var_name) {
return var_name + kGradVarSuffix; return var_name + kGradVarSuffix;
} }
...@@ -83,8 +87,7 @@ class OperatorBase { ...@@ -83,8 +87,7 @@ class OperatorBase {
virtual std::string DebugString() const; virtual std::string DebugString() const;
/// Net will call this function to Run an op. /// Net will call this function to Run an op.
virtual void Run(const Scope& scope, virtual void Run(const Scope& scope, const platform::Place& place) const = 0;
const platform::DeviceContext& dev_ctx) const = 0;
virtual bool IsNetOp() const { return false; } virtual bool IsNetOp() const { return false; }
...@@ -159,8 +162,7 @@ class OperatorBase { ...@@ -159,8 +162,7 @@ class OperatorBase {
class NOP : public OperatorBase { class NOP : public OperatorBase {
public: public:
using OperatorBase::OperatorBase; using OperatorBase::OperatorBase;
void Run(const Scope& scope, void Run(const Scope& scope, const platform::Place& place) const override {}
const platform::DeviceContext& dev_ctx) const override {}
std::unique_ptr<OperatorBase> Clone() const override { std::unique_ptr<OperatorBase> Clone() const override {
return std::unique_ptr<OperatorBase>(new NOP(*this)); return std::unique_ptr<OperatorBase>(new NOP(*this));
} }
...@@ -345,34 +347,6 @@ class OpKernel : public OpKernelBase { ...@@ -345,34 +347,6 @@ class OpKernel : public OpKernelBase {
using ELEMENT_TYPE = T; using ELEMENT_TYPE = T;
}; };
struct OpKernelType {
struct Hash {
std::hash<int> hash_;
size_t operator()(const OpKernelType& key) const {
int place = key.place_.which();
int data_type = static_cast<int>(key.data_type_);
int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT |
(place & ((1 << NUM_PLACE_TYPE_LIMIT_IN_BIT) - 1));
return hash_(pre_hash);
}
};
platform::Place place_;
proto::DataType data_type_;
OpKernelType(proto::DataType data_type, platform::Place place)
: place_(place), data_type_(data_type) {}
OpKernelType(proto::DataType data_type,
const platform::DeviceContext& dev_ctx)
: place_(dev_ctx.GetPlace()), data_type_(data_type) {}
bool operator==(const OpKernelType& o) const {
return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_;
}
};
class OperatorWithKernel : public OperatorBase { class OperatorWithKernel : public OperatorBase {
public: public:
using OpKernelMap = using OpKernelMap =
...@@ -383,8 +357,7 @@ class OperatorWithKernel : public OperatorBase { ...@@ -383,8 +357,7 @@ class OperatorWithKernel : public OperatorBase {
const VariableNameMap& outputs, const AttributeMap& attrs) const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const Scope& scope, void Run(const Scope& scope, const platform::Place& place) const final;
const platform::DeviceContext& dev_ctx) const final;
static std::unordered_map<std::string /* op_type */, OpKernelMap>& static std::unordered_map<std::string /* op_type */, OpKernelMap>&
AllOpKernels() { AllOpKernels() {
...@@ -405,7 +378,9 @@ class OperatorWithKernel : public OperatorBase { ...@@ -405,7 +378,9 @@ class OperatorWithKernel : public OperatorBase {
} }
protected: protected:
virtual OpKernelType GetKernelType(const ExecutionContext& ctx) const; virtual OpKernelType GetActualKernelType(const ExecutionContext& ctx) const;
virtual OpKernelType GetExpectedKernelType(
const OpKernelType& actual_kernel_type) const;
private: private:
// indicate kernel DataType by input data. By default all input data must be // indicate kernel DataType by input data. By default all input data must be
...@@ -413,8 +388,6 @@ class OperatorWithKernel : public OperatorBase { ...@@ -413,8 +388,6 @@ class OperatorWithKernel : public OperatorBase {
proto::DataType IndicateDataType(const ExecutionContext& ctx) const; proto::DataType IndicateDataType(const ExecutionContext& ctx) const;
}; };
std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key);
extern bool OpSupportGPU(const std::string& op_type); extern bool OpSupportGPU(const std::string& op_type);
} // namespace framework } // namespace framework
......
...@@ -11,11 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,11 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/framework/operator.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/framework/init.h"
#include "paddle/framework/op_info.h" #include "paddle/framework/op_info.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -27,8 +28,7 @@ class OpWithoutKernelTest : public OperatorBase { ...@@ -27,8 +28,7 @@ class OpWithoutKernelTest : public OperatorBase {
OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs, OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs) const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs), x(1) {} : OperatorBase(type, inputs, outputs, attrs), x(1) {}
void Run(const Scope& scope, void Run(const Scope& scope, const platform::Place& place) const override {
const platform::DeviceContext& dev_ctx) const override {
++op_run_num; ++op_run_num;
ASSERT_EQ(static_cast<int>(inputs_.size()), 1); ASSERT_EQ(static_cast<int>(inputs_.size()), 1);
ASSERT_EQ(static_cast<int>(outputs_.size()), 1); ASSERT_EQ(static_cast<int>(outputs_.size()), 1);
...@@ -41,10 +41,9 @@ class OpWithoutKernelTest : public OperatorBase { ...@@ -41,10 +41,9 @@ class OpWithoutKernelTest : public OperatorBase {
int x{0}; int x{0};
}; };
class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker {
public: public:
OpeWithoutKernelTestProtoAndCheckerMaker(OpProto* proto, OpWithoutKernelCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("input", "input of test op"); AddInput("input", "input of test op");
AddOutput("output", "output of test op"); AddOutput("output", "output of test op");
...@@ -65,11 +64,12 @@ static void BuildVar(const std::string& param_name, ...@@ -65,11 +64,12 @@ static void BuildVar(const std::string& param_name,
} }
} }
REGISTER_OP_WITHOUT_GRADIENT( REGISTER_OP_WITHOUT_GRADIENT(test_operator,
test_operator, paddle::framework::OpWithoutKernelTest, paddle::framework::OpWithoutKernelTest,
paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker); paddle::framework::OpWithoutKernelCheckerMaker);
TEST(OperatorBase, all) { TEST(OperatorBase, all) {
paddle::framework::InitDevices({"CPU"});
paddle::framework::proto::OpDesc op_desc; paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("test_operator"); op_desc.set_type("test_operator");
BuildVar("input", {"IN1"}, op_desc.add_inputs()); BuildVar("input", {"IN1"}, op_desc.add_inputs());
...@@ -80,13 +80,13 @@ TEST(OperatorBase, all) { ...@@ -80,13 +80,13 @@ TEST(OperatorBase, all) {
attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_type(paddle::framework::proto::AttrType::FLOAT);
attr->set_f(3.14); attr->set_f(3.14);
paddle::platform::CPUDeviceContext device_context; paddle::platform::CPUPlace cpu_place;
paddle::framework::Scope scope; paddle::framework::Scope scope;
auto op = paddle::framework::OpRegistry::CreateOp(op_desc); auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
scope.Var("OUT1"); scope.Var("OUT1");
ASSERT_EQ(paddle::framework::op_run_num, 0); ASSERT_EQ(paddle::framework::op_run_num, 0);
op->Run(scope, device_context); op->Run(scope, cpu_place);
ASSERT_EQ(paddle::framework::op_run_num, 1); ASSERT_EQ(paddle::framework::op_run_num, 1);
} }
...@@ -114,7 +114,7 @@ class OpWithKernelTest : public OperatorWithKernel { ...@@ -114,7 +114,7 @@ class OpWithKernelTest : public OperatorWithKernel {
protected: protected:
void InferShape(framework::InferShapeContext* ctx) const override {} void InferShape(framework::InferShapeContext* ctx) const override {}
OpKernelType GetKernelType(const ExecutionContext& ctx) const override { OpKernelType GetActualKernelType(const ExecutionContext& ctx) const override {
return OpKernelType(proto::DataType::FP32, ctx.GetPlace()); return OpKernelType(proto::DataType::FP32, ctx.GetPlace());
} }
}; };
...@@ -123,7 +123,6 @@ template <typename T1, typename T2> ...@@ -123,7 +123,6 @@ template <typename T1, typename T2>
class CPUKernelTest : public OpKernel<float> { class CPUKernelTest : public OpKernel<float> {
public: public:
void Compute(const ExecutionContext& ctx) const { void Compute(const ExecutionContext& ctx) const {
std::cout << "this is cpu kernel" << std::endl;
std::cout << ctx.op().DebugString() << std::endl; std::cout << ctx.op().DebugString() << std::endl;
cpu_kernel_run_num++; cpu_kernel_run_num++;
ASSERT_EQ(ctx.op().Input("x"), "IN1"); ASSERT_EQ(ctx.op().Input("x"), "IN1");
...@@ -195,6 +194,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel, ...@@ -195,6 +194,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel,
// test with single input // test with single input
TEST(OpKernel, all) { TEST(OpKernel, all) {
paddle::framework::InitDevices({"CPU"});
paddle::framework::proto::OpDesc op_desc; paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("op_with_kernel"); op_desc.set_type("op_with_kernel");
BuildVar("x", {"IN1"}, op_desc.add_inputs()); BuildVar("x", {"IN1"}, op_desc.add_inputs());
...@@ -205,12 +205,12 @@ TEST(OpKernel, all) { ...@@ -205,12 +205,12 @@ TEST(OpKernel, all) {
attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_type(paddle::framework::proto::AttrType::FLOAT);
attr->set_f(3.14); attr->set_f(3.14);
paddle::platform::CPUDeviceContext cpu_device_context; paddle::platform::CPUPlace cpu_place;
paddle::framework::Scope scope; paddle::framework::Scope scope;
auto op = paddle::framework::OpRegistry::CreateOp(op_desc); auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
op->Run(scope, cpu_device_context); op->Run(scope, cpu_place);
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
} }
...@@ -224,7 +224,9 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel, ...@@ -224,7 +224,9 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
TEST(OpKernel, multi_inputs) { TEST(OpKernel, multi_inputs) {
using namespace paddle::framework; using namespace paddle::framework;
paddle::framework::InitDevices({"CPU"});
proto::OpDesc op_desc; proto::OpDesc op_desc;
op_desc.set_type("op_multi_inputs_with_kernel"); op_desc.set_type("op_multi_inputs_with_kernel");
BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs()); BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs());
BuildVar("k", {"k0"}, op_desc.add_inputs()); BuildVar("k", {"k0"}, op_desc.add_inputs());
...@@ -235,7 +237,7 @@ TEST(OpKernel, multi_inputs) { ...@@ -235,7 +237,7 @@ TEST(OpKernel, multi_inputs) {
attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_type(paddle::framework::proto::AttrType::FLOAT);
attr->set_f(3.14); attr->set_f(3.14);
paddle::platform::CPUDeviceContext cpu_device_context; paddle::platform::CPUPlace cpu_place;
paddle::framework::Scope scope; paddle::framework::Scope scope;
scope.Var("x0")->GetMutable<LoDTensor>(); scope.Var("x0")->GetMutable<LoDTensor>();
scope.Var("x1")->GetMutable<LoDTensor>(); scope.Var("x1")->GetMutable<LoDTensor>();
...@@ -245,7 +247,7 @@ TEST(OpKernel, multi_inputs) { ...@@ -245,7 +247,7 @@ TEST(OpKernel, multi_inputs) {
scope.Var("y1")->GetMutable<LoDTensor>(); scope.Var("y1")->GetMutable<LoDTensor>();
auto op = paddle::framework::OpRegistry::CreateOp(op_desc); auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
op->Run(scope, cpu_device_context); op->Run(scope, cpu_place);
} }
class OperatorClone : public paddle::framework::OperatorBase { class OperatorClone : public paddle::framework::OperatorBase {
...@@ -257,10 +259,11 @@ class OperatorClone : public paddle::framework::OperatorBase { ...@@ -257,10 +259,11 @@ class OperatorClone : public paddle::framework::OperatorBase {
const paddle::framework::AttributeMap& attrs) const paddle::framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const paddle::framework::Scope& scope, void Run(const paddle::framework::Scope& scope,
const paddle::platform::DeviceContext& dev_ctx) const override {} const paddle::platform::Place& place) const override {}
}; };
TEST(Operator, Clone) { TEST(Operator, Clone) {
paddle::framework::InitDevices({"CPU"});
OperatorClone a("ABC", paddle::framework::VariableNameMap{}, OperatorClone a("ABC", paddle::framework::VariableNameMap{},
paddle::framework::VariableNameMap{}, paddle::framework::VariableNameMap{},
paddle::framework::AttributeMap{}); paddle::framework::AttributeMap{});
......
...@@ -71,7 +71,7 @@ private: ...@@ -71,7 +71,7 @@ private:
``` ```
```c++ ```c++
typedef boost::variant<GpuPlace, CpuPlace> Place; typedef boost::variant<CUDAPlace, CpuPlace> Place;
typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>,
Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar; Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar;
typedef boost::variant< typedef boost::variant<
......
...@@ -125,11 +125,11 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { ...@@ -125,11 +125,11 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
boost::get<platform::CPUPlace>(place), size, type)); boost::get<platform::CPUPlace>(place), size, type));
} else if (platform::is_gpu_place(place)) { } else if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
} }
#else #else
holder_.reset(new PlaceholderImpl<platform::GPUPlace>( holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
boost::get<platform::GPUPlace>(place), size, type)); boost::get<platform::CUDAPlace>(place), size, type));
} }
#endif #endif
offset_ = 0; offset_ = 0;
......
...@@ -80,20 +80,20 @@ TEST(Tensor, MutableData) { ...@@ -80,20 +80,20 @@ TEST(Tensor, MutableData) {
float* p1 = nullptr; float* p1 = nullptr;
float* p2 = nullptr; float* p2 = nullptr;
// initialization // initialization
p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace()); p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CUDAPlace());
EXPECT_NE(p1, nullptr); EXPECT_NE(p1, nullptr);
// set src_tensor a new dim with large size // set src_tensor a new dim with large size
// momery is supposed to be re-allocated // momery is supposed to be re-allocated
p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), GPUPlace()); p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CUDAPlace());
EXPECT_NE(p2, nullptr); EXPECT_NE(p2, nullptr);
EXPECT_NE(p1, p2); EXPECT_NE(p1, p2);
// set src_tensor a new dim with same size // set src_tensor a new dim with same size
// memory block is supposed to be unchanged // memory block is supposed to be unchanged
p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), GPUPlace()); p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CUDAPlace());
EXPECT_EQ(p1, p2); EXPECT_EQ(p1, p2);
// set src_tensor a new dim with smaller size // set src_tensor a new dim with smaller size
// memory block is supposed to be unchanged // memory block is supposed to be unchanged
p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace()); p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CUDAPlace());
EXPECT_EQ(p1, p2); EXPECT_EQ(p1, p2);
} }
#endif #endif
...@@ -130,7 +130,7 @@ TEST(Tensor, ShareDataWith) { ...@@ -130,7 +130,7 @@ TEST(Tensor, ShareDataWith) {
{ {
Tensor src_tensor; Tensor src_tensor;
Tensor dst_tensor; Tensor dst_tensor;
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace()); src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CUDAPlace());
dst_tensor.ShareDataWith(src_tensor); dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>()); ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
} }
...@@ -166,7 +166,7 @@ TEST(Tensor, Slice) { ...@@ -166,7 +166,7 @@ TEST(Tensor, Slice) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
{ {
Tensor src_tensor; Tensor src_tensor;
src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace()); src_tensor.mutable_data<double>(make_ddim({6, 9}), CUDAPlace());
Tensor slice_tensor = src_tensor.Slice(2, 6); Tensor slice_tensor = src_tensor.Slice(2, 6);
DDim slice_dims = slice_tensor.dims(); DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 2); ASSERT_EQ(arity(slice_dims), 2);
...@@ -176,11 +176,11 @@ TEST(Tensor, Slice) { ...@@ -176,11 +176,11 @@ TEST(Tensor, Slice) {
uintptr_t src_data_address = uintptr_t src_data_address =
reinterpret_cast<uintptr_t>(src_tensor.data<double>()); reinterpret_cast<uintptr_t>(src_tensor.data<double>());
uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>( uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
src_tensor.mutable_data<double>(src_tensor.dims(), GPUPlace())); src_tensor.mutable_data<double>(src_tensor.dims(), CUDAPlace()));
uintptr_t slice_data_address = uintptr_t slice_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.data<double>()); reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>( uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
slice_tensor.mutable_data<double>(slice_tensor.dims(), GPUPlace())); slice_tensor.mutable_data<double>(slice_tensor.dims(), CUDAPlace()));
EXPECT_EQ(src_data_address, src_mutable_data_address); EXPECT_EQ(src_data_address, src_mutable_data_address);
EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(slice_data_address, slice_mutable_data_address);
EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
......
...@@ -47,11 +47,11 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, ...@@ -47,11 +47,11 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src_place) && // NOLINT else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) { platform::is_cpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::GPUPlace>(src_place); auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place); auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
auto ctx_place = ctx.GetPlace(); auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place); auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
memory::Copy( memory::Copy(
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
...@@ -59,21 +59,21 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, ...@@ -59,21 +59,21 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
} else if (platform::is_cpu_place(src_place) && } else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) { platform::is_gpu_place(dst_place)) {
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place); auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place); auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace(); auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place); auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
memory::Copy( memory::Copy(
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()); reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} else if (platform::is_gpu_place(src_place) && } else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) { platform::is_gpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::GPUPlace>(src_place); auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place); auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace(); auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place); auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
memory::Copy( memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
...@@ -82,6 +82,28 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, ...@@ -82,6 +82,28 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
#endif #endif
} }
/**
* @brief CopyFrom supports CPU <-> CPU
*/
inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
Tensor* dst) {
src.check_memory_size();
dst->Resize(src.dims());
auto src_place = src.place();
auto src_ptr = src.data<void>();
auto dst_ptr = dst->mutable_data(dst_place, src.type());
auto size = src.numel() * SizeOfType(src.type());
PADDLE_ENFORCE(platform::is_cpu_place(src_place) &&
platform::is_cpu_place(dst_place));
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size);
}
/** /**
* @brief Copy the content of an external vector to a tensor. * @brief Copy the content of an external vector to a tensor.
* *
...@@ -108,13 +130,28 @@ inline void CopyFromVector(const std::vector<T>& src, ...@@ -108,13 +130,28 @@ inline void CopyFromVector(const std::vector<T>& src,
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(dst_place)) { // NOLINT else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy( memory::Copy(
boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place, src_ptr, boost::get<platform::CUDAPlace>(dst_place), dst_ptr, src_place, src_ptr,
size, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()); reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
} }
/**
* @brief CopyFromVector CPU vector -> CPU Tensor
*/
template <typename T>
inline void CopyFromVector(const std::vector<T>& src, Tensor* dst) {
platform::CPUPlace dst_place = platform::CPUPlace();
auto src_ptr = static_cast<const void*>(src.data());
platform::CPUPlace src_place;
dst->Resize({static_cast<int64_t>(src.size())});
auto dst_ptr = static_cast<void*>(dst->mutable_data<T>(dst_place));
auto size = src.size() * sizeof(T);
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
/** /**
* @brief Copy the content of a tensor to a vector * @brief Copy the content of a tensor to a vector
* *
...@@ -141,12 +178,30 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx, ...@@ -141,12 +178,30 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src.place())) { // NOLINT else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy( memory::Copy(
dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()), dst_place, dst_ptr, boost::get<platform::CUDAPlace>(src.place()),
src_ptr, size, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()); reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
} }
/**
* @brief CopyToVector CPU Tensor -> CPU vector
*/
template <typename T>
inline void CopyToVector(const Tensor& src, std::vector<T>* dst) {
auto src_ptr = static_cast<const void*>(src.data<T>());
auto size = src.numel() * sizeof(T);
platform::CPUPlace dst_place;
dst->resize(src.numel());
auto dst_ptr = static_cast<void*>(dst->data());
PADDLE_ENFORCE(platform::is_cpu_place(src.place()));
memory::Copy(dst_place, dst_ptr, boost::get<platform::CPUPlace>(src.place()),
src_ptr, size);
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
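A minimal usage sketch of the two new device-context-free overloads above (variable and function names are illustrative, and the header paths assume the usual `paddle/framework/` layout; both sides must live on `CPUPlace`):

```cpp
#include <vector>
#include "paddle/framework/tensor.h"
#include "paddle/framework/tensor_util.h"

void CpuRoundTrip() {
  std::vector<int> src_vec = {1, 2, 3, 4};
  paddle::framework::Tensor t;
  // CPU vector -> CPU tensor, no DeviceContext needed
  paddle::framework::CopyFromVector<int>(src_vec, &t);
  std::vector<int> dst_vec;
  // CPU tensor -> CPU vector
  paddle::framework::CopyToVector<int>(t, &dst_vec);
}
```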
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
TEST(CopyFrom, Tensor) { TEST(CopyFrom, Tensor) {
Tensor src_tensor; Tensor src_tensor;
Tensor dst_tensor; Tensor dst_tensor;
...@@ -29,7 +30,7 @@ TEST(CopyFrom, Tensor) { ...@@ -29,7 +30,7 @@ TEST(CopyFrom, Tensor) {
memcpy(src_ptr, arr, 9 * sizeof(int)); memcpy(src_ptr, arr, 9 * sizeof(int));
auto cpu_place = new platform::CPUPlace(); auto cpu_place = new platform::CPUPlace();
CopyFrom(src_tensor, *cpu_place, cpu_ctx, &dst_tensor); CopyFrom(src_tensor, *cpu_place, &dst_tensor);
const int* dst_ptr = dst_tensor.data<int>(); const int* dst_ptr = dst_tensor.data<int>();
ASSERT_NE(src_ptr, dst_ptr); ASSERT_NE(src_ptr, dst_ptr);
...@@ -58,7 +59,7 @@ TEST(CopyFrom, Tensor) { ...@@ -58,7 +59,7 @@ TEST(CopyFrom, Tensor) {
memcpy(src_ptr, arr, 9 * sizeof(int)); memcpy(src_ptr, arr, 9 * sizeof(int));
// CPU Tensor to GPU Tensor // CPU Tensor to GPU Tensor
auto gpu_place = new platform::GPUPlace(0); auto gpu_place = new platform::CUDAPlace(0);
platform::CUDADeviceContext gpu_ctx(*gpu_place); platform::CUDADeviceContext gpu_ctx(*gpu_place);
CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
...@@ -104,8 +105,7 @@ TEST(CopyFromVector, Tensor) { ...@@ -104,8 +105,7 @@ TEST(CopyFromVector, Tensor) {
// Copy to CPU Tensor // Copy to CPU Tensor
cpu_tensor.Resize(make_ddim({3, 3})); cpu_tensor.Resize(make_ddim({3, 3}));
auto cpu_place = new paddle::platform::CPUPlace(); auto cpu_place = new paddle::platform::CPUPlace();
CPUDeviceContext cpu_ctx(*cpu_place); CopyFromVector<int>(src_vec, &cpu_tensor);
CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
// Compare Tensors // Compare Tensors
const int* cpu_ptr = cpu_tensor.data<int>(); const int* cpu_ptr = cpu_tensor.data<int>();
...@@ -117,7 +117,7 @@ TEST(CopyFromVector, Tensor) { ...@@ -117,7 +117,7 @@ TEST(CopyFromVector, Tensor) {
src_vec.erase(src_vec.begin(), src_vec.begin() + 5); src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
cpu_tensor.Resize(make_ddim({2, 2})); cpu_tensor.Resize(make_ddim({2, 2}));
CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor); CopyFromVector<int>(src_vec, &cpu_tensor);
cpu_ptr = cpu_tensor.data<int>(); cpu_ptr = cpu_tensor.data<int>();
src_ptr = src_vec.data(); src_ptr = src_vec.data();
ASSERT_NE(src_ptr, cpu_ptr); ASSERT_NE(src_ptr, cpu_ptr);
...@@ -143,7 +143,7 @@ TEST(CopyFromVector, Tensor) { ...@@ -143,7 +143,7 @@ TEST(CopyFromVector, Tensor) {
// Copy to GPUTensor // Copy to GPUTensor
gpu_tensor.Resize(make_ddim({3, 3})); gpu_tensor.Resize(make_ddim({3, 3}));
auto gpu_place = new paddle::platform::GPUPlace(); auto gpu_place = new paddle::platform::CUDAPlace();
CUDADeviceContext gpu_ctx(*gpu_place); CUDADeviceContext gpu_ctx(*gpu_place);
CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor); CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
// Copy from GPU to CPU tensor for comparison // Copy from GPU to CPU tensor for comparison
...@@ -198,9 +198,8 @@ TEST(CopyToVector, Tensor) { ...@@ -198,9 +198,8 @@ TEST(CopyToVector, Tensor) {
} }
CPUPlace place; CPUPlace place;
CPUDeviceContext cpu_ctx(place);
std::vector<int> dst; std::vector<int> dst;
CopyToVector<int>(src, cpu_ctx, &dst); CopyToVector<int>(src, &dst);
for (int i = 0; i < 3 * 3; ++i) { for (int i = 0; i < 3 * 3; ++i) {
EXPECT_EQ(src_ptr[i], dst[i]); EXPECT_EQ(src_ptr[i], dst[i]);
...@@ -210,7 +209,7 @@ TEST(CopyToVector, Tensor) { ...@@ -210,7 +209,7 @@ TEST(CopyToVector, Tensor) {
{ {
std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
Tensor gpu_tensor; Tensor gpu_tensor;
GPUPlace place; CUDAPlace place;
CUDADeviceContext gpu_ctx(place); CUDADeviceContext gpu_ctx(place);
CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor); CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include "paddle/platform/call_once.h"
#include "paddle/platform/enforce.h"
namespace paddle {
namespace framework {
typedef std::function<void()> Task;
class ThreadPool {
public:
/**
* @brief Get an instance of the thread pool; the number of threads will
* be set to the number of hardware thread contexts
*/
static ThreadPool* GetInstance() {
std::call_once(init_flag, &ThreadPool::Init);
return threadpool.get();
}
~ThreadPool() {
{
// notify all threads to stop running
running_ = false;
scheduled_.notify_all();
}
for (auto& t : threads_) {
t->join();
t.reset(nullptr);
}
}
int GetNumThreads() const { return num_threads_; }
int GetAvailable() {
std::unique_lock<std::mutex> lock(mutex_);
return available_;
}
/**
* @brief Push a function to the task queue; it will be scheduled and
* executed once a thread is available.
* @param[in] fn The task to be pushed to the task queue.
*/
void Run(const Task& fn) {
std::unique_lock<std::mutex> lock(mutex_);
tasks_.push(fn);
lock.unlock();
scheduled_.notify_one();
}
/**
* @brief Wait until all the tasks are completed.
*/
void Wait() {
std::unique_lock<std::mutex> lock(mutex_);
completed_.wait(lock, [=] { return Done() == true; });
}
private:
ThreadPool& operator=(const ThreadPool&) = delete;
ThreadPool(const ThreadPool&) = delete;
ThreadPool(int num_threads)
: num_threads_(num_threads), available_(num_threads), running_(true) {
threads_.resize(num_threads);
for (auto& thread : threads_) {
// TODO(Yancey1989): bind the thread to the specified CPU number
thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
}
}
/**
* @brief If the task queue is empty and available_
* equals the number of threads, then all tasks
* are completed.
*
* Note: this function is not thread-safe.
*
* @return true if all tasks are completed.
*/
bool Done() { return tasks_.empty() && available_ == num_threads_; }
void TaskLoop() {
while (running_) {
std::unique_lock<std::mutex> lock(mutex_);
scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
if (!running_) {
break;
}
// pop a task from the task queue
auto task = tasks_.front();
tasks_.pop();
--available_;
lock.unlock();
// run the task
task();
{
std::unique_lock<std::mutex> lock(mutex_);
++available_;
if (Done()) {
completed_.notify_all();
}
}
}
}
static void Init() {
if (threadpool.get() == nullptr) {
// TODO(Yancey1989): specify the max threads number
int num_threads = std::thread::hardware_concurrency();
PADDLE_ENFORCE_GT(num_threads, 0);
threadpool.reset(new ThreadPool(num_threads));
}
}
private:
static std::unique_ptr<ThreadPool> threadpool;
static std::once_flag init_flag;
int num_threads_;
int available_;
bool running_;
std::queue<Task> tasks_;
std::vector<std::unique_ptr<std::thread>> threads_;
std::mutex mutex_;
std::condition_variable scheduled_;
std::condition_variable completed_;
};
std::unique_ptr<ThreadPool> ThreadPool::threadpool(nullptr);
std::once_flag ThreadPool::init_flag;
} // namespace framework
} // namespace paddle
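A minimal usage sketch of the singleton (assuming the header lives at `paddle/framework/threadpool.h`; the test below exercises the same API more thoroughly):

```cpp
#include "paddle/framework/threadpool.h"

void RunSomeTasks() {
  auto* pool = paddle::framework::ThreadPool::GetInstance();
  for (int i = 0; i < 8; ++i) {
    // Each task is a std::function<void()>; capture what it needs by value.
    pool->Run([i] { /* do work with i */ });
  }
  pool->Wait();  // blocks until the queue is empty and all workers are idle
}
```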
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "threadpool.h"
#include <gtest/gtest.h>
#include <atomic>
#include <chrono>
#include <map>
#include <thread>
namespace framework = paddle::framework;
void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
for (int i = 0; i < cnt; ++i) {
pool->Run([&sum]() { sum.fetch_add(1); });
}
}
TEST(ThreadPool, ConcurrentInit) {
framework::ThreadPool* pool;
int concurrent_cnt = 50;
std::vector<std::thread> threads;
for (int i = 0; i < concurrent_cnt; ++i) {
std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); });
threads.push_back(std::move(t));
}
for (auto& t : threads) {
t.join();
}
}
TEST(ThreadPool, ConcurrentStart) {
framework::ThreadPool* pool = framework::ThreadPool::GetInstance();
std::atomic<int> sum(0);
std::vector<std::thread> threads;
int concurrent_cnt = 50;
// sum = (n * (n + 1)) / 2
for (int i = 1; i <= concurrent_cnt; ++i) {
std::thread t(do_sum, pool, std::ref(sum), i);
threads.push_back(std::move(t));
}
for (auto& t : threads) {
t.join();
}
pool->Wait();
EXPECT_EQ(sum, ((concurrent_cnt + 1) * concurrent_cnt) / 2);
}
...@@ -12,13 +12,13 @@ p = memory::Alloc(platform::CPUPlace(), 4*1024); ...@@ -12,13 +12,13 @@ p = memory::Alloc(platform::CPUPlace(), 4*1024);
To allocate 4KB memory on the 3rd GPU: To allocate 4KB memory on the 3rd GPU:
```cpp ```cpp
p = memory::Alloc(platform::GPUPlace(2), 4*1024); p = memory::Alloc(platform::CUDAPlace(2), 4*1024);
``` ```
To free memory and check the so-far used amount of memory on a place: To free memory and check the so-far used amount of memory on a place:
```cpp ```cpp
auto pl = platform::GPUPlace(0); auto pl = platform::CUDAPlace(0);
p = memory::Alloc(pl, 4*1024); p = memory::Alloc(pl, 4*1024);
cout << memory::Used(pl); cout << memory::Used(pl);
memory::Free(pl, p); memory::Free(pl, p);
...@@ -36,7 +36,7 @@ template <typename Place> size_t Used(Place); ...@@ -36,7 +36,7 @@ template <typename Place> size_t Used(Place);
} // namespace memory } // namespace memory
``` ```
These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`: These function templates have specializations on either `platform::CPUPlace` or `platform::CUDAPlace`:
```cpp ```cpp
template<> template<>
...@@ -49,7 +49,7 @@ and ...@@ -49,7 +49,7 @@ and
```cpp ```cpp
template<> template<>
void Alloc<GPUPlace>(GPUPlace p, size_t size) { void Alloc<CUDAPlace>(CUDAPlace p, size_t size) {
return GetGPUBuddyAllocator(p.id)->Alloc(size); return GetGPUBuddyAllocator(p.id)->Alloc(size);
} }
``` ```
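The matching `Free` and `Used` specializations follow the same pattern; a sketch consistent with the GPU allocator calls shown further down in this change:

```cpp
template<>
void Free<CUDAPlace>(CUDAPlace p, void* ptr) {
  GetGPUBuddyAllocator(p.id)->Free(ptr);
}

template<>
size_t Used<CUDAPlace>(CUDAPlace p) {
  return GetGPUBuddyAllocator(p.id)->Used();
}
```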
...@@ -122,7 +122,7 @@ There are two implementations of `Context`: ...@@ -122,7 +122,7 @@ There are two implementations of `Context`:
1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory. 1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::GPUPlace`, which also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory. 1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::CUDAPlace`, which also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
### Majel ### Majel
......
...@@ -28,31 +28,25 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst, ...@@ -28,31 +28,25 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
template <> template <>
void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place, void Copy<platform::CPUPlace, platform::CUDAPlace>(
void* dst, platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
platform::GPUPlace src_place, const void* src, size_t num, cudaStream_t stream) {
const void* src, size_t num,
cudaStream_t stream) {
platform::SetDeviceId(src_place.device); platform::SetDeviceId(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} }
template <> template <>
void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place, void Copy<platform::CUDAPlace, platform::CPUPlace>(
void* dst, platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
platform::CPUPlace src_place, const void* src, size_t num, cudaStream_t stream) {
const void* src, size_t num,
cudaStream_t stream) {
platform::SetDeviceId(dst_place.device); platform::SetDeviceId(dst_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} }
template <> template <>
void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place, void Copy<platform::CUDAPlace, platform::CUDAPlace>(
void* dst, platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
platform::GPUPlace src_place, const void* src, size_t num, cudaStream_t stream) {
const void* src, size_t num,
cudaStream_t stream) {
if (dst_place == src_place) { if (dst_place == src_place) {
platform::SetDeviceId(src_place.device); platform::SetDeviceId(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
...@@ -62,33 +56,6 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place, ...@@ -62,33 +56,6 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
} }
} }
template <>
void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::GPUPlace src_place,
const void* src, size_t num) {
platform::SetDeviceId(src_place.device);
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
}
template <>
void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num) {
platform::SetDeviceId(dst_place.device);
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
}
template <>
void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
void* dst,
platform::GPUPlace src_place,
const void* src, size_t num) {
platform::SetDeviceId(dst_place.device);
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
}
#endif #endif
} // namespace memory } // namespace memory
......
...@@ -83,12 +83,12 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { ...@@ -83,12 +83,12 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
} }
template <> template <>
size_t Used<platform::GPUPlace>(platform::GPUPlace place) { size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
return GetGPUBuddyAllocator(place.device)->Used(); return GetGPUBuddyAllocator(place.device)->Used();
} }
template <> template <>
void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) { void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
auto* buddy_allocator = GetGPUBuddyAllocator(place.device); auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
auto* ptr = buddy_allocator->Alloc(size); auto* ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) { if (ptr == nullptr) {
...@@ -101,14 +101,14 @@ void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) { ...@@ -101,14 +101,14 @@ void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
LOG(WARNING) << "total " << total; LOG(WARNING) << "total " << total;
LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize(); LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize(); LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
LOG(WARNING) << "GPU memory used: " << Used<platform::GPUPlace>(place); LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
platform::SetDeviceId(cur_dev); platform::SetDeviceId(cur_dev);
} }
return ptr; return ptr;
} }
template <> template <>
void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) { void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
GetGPUBuddyAllocator(place.device)->Free(p); GetGPUBuddyAllocator(place.device)->Free(p);
} }
......
...@@ -82,7 +82,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { ...@@ -82,7 +82,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
size_t align(size_t size, paddle::platform::GPUPlace place) { size_t align(size_t size, paddle::platform::CUDAPlace place) {
size += sizeof(paddle::memory::detail::Metadata); size += sizeof(paddle::memory::detail::Metadata);
size_t alignment = paddle::platform::GpuMinChunkSize(); size_t alignment = paddle::platform::GpuMinChunkSize();
size_t remaining = size % alignment; size_t remaining = size % alignment;
...@@ -94,7 +94,7 @@ TEST(BuddyAllocator, GPUAllocation) { ...@@ -94,7 +94,7 @@ TEST(BuddyAllocator, GPUAllocation) {
EXPECT_EQ(p, nullptr); EXPECT_EQ(p, nullptr);
paddle::platform::GPUPlace gpu(0); paddle::platform::CUDAPlace gpu(0);
p = paddle::memory::Alloc(gpu, 4096); p = paddle::memory::Alloc(gpu, 4096);
EXPECT_NE(p, nullptr); EXPECT_NE(p, nullptr);
...@@ -103,7 +103,7 @@ TEST(BuddyAllocator, GPUAllocation) { ...@@ -103,7 +103,7 @@ TEST(BuddyAllocator, GPUAllocation) {
} }
TEST(BuddyAllocator, GPUMultAlloc) { TEST(BuddyAllocator, GPUMultAlloc) {
paddle::platform::GPUPlace gpu; paddle::platform::CUDAPlace gpu;
std::unordered_map<void *, size_t> ps; std::unordered_map<void *, size_t> ps;
......
...@@ -53,7 +53,7 @@ class AccuracyOp : public framework::OperatorWithKernel { ...@@ -53,7 +53,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Out")->type()), framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
......
...@@ -56,7 +56,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> { ...@@ -56,7 +56,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace."); "It must use CUDAPlace.");
auto* inference = ctx.Input<Tensor>("Out"); auto* inference = ctx.Input<Tensor>("Out");
auto* indices = ctx.Input<Tensor>("Indices"); auto* indices = ctx.Input<Tensor>("Indices");
auto* label = ctx.Input<Tensor>("Label"); auto* label = ctx.Input<Tensor>("Label");
......
...@@ -13,59 +13,113 @@ See the License for the specific language governing permissions and ...@@ -13,59 +13,113 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/framework/eigen.h" #include <math.h> // for sqrt in CPU and CUDA
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/detail/safe_ref.h"
#include "paddle/platform/for_range.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T>
struct AdamFunctor {
T beta1_;
T beta2_;
T epsilon_;
const T* beta1_pow_;
const T* beta2_pow_;
const T* moment1_;
T* moment1_out_;
const T* moment2_;
T* moment2_out_;
const T* lr_;
const T* grad_;
const T* param_;
T* param_out_;
AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2,
T* mom2_out, const T* lr, const T* grad, const T* param,
T* param_out)
: beta1_(beta1),
beta2_(beta2),
epsilon_(epsilon),
beta1_pow_(beta1_pow),
beta2_pow_(beta2_pow),
moment1_(mom1),
moment1_out_(mom1_out),
moment2_(mom2),
moment2_out_(mom2_out),
lr_(lr),
grad_(grad),
param_(param),
param_out_(param_out) {}
inline HOSTDEVICE void operator()(size_t i) const {
// Merge all memory access together.
T g = grad_[i];
T mom1 = moment1_[i];
T mom2 = moment2_[i];
T lr = *lr_;
T beta1_pow = *beta1_pow_;
T beta2_pow = *beta2_pow_;
T p = param_[i];
// Calculation
lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
mom1 = beta1_ * mom1 + (1 - beta1_) * g;
mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
// Write back to global memory
moment1_out_[i] = mom1;
moment2_out_[i] = mom2;
param_out_[i] = p;
}
};
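For reference, the per-element update that `AdamFunctor::operator()` implements is the standard Adam rule (here $t$ is the step count, so `beta1_pow_` holds $\beta_1^t$ and `beta2_pow_` holds $\beta_2^t$):

$$
\hat{\alpha} = \alpha\,\frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t}, \qquad
m \leftarrow \beta_1 m + (1-\beta_1)\,g, \qquad
v \leftarrow \beta_2 v + (1-\beta_2)\,g^2, \qquad
\theta \leftarrow \theta - \hat{\alpha}\,\frac{m}{\sqrt{v}+\epsilon}
$$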
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class AdamOpKernel : public framework::OpKernel<T> { class AdamOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut"); using paddle::framework::LoDTensor;
auto moment1_out_tensor = ctx.Output<framework::Tensor>("Moment1Out"); using paddle::operators::detail::Ref;
auto moment2_out_tensor = ctx.Output<framework::Tensor>("Moment2Out");
param_out_tensor->mutable_data<T>(ctx.GetPlace());
moment1_out_tensor->mutable_data<T>(ctx.GetPlace());
moment2_out_tensor->mutable_data<T>(ctx.GetPlace());
T beta1 = static_cast<T>(ctx.Attr<float>("beta1")); T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
T beta2 = static_cast<T>(ctx.Attr<float>("beta2")); T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon")); T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
auto& param = Ref(ctx.Input<LoDTensor>("Param"), "Must set Param");
auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
auto& mom1 = Ref(ctx.Input<LoDTensor>("Moment1"), "Must set Moment1");
auto& mom2 = Ref(ctx.Input<LoDTensor>("Moment2"), "Must set Moment2");
auto& lr =
Ref(ctx.Input<LoDTensor>("LearningRate"), "Must set LearningRate");
auto& beta1_pow =
Ref(ctx.Input<LoDTensor>("Beta1Pow"), "Must set Beta1Pow");
auto& beta2_pow =
Ref(ctx.Input<LoDTensor>("Beta2Pow"), "Must set Beta2Pow");
auto& param_out =
Ref(ctx.Output<LoDTensor>("ParamOut"), "Must set ParamOut");
auto& mom1_out =
Ref(ctx.Output<LoDTensor>("Moment1Out"), "Must set Moment1Out");
auto& mom2_out =
Ref(ctx.Output<LoDTensor>("Moment2Out"), "Must set Moment2Out");
auto param = framework::EigenVector<T>::Flatten( AdamFunctor<T> functor(beta1, beta2, epsilon, beta1_pow.template data<T>(),
*ctx.Input<framework::Tensor>("Param")); beta2_pow.template data<T>(),
auto grad = framework::EigenVector<T>::Flatten( mom1.template data<T>(),
*ctx.Input<framework::Tensor>("Grad")); mom1_out.template mutable_data<T>(ctx.GetPlace()),
auto moment1 = framework::EigenVector<T>::Flatten( mom2.template data<T>(),
*ctx.Input<framework::Tensor>("Moment1")); mom2_out.template mutable_data<T>(ctx.GetPlace()),
auto moment2 = framework::EigenVector<T>::Flatten( lr.template data<T>(), grad.template data<T>(),
*ctx.Input<framework::Tensor>("Moment2")); param.template data<T>(),
auto lr = framework::EigenVector<T>::Flatten( param_out.template mutable_data<T>(ctx.GetPlace()));
*ctx.Input<framework::Tensor>("LearningRate")); platform::ForRange<DeviceContext> for_range(
auto beta1_pow = framework::EigenVector<T>::Flatten( static_cast<const DeviceContext&>(ctx.device_context()), param.numel());
*ctx.Input<framework::Tensor>("Beta1Pow")); for_range(functor);
auto beta2_pow = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Beta2Pow"));
auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
auto moment1_out = framework::EigenVector<T>::Flatten(*moment1_out_tensor);
auto moment2_out = framework::EigenVector<T>::Flatten(*moment2_out_tensor);
auto* place = ctx.template device_context<DeviceContext>().eigen_device();
moment1_out.device(*place) = beta1 * moment1 + (1 - beta1) * grad;
moment2_out.device(*place) = beta2 * moment2 + (1 - beta2) * grad.square();
// All of these are tensors of 1 element
auto lr_t = lr * (1 - beta2_pow).sqrt() / (1 - beta1_pow);
// Eigen does not support automatic broadcast
// Get dimensions of moment vector to broadcast lr_t
Eigen::DSizes<int, 1> m_dsize(moment1_out_tensor->numel());
param_out.device(*place) =
param -
lr_t.broadcast(m_dsize) *
(moment1_out / (moment2_out.sqrt() + epsilon));
} }
}; };
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/platform/device_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -27,11 +28,16 @@ class ArrayOp : public framework::OperatorBase { ...@@ -27,11 +28,16 @@ class ArrayOp : public framework::OperatorBase {
protected: protected:
size_t GetOffset(const framework::Scope &scope, size_t GetOffset(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const { const platform::Place &place) const {
auto *i = scope.FindVar(Input("I")); auto *i = scope.FindVar(Input("I"));
PADDLE_ENFORCE(i != nullptr, "I must be set"); PADDLE_ENFORCE(i != nullptr, "I must be set");
auto &i_tensor = i->Get<framework::LoDTensor>(); auto &i_tensor = i->Get<framework::LoDTensor>();
PADDLE_ENFORCE_EQ(i_tensor.numel(), 1); PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
size_t offset; size_t offset;
if (platform::is_gpu_place(i_tensor.place())) { if (platform::is_gpu_place(i_tensor.place())) {
// FIXME: Avoid copy from GPU to CPU // FIXME: Avoid copy from GPU to CPU
......
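The idiom introduced here (and repeated in the ops below) is that `Run` receives only a `Place` and borrows the matching `DeviceContext` from the global pool when it actually needs one:

```cpp
// Borrow the per-place DeviceContext lazily instead of threading it through Run.
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
// ... use dev_ctx, e.g. framework::CopyFrom(src, place, dev_ctx, &dst) ...
```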
...@@ -12,10 +12,12 @@ ...@@ -12,10 +12,12 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <numeric> #include <numeric>
#include "paddle/framework/lod_rank_table.h" #include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/memory/memcpy.h" #include "paddle/memory/memcpy.h"
#include "paddle/platform/device_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -30,7 +32,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { ...@@ -30,7 +32,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &dev_place) const override {
auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>(); auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
auto &rank_table = auto &rank_table =
scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>(); scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
...@@ -103,6 +105,10 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { ...@@ -103,6 +105,10 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
continue; continue;
} }
auto slice = out->Slice(out_offset, out_offset + len); auto slice = out->Slice(out_offset, out_offset + len);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place,
dev_ctx, &slice); dev_ctx, &slice);
out_offset += len; out_offset += len;
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/framework/data_type.h" #include "paddle/framework/data_type.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/framework/var_type.h" #include "paddle/framework/var_type.h"
#include "paddle/platform/device_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -71,7 +72,7 @@ class AssignOp : public framework::OperatorBase { ...@@ -71,7 +72,7 @@ class AssignOp : public framework::OperatorBase {
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &place) const override {
auto *x = scope.FindVar(Input("X")); auto *x = scope.FindVar(Input("X"));
if (x == nullptr) { if (x == nullptr) {
return; return;
...@@ -80,6 +81,10 @@ class AssignOp : public framework::OperatorBase { ...@@ -80,6 +81,10 @@ class AssignOp : public framework::OperatorBase {
PADDLE_ENFORCE( PADDLE_ENFORCE(
out != nullptr, out != nullptr,
"The Output(Out) should not be null if the Input(X) is set."); "The Output(Out) should not be null if the Input(X) is set.");
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
} }
}; };
......
...@@ -39,7 +39,7 @@ class AucOp : public framework::OperatorWithKernel { ...@@ -39,7 +39,7 @@ class AucOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Out")->type()), framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
......
...@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and ...@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/operators/batch_norm_op.h" #include "paddle/operators/batch_norm_op.h"
#include "paddle/framework/data_layout.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename T> template <typename T>
using EigenArrayMap = using EigenArrayMap =
...@@ -60,15 +62,15 @@ class BatchNormOp : public framework::OperatorWithKernel { ...@@ -60,15 +62,15 @@ class BatchNormOp : public framework::OperatorWithKernel {
"Variance and VarianceOut should share the same memory"); "Variance and VarianceOut should share the same memory");
const auto x_dims = ctx->GetInputDim("X"); const auto x_dims = ctx->GetInputDim("X");
const TensorFormat tensor_format = const DataLayout data_layout = framework::StringToDataLayout(
StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format")); ctx->Attrs().Get<std::string>("data_layout"));
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"Input X must have 2 to 5 dimensions."); "Input X must have 2 to 5 dimensions.");
const int C = const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1] (data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]); : x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
...@@ -90,7 +92,7 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -90,7 +92,7 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<bool>("is_test", "").SetDefault(false); AddAttr<bool>("is_test", "").SetDefault(false);
AddAttr<float>("momentum", "").SetDefault(0.9); AddAttr<float>("momentum", "").SetDefault(0.9);
AddAttr<float>("epsilon", "").SetDefault(1e-5); AddAttr<float>("epsilon", "").SetDefault(1e-5);
AddAttr<std::string>("tensor_format", "").SetDefault("NCHW"); AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
AddInput("X", "The input tensor"); AddInput("X", "The input tensor");
AddInput("Scale", AddInput("Scale",
"Scale is a 1-dimensional tensor of size C " "Scale is a 1-dimensional tensor of size C "
...@@ -141,9 +143,9 @@ class BatchNormKernel<platform::CPUDeviceContext, T> ...@@ -141,9 +143,9 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
const float epsilon = ctx.Attr<float>("epsilon"); const float epsilon = ctx.Attr<float>("epsilon");
const float momentum = ctx.Attr<float>("momentum"); const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test"); const bool is_test = ctx.Attr<bool>("is_test");
const std::string tensor_format_str = const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
ctx.Attr<std::string>("tensor_format"); const DataLayout data_layout =
const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str); framework::StringToDataLayout(data_layout_str);
const auto *x = ctx.Input<Tensor>("X"); const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims(); const auto &x_dims = x->dims();
...@@ -151,8 +153,8 @@ class BatchNormKernel<platform::CPUDeviceContext, T> ...@@ -151,8 +153,8 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
"The Input dim size should be between 2 and 5"); "The Input dim size should be between 2 and 5");
const int N = x_dims[0]; const int N = x_dims[0];
const int C = const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1] (data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]); : x_dims[x_dims.size() - 1]);
const int sample_size = x->numel() / N / C; const int sample_size = x->numel() / N / C;
auto *y = ctx.Output<Tensor>("Y"); auto *y = ctx.Output<Tensor>("Y");
...@@ -177,8 +179,8 @@ class BatchNormKernel<platform::CPUDeviceContext, T> ...@@ -177,8 +179,8 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
saved_mean_e.setZero(); saved_mean_e.setZero();
saved_variance_e.setZero(); saved_variance_e.setZero();
switch (tensor_format) { switch (data_layout) {
case TensorFormat::NCHW: { case DataLayout::kNCHW: {
ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C); ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) { for (int nc = 0; nc < N * C; ++nc) {
saved_mean_e(nc % C) += x_arr.col(nc).sum(); saved_mean_e(nc % C) += x_arr.col(nc).sum();
...@@ -191,7 +193,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T> ...@@ -191,7 +193,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
saved_variance_e /= N * sample_size; saved_variance_e /= N * sample_size;
break; break;
} }
case TensorFormat::NHWC: { case DataLayout::kNHWC: {
ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N * sample_size); ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N * sample_size);
for (int i = 0; i < N * sample_size; ++i) { for (int i = 0; i < N * sample_size; ++i) {
saved_mean_e += x_arr.col(i); saved_mean_e += x_arr.col(i);
...@@ -205,7 +207,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T> ...@@ -205,7 +207,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
break; break;
} }
default: default:
PADDLE_THROW("Unknown storage order: %s", tensor_format_str); PADDLE_THROW("Unknown storage order: %s", data_layout_str);
} }
EigenVectorArrayMap<T> running_mean_arr( EigenVectorArrayMap<T> running_mean_arr(
...@@ -247,8 +249,8 @@ class BatchNormKernel<platform::CPUDeviceContext, T> ...@@ -247,8 +249,8 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
Eigen::Array<T, Eigen::Dynamic, 1> new_bias = Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
bias_arr - mean_arr * inv_std * scale_arr; bias_arr - mean_arr * inv_std * scale_arr;
switch (tensor_format) { switch (data_layout) {
case TensorFormat::NCHW: { case DataLayout::kNCHW: {
EigenArrayMap<T> y_arr(y->mutable_data<T>(ctx.GetPlace()), sample_size, EigenArrayMap<T> y_arr(y->mutable_data<T>(ctx.GetPlace()), sample_size,
N * C); N * C);
ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C); ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
...@@ -257,7 +259,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T> ...@@ -257,7 +259,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
} }
break; break;
} }
case TensorFormat::NHWC: { case DataLayout::kNHWC: {
EigenArrayMap<T>(y->mutable_data<T>(ctx.GetPlace()), C, EigenArrayMap<T>(y->mutable_data<T>(ctx.GetPlace()), C,
N * sample_size) = N * sample_size) =
(ConstEigenArrayMap<T>(x->data<T>(), C, N * sample_size).colwise() * (ConstEigenArrayMap<T>(x->data<T>(), C, N * sample_size).colwise() *
...@@ -267,7 +269,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T> ...@@ -267,7 +269,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
break; break;
} }
default: default:
PADDLE_THROW("Unknown storage order: %d", tensor_format); PADDLE_THROW("Unknown storage order: %d", data_layout);
} }
} }
}; };
...@@ -290,11 +292,11 @@ class BatchNormGradOp : public framework::OperatorWithKernel { ...@@ -290,11 +292,11 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), ""); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), "");
const auto x_dims = ctx->GetInputDim("X"); const auto x_dims = ctx->GetInputDim("X");
const TensorFormat tensor_format = const DataLayout data_layout = framework::StringToDataLayout(
StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format")); ctx->Attrs().Get<std::string>("data_layout"));
const int C = const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1] (data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]); : x_dims[x_dims.size() - 1]);
ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
...@@ -302,7 +304,7 @@ class BatchNormGradOp : public framework::OperatorWithKernel { ...@@ -302,7 +304,7 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
const auto *var = ctx.InputVar(framework::GradVarName("Y")); const auto *var = ctx.InputVar(framework::GradVarName("Y"));
if (var == nullptr) { if (var == nullptr) {
...@@ -333,9 +335,9 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T> ...@@ -333,9 +335,9 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
const auto *saved_mean = ctx.Input<Tensor>("SavedMean"); const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
// SavedVariance have been reverted in forward operator // SavedVariance have been reverted in forward operator
const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance"); const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");
const std::string tensor_format_str = const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
ctx.Attr<std::string>("tensor_format"); const DataLayout data_layout =
const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str); framework::StringToDataLayout(data_layout_str);
// Get the size for each dimension. // Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width] // NCHW [batch_size, in_channels, in_height, in_width]
...@@ -344,8 +346,8 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T> ...@@ -344,8 +346,8 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
"The Input dim size should be between 2 and 5"); "The Input dim size should be between 2 and 5");
const int N = x_dims[0]; const int N = x_dims[0];
const int C = const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1] (data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]); : x_dims[x_dims.size() - 1]);
const int sample_size = x->numel() / N / C; const int sample_size = x->numel() / N / C;
ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C); ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
...@@ -376,8 +378,8 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T> ...@@ -376,8 +378,8 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size); const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size);
switch (tensor_format) { switch (data_layout) {
case TensorFormat::NCHW: { case DataLayout::kNCHW: {
ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C); ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), sample_size, N * C); ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), sample_size, N * C);
EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()), EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()),
...@@ -400,7 +402,7 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T> ...@@ -400,7 +402,7 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
} }
break; break;
} }
case TensorFormat::NHWC: { case DataLayout::kNHWC: {
ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N * sample_size); ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N * sample_size);
ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), C, N * sample_size); ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), C, N * sample_size);
EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()), C, EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()), C,
...@@ -425,7 +427,7 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T> ...@@ -425,7 +427,7 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
break; break;
} }
default: default:
PADDLE_THROW("Unknown storage order: %s", tensor_format_str); PADDLE_THROW("Unknown storage order: %s", data_layout_str);
} }
} }
}; };
......
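
The batch_norm_op.cc hunks above replace the file-local TensorFormat enum with framework::DataLayout, rename the attribute to "data_layout", and repeatedly pick the channel axis from the parsed layout. A minimal sketch of that shared lookup, assuming paddle/framework/data_layout.h and ddim.h; the helper name ChannelDim is hypothetical, the calls inside it are the ones used in the hunks.

    #include "paddle/framework/data_layout.h"
    #include "paddle/framework/ddim.h"

    using DataLayout = paddle::framework::DataLayout;

    // Sketch: parse the "data_layout" attribute string and return the channel
    // dimension, mirroring the logic the rewritten kernels above repeat.
    inline int ChannelDim(const std::string &data_layout_str,
                          const paddle::framework::DDim &dims) {
      const DataLayout layout =
          paddle::framework::StringToDataLayout(data_layout_str);
      // NCHW keeps channels at dims[1]; NHWC keeps them in the last dimension.
      return layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
    }
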
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/operators/batch_norm_op.h" #include "paddle/operators/batch_norm_op.h"
#include "paddle/framework/data_layout.h"
#include <cfloat> #include <cfloat>
#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/math_function.h"
...@@ -22,12 +23,12 @@ namespace paddle { ...@@ -22,12 +23,12 @@ namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using DataLayout = framework::DataLayout;
template <typename T> template <typename T>
using CudnnDataType = platform::CudnnDataType<T>; using CudnnDataType = platform::CudnnDataType<T>;
void ExtractNCWHD(const framework::DDim &dims, void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout,
const TensorFormat &tensor_format, int *N, int *C, int *H, int *N, int *C, int *H, int *W, int *D) {
int *W, int *D) {
*N = dims[0]; *N = dims[0];
if (dims.size() == 2) { if (dims.size() == 2) {
*C = dims[1]; *C = dims[1];
...@@ -35,13 +36,13 @@ void ExtractNCWHD(const framework::DDim &dims, ...@@ -35,13 +36,13 @@ void ExtractNCWHD(const framework::DDim &dims,
*W = 1; *W = 1;
*D = 1; *D = 1;
} else { } else {
*C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1]; *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
*H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1]; *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
*W = dims.size() > 3 *W = dims.size() > 3
? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2]) ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2])
: 1; : 1;
*D = dims.size() > 4 *D = dims.size() > 4
? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3]) ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3])
: 1; : 1;
} }
} }
...@@ -52,13 +53,13 @@ class BatchNormKernel<platform::CUDADeviceContext, T> ...@@ -52,13 +53,13 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace."); "It must use CUDAPlace.");
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon")); double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const float momentum = ctx.Attr<float>("momentum"); const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test"); const bool is_test = ctx.Attr<bool>("is_test");
const std::string tensor_format_str = const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
ctx.Attr<std::string>("tensor_format"); const DataLayout data_layout =
const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str); framework::StringToDataLayout(data_layout_str);
// Get the size for each dimension. // Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width] // NCHW [batch_size, in_channels, in_height, in_width]
...@@ -67,7 +68,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T> ...@@ -67,7 +68,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 2 and 5"); "The Input dim size should be between 2 and 5");
int N, C, H, W, D; int N, C, H, W, D;
ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D); ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
// ------------------- cudnn descriptors --------------------- // ------------------- cudnn descriptors ---------------------
cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t data_desc_;
...@@ -93,7 +94,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T> ...@@ -93,7 +94,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
VLOG(1) << "Setting descriptors."; VLOG(1) << "Setting descriptors.";
std::vector<int> dims; std::vector<int> dims;
std::vector<int> strides; std::vector<int> strides;
if (tensor_format == TensorFormat::NCHW) { if (data_layout == DataLayout::kNCHW) {
dims = {N, C, H, W, D}; dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1}; strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else { } else {
...@@ -178,11 +179,11 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T> ...@@ -178,11 +179,11 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace."); "It must use CUDAPlace.");
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon")); double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const std::string tensor_format_str = const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
ctx.Attr<std::string>("tensor_format"); const DataLayout data_layout =
const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str); framework::StringToDataLayout(data_layout_str);
const auto *x = ctx.Input<Tensor>("X"); const auto *x = ctx.Input<Tensor>("X");
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y")); const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<Tensor>("Scale"); const auto *scale = ctx.Input<Tensor>("Scale");
...@@ -192,7 +193,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T> ...@@ -192,7 +193,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 2 and 5"); "The Input dim size should be between 2 and 5");
int N, C, H, W, D; int N, C, H, W, D;
ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D); ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
PADDLE_ENFORCE_EQ(scale->dims()[0], C); PADDLE_ENFORCE_EQ(scale->dims()[0], C);
...@@ -219,7 +220,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T> ...@@ -219,7 +220,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
std::vector<int> dims; std::vector<int> dims;
std::vector<int> strides; std::vector<int> strides;
if (tensor_format == TensorFormat::NCHW) { if (data_layout == DataLayout::kNCHW) {
dims = {N, C, H, W, D}; dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1}; strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else { } else {
......
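
In the CUDA batch_norm kernel above, ExtractNCWHD now takes a DataLayout, and its outputs feed the cuDNN tensor descriptor. A short sketch of that call for the NCHW case; x_dims stands for the input dims from the surrounding kernel, and the example shape in the comment is made up.

    // Sketch: extract N, C, H, W, D for an NCHW input and derive the strides
    // handed to the cuDNN descriptor, as BatchNormKernel above does.
    int N, C, H, W, D;
    ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
    // e.g. x_dims = [8, 3, 224, 224]  ->  N=8, C=3, H=224, W=224, D=1
    std::vector<int> dims = {N, C, H, W, D};
    std::vector<int> strides = {C * H * W * D, H * W * D, W * D, D, 1};
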
...@@ -19,21 +19,6 @@ limitations under the License. */ ...@@ -19,21 +19,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
enum TensorFormat {
NHWC = 0,
NCHW = 1,
};
inline TensorFormat StringToTensorFormat(const std::string& str) {
if (str == "NHWC" || str == "nhwc") {
return TensorFormat::NHWC;
} else if (str == "NCHW" || str == "nchw") {
return TensorFormat::NCHW;
} else {
PADDLE_THROW("Unknown storage order string: %s", str);
}
}
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class BatchNormKernel : public framework::OpKernel<T> { class BatchNormKernel : public framework::OpKernel<T> {
public: public:
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/operators/beam_search_decode_op.h" #include "paddle/operators/beam_search_decode_op.h"
#include "paddle/platform/device_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -55,7 +56,10 @@ class BeamSearchDecodeOp : public framework::OperatorBase { ...@@ -55,7 +56,10 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
const framework::AttributeMap& attrs) const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope& scope, void Run(const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) const override { const platform::Place& dev_place) const override {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
auto& dev_ctx = *pool.Borrow(dev_place);
framework::ExecutionContext ctx(*this, scope, dev_ctx); framework::ExecutionContext ctx(*this, scope, dev_ctx);
const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids"); const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
......
...@@ -189,7 +189,7 @@ class BeamSearchOp : public framework::OperatorBase { ...@@ -189,7 +189,7 @@ class BeamSearchOp : public framework::OperatorBase {
} }
void Run(const framework::Scope& scope, void Run(const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) const override { const platform::Place& dev_place) const override {
LOG(INFO) << "run beam search op"; LOG(INFO) << "run beam search op";
auto ids_var = scope.FindVar(Input("ids")); auto ids_var = scope.FindVar(Input("ids"));
auto scores_var = scope.FindVar(Input("scores")); auto scores_var = scope.FindVar(Input("scores"));
......
...@@ -55,7 +55,7 @@ class ChunkEvalOp : public framework::OperatorWithKernel { ...@@ -55,7 +55,7 @@ class ChunkEvalOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(framework::proto::DataType::FP32, return framework::OpKernelType(framework::proto::DataType::FP32,
ctx.device_context()); ctx.device_context());
......
...@@ -66,9 +66,9 @@ class CompareOp : public framework::OperatorWithKernel { ...@@ -66,9 +66,9 @@ class CompareOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
framework::OpKernelType kt = OperatorWithKernel::GetKernelType(ctx); framework::OpKernelType kt = OperatorWithKernel::GetActualKernelType(ctx);
// CompareOp kernel's device type is decided by input tensor place // CompareOp kernel's device type is decided by input tensor place
kt.place_ = ctx.Input<framework::LoDTensor>("X")->place(); kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
return kt; return kt;
......
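
CompareOp above first asks the base class for the actual kernel type and then overrides its place with the place of input "X". A minimal sketch of that two-step pattern, using only the calls visible in the hunk; it is meant as a reading aid, not a new API.

    // Sketch: reuse the base class's data-type deduction, then force the
    // kernel to run where the input tensor actually lives.
    framework::OpKernelType GetActualKernelType(
        const framework::ExecutionContext &ctx) const override {
      framework::OpKernelType kt = OperatorWithKernel::GetActualKernelType(ctx);
      // Kernel place follows the input tensor rather than the executor's place.
      kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
      return kt;
    }
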
...@@ -98,8 +98,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel { ...@@ -98,8 +98,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad, REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
ops::ConcatOpGrad) ops::ConcatOpGrad, false)
REGISTER_OP_CPU_KERNEL(concat, REGISTER_OP_CPU_KERNEL(concat,
ops::ConcatKernel<paddle::platform::CPUPlace, float>) ops::ConcatKernel<paddle::platform::CPUPlace, float>)
REGISTER_OP_CPU_KERNEL(concat_grad, REGISTER_OP_CPU_KERNEL(concat_grad,
......
...@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/operators/cond_op.h" #include "paddle/operators/cond_op.h"
#include "paddle/operators/gather.h" #include "paddle/operators/gather.h"
#include "paddle/operators/scatter.h" #include "paddle/operators/scatter.h"
#include "paddle/platform/device_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -193,12 +193,15 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope, ...@@ -193,12 +193,15 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
} }
} }
void CondOp::Run(const Scope& scope, void CondOp::Run(const Scope& scope, const platform::Place& place) const {
const platform::DeviceContext& dev_ctx) const { // get device context from pool
platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
auto& dev_ctx = *pool.Borrow(place);
PrepareDataForSubnet(scope, dev_ctx); PrepareDataForSubnet(scope, dev_ctx);
std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope); std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
for (int i = 0; i < BRANCH_NUM; ++i) { for (int i = 0; i < BRANCH_NUM; ++i) {
sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx); sub_net_op_[i]->Run(*sub_scopes[i], place);
} }
MergeDataFromSubnet(scope, dev_ctx); MergeDataFromSubnet(scope, dev_ctx);
} }
......
...@@ -78,7 +78,7 @@ class CondOp : public framework::OperatorBase { ...@@ -78,7 +78,7 @@ class CondOp : public framework::OperatorBase {
} }
void Run(const framework::Scope& scope, void Run(const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) const override; const platform::Place& place) const override;
private: private:
const int TRUE_BRANCH = 0; const int TRUE_BRANCH = 0;
......
...@@ -51,7 +51,7 @@ class ConditionalBlockOp : public ConditionalOp { ...@@ -51,7 +51,7 @@ class ConditionalBlockOp : public ConditionalOp {
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: ConditionalOp(type, inputs, outputs, attrs) {} : ConditionalOp(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &dev_place) const override {
auto xs = InputTensors(scope); auto xs = InputTensors(scope);
bool need_run = std::all_of( bool need_run = std::all_of(
xs.begin(), xs.end(), xs.begin(), xs.end(),
...@@ -65,8 +65,8 @@ class ConditionalBlockOp : public ConditionalOp { ...@@ -65,8 +65,8 @@ class ConditionalBlockOp : public ConditionalOp {
scopes->front() = &scope.NewScope(); scopes->front() = &scope.NewScope();
auto &cur_scope = *scopes->front(); auto &cur_scope = *scopes->front();
framework::Executor exec(dev_place);
auto *block = Attr<framework::BlockDesc *>("sub_block"); auto *block = Attr<framework::BlockDesc *>("sub_block");
framework::Executor exec(dev_ctx);
exec.Run(*block->Program(), &cur_scope, block->ID(), false); exec.Run(*block->Program(), &cur_scope, block->ID(), false);
} }
} }
...@@ -104,7 +104,7 @@ class ConditionalBlockGradOp : public ConditionalOp { ...@@ -104,7 +104,7 @@ class ConditionalBlockGradOp : public ConditionalOp {
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: ConditionalOp(type, inputs, outputs, attrs) {} : ConditionalOp(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &dev_place) const override {
auto xs = this->InputTensors(scope); auto xs = this->InputTensors(scope);
bool need_run = std::all_of( bool need_run = std::all_of(
xs.begin(), xs.end(), xs.begin(), xs.end(),
...@@ -116,21 +116,21 @@ class ConditionalBlockGradOp : public ConditionalOp { ...@@ -116,21 +116,21 @@ class ConditionalBlockGradOp : public ConditionalOp {
auto &scopes = scope_var->Get<std::vector<framework::Scope *>>(); auto &scopes = scope_var->Get<std::vector<framework::Scope *>>();
framework::Scope &cur_scope = *scopes[0]; framework::Scope &cur_scope = *scopes[0];
framework::Executor exec(dev_place);
auto *block = Attr<framework::BlockDesc *>("sub_block"); auto *block = Attr<framework::BlockDesc *>("sub_block");
framework::Executor exec(dev_ctx);
exec.Run(*block->Program(), &cur_scope, block->ID(), false); exec.Run(*block->Program(), &cur_scope, block->ID(), false);
AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("Params"), AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"),
Outputs(framework::GradVarName("Params"))); Outputs(framework::GradVarName("Params")));
AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("X"), AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"),
Outputs(framework::GradVarName("X"))); Outputs(framework::GradVarName("X")));
} }
} }
private: private:
void AssignLocalGradientToGlobal( void AssignLocalGradientToGlobal(
const platform::DeviceContext &dev_ctx, const framework::Scope &cur_scope, const platform::Place &place, const framework::Scope &cur_scope,
const std::vector<std::string> &p_names, const std::vector<std::string> &p_names,
const std::vector<std::string> &pg_names) const { const std::vector<std::string> &pg_names) const {
for (size_t i = 0; i < p_names.size(); ++i) { for (size_t i = 0; i < p_names.size(); ++i) {
...@@ -144,7 +144,7 @@ class ConditionalBlockGradOp : public ConditionalOp { ...@@ -144,7 +144,7 @@ class ConditionalBlockGradOp : public ConditionalOp {
auto assign = framework::OpRegistry::CreateOp( auto assign = framework::OpRegistry::CreateOp(
"assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}}, "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}},
framework::AttributeMap{}); framework::AttributeMap{});
assign->Run(cur_scope, dev_ctx); assign->Run(cur_scope, place);
cur_scope.Rename(new_in_grad_name, in_grad_name); cur_scope.Rename(new_in_grad_name, in_grad_name);
} }
} }
...@@ -178,8 +178,9 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker { ...@@ -178,8 +178,9 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
grad_op->SetInput("Out", Output("Out")); grad_op->SetInput("Out", Output("Out"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetInput("Scope", Output("Scope")); grad_op->SetInput("Scope", Output("Scope"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
grad_op->SetOutput(framework::GradVarName("Params"), InputGrad("Params")); grad_op->SetOutput(framework::GradVarName("Params"),
InputGrad("Params", false));
grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]); grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]);
return std::unique_ptr<framework::OpDesc>(grad_op); return std::unique_ptr<framework::OpDesc>(grad_op);
} }
......
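
ConditionalBlockGradOp above now builds a throw-away "assign" op through OpRegistry::CreateOp and runs it with a Place instead of a DeviceContext. A hedged sketch of that helper call; the variable names "local_grad@tmp" and "global_grad" are made up, while the CreateOp and Run signatures are taken from the hunk, and cur_scope and place come from the surrounding method.

    // Sketch: copy a local gradient into a global one by instantiating the
    // existing "assign" operator and running it on the target place.
    auto assign = framework::OpRegistry::CreateOp(
        "assign", {{"X", {"local_grad@tmp"}}}, {{"Out", {"global_grad"}}},
        framework::AttributeMap{});
    assign->Run(cur_scope, place);  // a Place, not a DeviceContext, after this change
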
...@@ -36,7 +36,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> { ...@@ -36,7 +36,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace."); "It must use CUDAPlace.");
auto* input = ctx.Input<Tensor>("Input"); auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter"); auto* filter = ctx.Input<Tensor>("Filter");
auto* output = ctx.Output<Tensor>("Output"); auto* output = ctx.Output<Tensor>("Output");
...@@ -130,7 +130,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> { ...@@ -130,7 +130,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, algo, &workspace_size_in_bytes)); cudnn_output_desc, algo, &workspace_size_in_bytes));
// Allocate on GPU memory // Allocate on GPU memory
platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace()); platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv forward --------------------- // ------------------- cudnn conv forward ---------------------
T alpha = 1.0f, beta = 0.0f; T alpha = 1.0f, beta = 0.0f;
...@@ -151,7 +151,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> { ...@@ -151,7 +151,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace."); "It must use CUDAPlace.");
auto input = ctx.Input<Tensor>("Input"); auto input = ctx.Input<Tensor>("Input");
auto filter = ctx.Input<Tensor>("Filter"); auto filter = ctx.Input<Tensor>("Filter");
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output")); auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
...@@ -277,7 +277,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> { ...@@ -277,7 +277,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv workspace --------------------- // ------------------- cudnn conv workspace ---------------------
// Already on GPU // Already on GPU
void* cudnn_workspace = nullptr; void* cudnn_workspace = nullptr;
platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace()); platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv backward data --------------------- // ------------------- cudnn conv backward data ---------------------
T alpha = 1.0f, beta = 0.0f; T alpha = 1.0f, beta = 0.0f;
......
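
The cuDNN conv kernels above swap GPUPlace for CUDAPlace when allocating the scratch workspace. A small sketch of that allocation; workspace_size_in_bytes and ctx come from the surrounding kernel, and the matching paddle::memory::Free call is assumed from the same memory API rather than shown in this hunk.

    // Sketch: allocate the cuDNN scratch buffer on the op's CUDA device.
    void *cudnn_workspace = nullptr;
    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);

    // ... run the cuDNN convolution call with the workspace ...

    paddle::memory::Free(gpu, cudnn_workspace);  // release once the call returns
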
...@@ -35,7 +35,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -35,7 +35,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace."); "It must use CUDAPlace.");
auto* input = ctx.Input<Tensor>("Input"); auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter"); auto* filter = ctx.Input<Tensor>("Filter");
auto* output = ctx.Output<Tensor>("Output"); auto* output = ctx.Output<Tensor>("Output");
...@@ -100,7 +100,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -100,7 +100,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
cudnn_output_desc, algo, &workspace_size_in_bytes)); cudnn_output_desc, algo, &workspace_size_in_bytes));
// Allocate on GPU memory // Allocate on GPU memory
platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace()); platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv transpose forward --------------------- // ------------------- cudnn conv transpose forward ---------------------
...@@ -120,7 +120,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -120,7 +120,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace."); "It must use CUDAPlace.");
auto input = ctx.Input<Tensor>("Input"); auto input = ctx.Input<Tensor>("Input");
auto filter = ctx.Input<Tensor>("Filter"); auto filter = ctx.Input<Tensor>("Filter");
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output")); auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
...@@ -201,7 +201,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -201,7 +201,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv workspace --------------------- // ------------------- cudnn conv workspace ---------------------
// Already on GPU // Already on GPU
void* cudnn_workspace = nullptr; void* cudnn_workspace = nullptr;
platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace()); platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv backward data --------------------- // ------------------- cudnn conv backward data ---------------------
// FIXME(typhoonzero): template type T may not be the same as cudnn call. // FIXME(typhoonzero): template type T may not be the same as cudnn call.
......
...@@ -120,12 +120,18 @@ class CRFDecodingOp : public framework::OperatorWithKernel { ...@@ -120,12 +120,18 @@ class CRFDecodingOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()), framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
ctx.device_context()); ctx.device_context());
} }
framework::OpKernelType GetExpectedKernelType(
const framework::OpKernelType& actual_kernel_type) const override {
return framework::OpKernelType(actual_kernel_type.data_type_,
platform::CPUPlace());
}
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
......
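
CRFDecodingOp above gains a GetExpectedKernelType override that keeps the data type chosen by GetActualKernelType but pins the place to CPU, since the decoding kernel only has a CPU implementation. A minimal sketch of that override inside an OperatorWithKernel subclass, using the OpKernelType fields shown in the hunk.

    // Sketch: keep the dtype picked from the input, but always run on CPU.
    framework::OpKernelType GetExpectedKernelType(
        const framework::OpKernelType &actual_kernel_type) const override {
      return framework::OpKernelType(actual_kernel_type.data_type_,
                                     platform::CPUPlace());
    }
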
...@@ -51,7 +51,7 @@ class CrossEntropyOp : public framework::OperatorWithKernel { ...@@ -51,7 +51,7 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
protected: protected:
// Explicitly set that the data type of computation kernel of cross_entropy // Explicitly set that the data type of computation kernel of cross_entropy
// is determined by its input "X". // is determined by its input "X".
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), framework::ToDataType(ctx.Input<Tensor>("X")->type()),
...@@ -101,7 +101,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { ...@@ -101,7 +101,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
protected: protected:
// Explicitly set that the data type of computation kernel of cross_entropy // Explicitly set that the data type of computation kernel of cross_entropy
// is determined by its input "X". // is determined by its input "X".
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), framework::ToDataType(ctx.Input<Tensor>("X")->type()),
......
...@@ -20,25 +20,57 @@ namespace detail { ...@@ -20,25 +20,57 @@ namespace detail {
Status SendRecvServerImpl::SendVariable(ServerContext *context, Status SendRecvServerImpl::SendVariable(ServerContext *context,
const VariableMessage *in_var, const VariableMessage *in_var,
VariableMessage *out_var) { VoidMessage *out_var) {
framework::LoDTensor t; // TODO(typhoonzero): support different variable types.
// TODO(typhoonzero): desirealize in_tensor and run pserver network.
std::istringstream iss(in_var->serialized()); std::istringstream iss(in_var->serialized());
framework::LoDTensor t;
framework::DeserializeFromStream(iss, &t); framework::DeserializeFromStream(iss, &t);
lodtensor_queue_.Push(std::move(t)); TensorWithName tensor_with_name =
// Block util the sub graph is done. std::make_pair(in_var->varname(), std::move(t));
t = lodtensor_return_queue_.Pop();
var_recv_queue_.Push(std::move(tensor_with_name));
return Status::OK;
}
Status SendRecvServerImpl::GetVariable(ServerContext *context,
const VariableMessage *in_var,
VariableMessage *out_var) {
std::string get_var_name = in_var->varname();
auto *var = scope_->FindVar(get_var_name);
auto tensor = var->Get<framework::LoDTensor>();
std::ostringstream oss; std::ostringstream oss;
// FIXME(typhoonzero): get context from op. framework::SerializeToStream(oss, tensor, platform::CPUDeviceContext());
framework::SerializeToStream(oss, t, platform::CPUDeviceContext());
std::string *varname = out_var->mutable_varname(); std::string *varname = out_var->mutable_varname();
*varname = in_var->varname(); *varname = get_var_name;
std::string *serialized = out_var->mutable_serialized(); std::string *serialized = out_var->mutable_serialized();
*serialized = oss.str(); *serialized = oss.str();
return Status::OK;
}
Status SendRecvServerImpl::Wait(ServerContext *context,
const VoidMessage *in_var,
VoidMessage *out_var) {
{
std::unique_lock<std::mutex> lock(this->mutex_);
condition_.wait(lock, [=] { return this->done_ == true; });
}
return Status::OK; return Status::OK;
} }
void SendRecvServerImpl::Reset() {
std::lock_guard<std::mutex> lock(this->mutex_);
done_ = false;
}
void SendRecvServerImpl::Done() {
{
std::lock_guard<std::mutex> lock(this->mutex_);
done_ = true;
}
condition_.notify_all();
}
} // namespace detail } // namespace detail
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
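
The server implementation above adds Reset, Done, and Wait built on a mutex, a bool, and a condition variable, so a Wait RPC blocks until one execution of the program finishes. A standalone sketch of that synchronization pattern using only the standard library; the class below is illustrative, not the Paddle type.

    #include <condition_variable>
    #include <mutex>

    // Sketch: the Reset/Done/Wait handshake used by SendRecvServerImpl above.
    class RunBarrier {
     public:
      void Reset() {                  // called before the sub program runs
        std::lock_guard<std::mutex> lock(mutex_);
        done_ = false;
      }
      void Done() {                   // called once the run has finished
        {
          std::lock_guard<std::mutex> lock(mutex_);
          done_ = true;
        }
        condition_.notify_all();      // wake every blocked Wait() caller
      }
      void Wait() {                   // the RPC handler blocks here
        std::unique_lock<std::mutex> lock(mutex_);
        condition_.wait(lock, [this] { return done_; });
      }

     private:
      std::mutex mutex_;
      std::condition_variable condition_;
      bool done_ = false;
    };
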
...@@ -19,10 +19,10 @@ namespace operators { ...@@ -19,10 +19,10 @@ namespace operators {
namespace detail { namespace detail {
bool RPCClient::SendVariable(const framework::Scope& scope, bool RPCClient::SendVariable(const framework::Scope& scope,
const std::string& inname, const std::string& inname) {
const std::string& outname) {
ClientContext context; ClientContext context;
VariableMessage msg, out_msg; VariableMessage msg;
VoidMessage out_msg;
// FIXME(typhoonzero): pass device context to here. // FIXME(typhoonzero): pass device context to here.
auto ctx = platform::CPUDeviceContext(); auto ctx = platform::CPUDeviceContext();
auto* var = scope.FindVar(inname); auto* var = scope.FindVar(inname);
...@@ -37,9 +37,26 @@ bool RPCClient::SendVariable(const framework::Scope& scope, ...@@ -37,9 +37,26 @@ bool RPCClient::SendVariable(const framework::Scope& scope,
msg.set_serialized(oss.str()); msg.set_serialized(oss.str());
Status status = stub_->SendVariable(&context, msg, &out_msg); Status status = stub_->SendVariable(&context, msg, &out_msg);
if (!status.ok()) { if (!status.ok()) {
LOG(ERROR) << "gRPC error: " << status.error_message();
return false; return false;
} }
std::istringstream iss(out_msg.serialized()); return true;
}
bool RPCClient::GetVariable(const framework::Scope& scope,
const std::string& outname) {
ClientContext context;
VariableMessage call_msg, ret_msg;
call_msg.set_varname(outname);
auto ctx = platform::CPUDeviceContext();
Status status = stub_->GetVariable(&context, call_msg, &ret_msg);
if (!status.ok()) {
LOG(ERROR) << "gRPC error: " << status.error_message();
return false;
}
std::istringstream iss(ret_msg.serialized());
framework::LoDTensor ret_tensor; framework::LoDTensor ret_tensor;
framework::DeserializeFromStream(iss, &ret_tensor); framework::DeserializeFromStream(iss, &ret_tensor);
auto* outvar = scope.FindVar(outname); auto* outvar = scope.FindVar(outname);
...@@ -49,6 +66,12 @@ bool RPCClient::SendVariable(const framework::Scope& scope, ...@@ -49,6 +66,12 @@ bool RPCClient::SendVariable(const framework::Scope& scope,
return true; return true;
} }
void RPCClient::Wait() {
ClientContext context;
VoidMessage call_msg, ret_msg;
stub_->Wait(&context, call_msg, &ret_msg);
}
} // namespace detail } // namespace detail
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
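
With SendVariable split into a send plus a separate GetVariable, and a Wait RPC added, one trainer-side round looks roughly like the following. This is a hedged sketch: the endpoint string, scope, and variable names are placeholders, and only the three client methods introduced in this change are used.

    // Sketch: one synchronous parameter-server round with the new RPCClient API.
    detail::RPCClient client(grpc::CreateChannel(
        "127.0.0.1:6174", grpc::InsecureChannelCredentials()));

    if (client.SendVariable(scope, "w@GRAD")) {  // push the gradient
      client.Wait();                             // block until the pserver ran once
      client.GetVariable(scope, "w");            // pull the updated parameter
    }
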
...@@ -19,7 +19,12 @@ package sendrecv; ...@@ -19,7 +19,12 @@ package sendrecv;
service SendRecvService { service SendRecvService {
// For parameter server round-robin like hashing, do not split tensors. // For parameter server round-robin like hashing, do not split tensors.
// Send and recv only one tensor // Send and recv only one tensor
rpc SendVariable(VariableMessage) returns (VariableMessage) {} // TODO(typhoonzero): add streaming API
rpc SendVariable(VariableMessage) returns (VoidMessage) {}
// Argument VariableMessage for GetVariable should only contain varname.
rpc GetVariable(VariableMessage) returns (VariableMessage) {}
// wait for one execution of the program
rpc Wait(VoidMessage) returns (VoidMessage) {}
} }
// VariableMessage is serialized paddle variable message. // VariableMessage is serialized paddle variable message.
......
...@@ -20,10 +20,6 @@ ...@@ -20,10 +20,6 @@
#include "paddle/framework/selected_rows.h" #include "paddle/framework/selected_rows.h"
#include "paddle/operators/detail/simple_block_queue.h" #include "paddle/operators/detail/simple_block_queue.h"
// #include <grpc++/channel.h>
// #include <grpc++/client_context.h>
// #include <grpc++/create_channel.h>
// #include <grpc++/security/credentials.h>
#include "paddle/operators/detail/send_recv.grpc.pb.h" #include "paddle/operators/detail/send_recv.grpc.pb.h"
#include "paddle/operators/detail/send_recv.pb.h" #include "paddle/operators/detail/send_recv.pb.h"
...@@ -48,24 +44,32 @@ namespace paddle { ...@@ -48,24 +44,32 @@ namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace detail {
typedef std::pair<std::string, framework::LoDTensor> TensorWithName;
class SendRecvServerImpl final : public SendRecvService::Service { class SendRecvServerImpl final : public SendRecvService::Service {
public: public:
explicit SendRecvServerImpl() {} explicit SendRecvServerImpl() {}
Status SendVariable(ServerContext *context, const VariableMessage *in_var, Status SendVariable(ServerContext *context, const VariableMessage *in_var,
VariableMessage *out_var) override; VoidMessage *out_var) override;
Status GetVariable(ServerContext *context, const VariableMessage *in_var,
const framework::LoDTensor Get() { return this->lodtensor_queue_.Pop(); } VariableMessage *out_var) override;
Status Wait(ServerContext *context, const VoidMessage *in_var,
VoidMessage *out_var) override;
void Reset();
void Done();
void SetScope(framework::Scope *scope) { scope_ = scope; };
void Push(const framework::LoDTensor &tensor) { const TensorWithName Get() { return this->var_recv_queue_.Pop(); }
this->lodtensor_return_queue_.Push(tensor);
}
private: private:
SimpleBlockQueue<framework::LoDTensor> lodtensor_queue_; // received variable from RPC, operators fetch variable from this queue.
SimpleBlockQueue<framework::LoDTensor> lodtensor_return_queue_; SimpleBlockQueue<TensorWithName> var_recv_queue_;
SimpleBlockQueue<framework::SelectedRows> selected_rows_queue_; framework::Scope *scope_;
SimpleBlockQueue<framework::SelectedRows> selected_rows_return_queue_; // condition of the sub program
std::mutex mutex_;
bool done_;
std::condition_variable condition_;
}; };
// RPCClient is a class to send tensors to pserver sub-network // RPCClient is a class to send tensors to pserver sub-network
...@@ -75,8 +79,9 @@ class RPCClient { ...@@ -75,8 +79,9 @@ class RPCClient {
RPCClient(std::shared_ptr<Channel> channel) RPCClient(std::shared_ptr<Channel> channel)
: stub_(SendRecvService::NewStub(channel)) {} : stub_(SendRecvService::NewStub(channel)) {}
bool SendVariable(const framework::Scope &scope, const std::string &inname, bool SendVariable(const framework::Scope &scope, const std::string &inname);
const std::string &outname); bool GetVariable(const framework::Scope &scope, const std::string &outname);
void Wait();
private: private:
std::unique_ptr<SendRecvService::Stub> stub_; std::unique_ptr<SendRecvService::Stub> stub_;
......
...@@ -35,7 +35,7 @@ struct StridedMemcpyFunctor<T, 1> { ...@@ -35,7 +35,7 @@ struct StridedMemcpyFunctor<T, 1> {
memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head); memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head);
} else { } else {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto& gpu_place = boost::get<platform::GPUPlace>(place); auto& gpu_place = boost::get<platform::CUDAPlace>(place);
auto& cuda_ctx = auto& cuda_ctx =
reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx); reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head, memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head,
......
...@@ -25,7 +25,7 @@ class FeedOp : public framework::OperatorBase { ...@@ -25,7 +25,7 @@ class FeedOp : public framework::OperatorBase {
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &place) const override {
auto feed_var_name = Input("X"); auto feed_var_name = Input("X");
auto *feed_var = scope.FindVar(feed_var_name); auto *feed_var = scope.FindVar(feed_var_name);
...@@ -47,7 +47,12 @@ class FeedOp : public framework::OperatorBase { ...@@ -47,7 +47,12 @@ class FeedOp : public framework::OperatorBase {
auto &feed_list = feed_var->Get<framework::FeedFetchList>(); auto &feed_list = feed_var->Get<framework::FeedFetchList>();
auto &feed_item = feed_list.at(static_cast<size_t>(col)); auto &feed_item = feed_list.at(static_cast<size_t>(col));
auto *out_item = out_var->GetMutable<framework::FeedFetchType>(); auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
framework::CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx, out_item);
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
framework::CopyFrom(feed_item, place, dev_ctx, out_item);
out_item->set_lod(feed_item.lod()); out_item->set_lod(feed_item.lod());
} }
}; };
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/framework/feed_fetch_type.h" #include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/platform/device_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -26,7 +27,7 @@ class FetchOp : public framework::OperatorBase { ...@@ -26,7 +27,7 @@ class FetchOp : public framework::OperatorBase {
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &place) const override {
auto fetch_var_name = Input("X"); auto fetch_var_name = Input("X");
auto *fetch_var = scope.FindVar(fetch_var_name); auto *fetch_var = scope.FindVar(fetch_var_name);
PADDLE_ENFORCE(fetch_var != nullptr, PADDLE_ENFORCE(fetch_var != nullptr,
...@@ -51,6 +52,9 @@ class FetchOp : public framework::OperatorBase { ...@@ -51,6 +52,9 @@ class FetchOp : public framework::OperatorBase {
// FIXME(yuyang18): Should we assume the fetch operator always generate // FIXME(yuyang18): Should we assume the fetch operator always generate
// CPU outputs? // CPU outputs?
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item); CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
dev_ctx.Wait(); dev_ctx.Wait();
dst_item.set_lod(src_item.lod()); dst_item.set_lod(src_item.lod());
......
...@@ -49,7 +49,7 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel { ...@@ -49,7 +49,7 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")), static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/framework/data_type.h" #include "paddle/framework/data_type.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/math_function.h"
#include "paddle/platform/device_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -33,7 +34,7 @@ class FillConstantOp : public framework::OperatorBase { ...@@ -33,7 +34,7 @@ class FillConstantOp : public framework::OperatorBase {
public: public:
using framework::OperatorBase::OperatorBase; using framework::OperatorBase::OperatorBase;
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &dev_place) const override {
auto data_type = auto data_type =
static_cast<framework::proto::DataType>(Attr<int>("dtype")); static_cast<framework::proto::DataType>(Attr<int>("dtype"));
auto value = Attr<float>("value"); auto value = Attr<float>("value");
...@@ -45,8 +46,11 @@ class FillConstantOp : public framework::OperatorBase { ...@@ -45,8 +46,11 @@ class FillConstantOp : public framework::OperatorBase {
auto cpu = platform::CPUPlace(); auto cpu = platform::CPUPlace();
out.mutable_data(cpu, framework::ToTypeIndex(data_type)); out.mutable_data(cpu, framework::ToTypeIndex(data_type));
} else { } else {
out.mutable_data(dev_ctx.GetPlace(), framework::ToTypeIndex(data_type)); out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
} }
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(dev_place);
math::set_constant(dev_ctx, &out, value); math::set_constant(dev_ctx, &out, value);
} }
}; };
......
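
FillConstantOp above now allocates the output directly on the incoming place and fills it through math::set_constant with a context borrowed from the pool. A condensed sketch of those three steps; out, data_type, dev_place, and value are the names used in the surrounding hunk, not new API.

    // Sketch: allocate on the run-time place, borrow its context, fill in place.
    out.mutable_data(dev_place, framework::ToTypeIndex(data_type));

    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
    auto &dev_ctx = *pool.Borrow(dev_place);
    math::set_constant(dev_ctx, &out, value);  // value is the "value" attribute
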
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/framework/data_type.h" #include "paddle/framework/data_type.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/detail/safe_ref.h" #include "paddle/operators/detail/safe_ref.h"
#include "paddle/platform/device_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -42,7 +43,7 @@ class FillOp : public framework::OperatorBase { ...@@ -42,7 +43,7 @@ class FillOp : public framework::OperatorBase {
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &place) const override {
auto &out = auto &out =
detail::Ref(detail::Ref(scope.FindVar(Output("Out")), detail::Ref(detail::Ref(scope.FindVar(Output("Out")),
"Cannot find variable %s", Output("Out")) "Cannot find variable %s", Output("Out"))
...@@ -51,12 +52,11 @@ class FillOp : public framework::OperatorBase { ...@@ -51,12 +52,11 @@ class FillOp : public framework::OperatorBase {
auto dtype = static_cast<framework::proto::DataType>(Attr<int>("dtype")); auto dtype = static_cast<framework::proto::DataType>(Attr<int>("dtype"));
platform::CPUPlace cpu; platform::CPUPlace cpu;
auto force_cpu = Attr<bool>("force_cpu"); auto force_cpu = Attr<bool>("force_cpu");
out.mutable_data(force_cpu ? cpu : dev_ctx.GetPlace(), out.mutable_data(force_cpu ? cpu : place, framework::ToTypeIndex(dtype));
framework::ToTypeIndex(dtype));
framework::LoDTensor tensor; framework::LoDTensor tensor;
if (force_cpu || platform::is_cpu_place(dev_ctx.GetPlace())) { if (force_cpu || platform::is_cpu_place(place)) {
tensor.ShareDataWith(out); tensor.ShareDataWith(out);
} else { } else {
// Always make tensor in CPU memory. // Always make tensor in CPU memory.
...@@ -67,9 +67,11 @@ class FillOp : public framework::OperatorBase { ...@@ -67,9 +67,11 @@ class FillOp : public framework::OperatorBase {
framework::VisitDataType( framework::VisitDataType(
dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value"))); dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value")));
if (!force_cpu && platform::is_gpu_place(dev_ctx.GetPlace())) { if (!force_cpu && platform::is_gpu_place(place)) {
// Copy tensor to out // Copy tensor to out
framework::CopyFrom(tensor, dev_ctx.GetPlace(), dev_ctx, &out); platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
framework::CopyFrom(tensor, place, dev_ctx, &out);
} }
} }
}; };
......
...@@ -40,7 +40,7 @@ class GatherOp : public framework::OperatorWithKernel { ...@@ -40,7 +40,7 @@ class GatherOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), framework::ToDataType(ctx.Input<Tensor>("X")->type()),
...@@ -57,7 +57,7 @@ class GatherGradOp : public framework::OperatorWithKernel { ...@@ -57,7 +57,7 @@ class GatherGradOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), framework::ToDataType(ctx.Input<Tensor>("X")->type()),
......
...@@ -57,7 +57,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel { ...@@ -57,7 +57,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")), static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
......
...@@ -52,7 +52,7 @@ class IncrementOp : public framework::OperatorBase { ...@@ -52,7 +52,7 @@ class IncrementOp : public framework::OperatorBase {
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &place) const override {
auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>(); auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
auto &out = auto &out =
*scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>(); *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
......
...@@ -29,7 +29,7 @@ class IsEmptyOp : public framework::OperatorBase { ...@@ -29,7 +29,7 @@ class IsEmptyOp : public framework::OperatorBase {
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &place) const override {
// get input // get input
auto *var = scope.FindVar(Input(kInput)); auto *var = scope.FindVar(Input(kInput));
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(var);
......
...@@ -183,7 +183,7 @@ class LinearChainCRFOp : public framework::OperatorWithKernel { ...@@ -183,7 +183,7 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
protected: protected:
// Explicitly set that the data type of computation kernel of linear_chain_crf // Explicitly set that the data type of computation kernel of linear_chain_crf
// is determined by its input "Emission". // is determined by its input "Emission".
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()), framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
...@@ -242,7 +242,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { ...@@ -242,7 +242,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
protected: protected:
// Explicitly set that the data type of output of the linear_chain_crf_grad // Explicitly set that the data type of output of the linear_chain_crf_grad
// operator is determined by its input: gradients of LogLikelihood. // operator is determined by its input: gradients of LogLikelihood.
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType( framework::ToDataType(
......
...@@ -219,8 +219,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> { ...@@ -219,8 +219,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
// operators runs on GPU device. // operators runs on GPU device.
auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src, auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
Tensor* dst) { Tensor* dst) {
dst->mutable_data<T>(platform::GPUPlace()); dst->mutable_data<T>(platform::CUDAPlace());
framework::CopyFrom(src, platform::GPUPlace(), ctx, dst); framework::CopyFrom(src, platform::CUDAPlace(), ctx, dst);
}; };
copyTensor(ctx, emission_exps_src, emission_exps_dst); copyTensor(ctx, emission_exps_src, emission_exps_dst);
copyTensor(ctx, transition_exps_src, transition_exps_dst); copyTensor(ctx, transition_exps_src, transition_exps_dst);
...@@ -433,8 +433,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> { ...@@ -433,8 +433,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor* src, auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor* src,
Tensor* dst) { Tensor* dst) {
if (src && dst) { if (src && dst) {
dst->mutable_data<T>(platform::GPUPlace()); dst->mutable_data<T>(platform::CUDAPlace());
framework::CopyFrom(*src, platform::GPUPlace(), ctx, dst); framework::CopyFrom(*src, platform::CUDAPlace(), ctx, dst);
} }
}; };
copyTensor(ctx, emission_grad_src, emission_grad_dst); copyTensor(ctx, emission_grad_src, emission_grad_dst);
......
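In the CRF kernels only the place type changes: GPUPlace becomes CUDAPlace in the copy-to-device lambdas, while the allocate-then-copy shape stays the same, as above:

auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
                     Tensor* dst) {
  dst->mutable_data<T>(platform::CUDAPlace());           // allocate on the GPU
  framework::CopyFrom(src, platform::CUDAPlace(), ctx, dst);
};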
...@@ -11,10 +11,10 @@ ...@@ -11,10 +11,10 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <fstream>
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/platform/device_context.h"
#include <fstream>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -26,7 +26,7 @@ class LoadOp : public framework::OperatorBase { ...@@ -26,7 +26,7 @@ class LoadOp : public framework::OperatorBase {
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &place) const override {
auto filename = Attr<std::string>("file_path"); auto filename = Attr<std::string>("file_path");
std::ifstream fin(filename); std::ifstream fin(filename);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op", PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
...@@ -40,7 +40,9 @@ class LoadOp : public framework::OperatorBase { ...@@ -40,7 +40,9 @@ class LoadOp : public framework::OperatorBase {
auto *tensor = out_var->GetMutable<framework::LoDTensor>(); auto *tensor = out_var->GetMutable<framework::LoDTensor>();
framework::DeserializeFromStream(fin, tensor); framework::DeserializeFromStream(fin, tensor);
auto place = dev_ctx.GetPlace(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
// copy CPU to GPU // copy CPU to GPU
framework::LoDTensor cpu_tensor; framework::LoDTensor cpu_tensor;
......
...@@ -26,7 +26,7 @@ class LoDArrayLengthOp : public framework::OperatorBase { ...@@ -26,7 +26,7 @@ class LoDArrayLengthOp : public framework::OperatorBase {
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &place) const override {
auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>(); auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
auto &out = auto &out =
*scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>(); *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
......
...@@ -24,7 +24,7 @@ class LoDRankTableOp : public framework::OperatorBase { ...@@ -24,7 +24,7 @@ class LoDRankTableOp : public framework::OperatorBase {
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &dev_place) const override {
auto x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>(); auto x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
auto *out = auto *out =
scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>(); scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>();
......
...@@ -38,7 +38,7 @@ class LoDResetOp : public framework::OperatorWithKernel { ...@@ -38,7 +38,7 @@ class LoDResetOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()), framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
...@@ -97,7 +97,7 @@ class LoDResetGradOp : public framework::OperatorWithKernel { ...@@ -97,7 +97,7 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()), framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/detail/safe_ref.h" #include "paddle/operators/detail/safe_ref.h"
#include "paddle/platform/device_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -32,7 +33,7 @@ class LoDTensorToArrayOp : public framework::OperatorBase { ...@@ -32,7 +33,7 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &place) const override {
auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s", auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s",
Input("X")) Input("X"))
.Get<framework::LoDTensor>(); .Get<framework::LoDTensor>();
...@@ -86,6 +87,10 @@ class LoDTensorToArrayOp : public framework::OperatorBase { ...@@ -86,6 +87,10 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
// out[i][offset: offset+len] = x[each_range.begin: each_range.end] // out[i][offset: offset+len] = x[each_range.begin: each_range.end]
auto slice = out[i].Slice(static_cast<int>(offset), auto slice = out[i].Slice(static_cast<int>(offset),
static_cast<int>(offset + len)); static_cast<int>(offset + len));
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin), framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin),
static_cast<int>(each_range.end)), static_cast<int>(each_range.end)),
x.place(), dev_ctx, &slice); x.place(), dev_ctx, &slice);
......
...@@ -99,9 +99,9 @@ class LogicalOp : public framework::OperatorWithKernel { ...@@ -99,9 +99,9 @@ class LogicalOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
framework::OpKernelType kt = OperatorWithKernel::GetKernelType(ctx); framework::OpKernelType kt = OperatorWithKernel::GetActualKernelType(ctx);
// LogicalOp kernel's device type is decided by input tensor place // LogicalOp kernel's device type is decided by input tensor place
kt.place_ = ctx.Input<framework::LoDTensor>("X")->place(); kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
return kt; return kt;
......
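GetKernelType is renamed to GetActualKernelType throughout this commit. The LogicalOp hunk above shows the complete new shape of such an override: call the base implementation, then pin the kernel to the input's place:

framework::OpKernelType GetActualKernelType(
    const framework::ExecutionContext &ctx) const override {
  framework::OpKernelType kt = OperatorWithKernel::GetActualKernelType(ctx);
  // LogicalOp's kernel device type is decided by the input tensor's place.
  kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
  return kt;
}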
...@@ -41,7 +41,7 @@ class LookupTableOp : public framework::OperatorWithKernel { ...@@ -41,7 +41,7 @@ class LookupTableOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<LoDTensor>("W")->type()), framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
...@@ -98,7 +98,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { ...@@ -98,7 +98,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<LoDTensor>("W")->type()), framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
......
...@@ -101,7 +101,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> { ...@@ -101,7 +101,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
// copy GPU memory to CPU pinned memory // copy GPU memory to CPU pinned memory
framework::Vector<int64_t> new_rows; framework::Vector<int64_t> new_rows;
new_rows.resize(ids_dim[0]); new_rows.resize(ids_dim[0]);
auto gpu_place = boost::get<platform::GPUPlace>(context.GetPlace()); auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data, memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data,
ids_dim[0] * sizeof(int64_t), stream); ids_dim[0] * sizeof(int64_t), stream);
......
...@@ -92,7 +92,7 @@ class LSTMOp : public framework::OperatorWithKernel { ...@@ -92,7 +92,7 @@ class LSTMOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()), framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
...@@ -260,7 +260,7 @@ class LSTMGradOp : public framework::OperatorWithKernel { ...@@ -260,7 +260,7 @@ class LSTMGradOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()), framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
......
...@@ -98,7 +98,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel<T> { ...@@ -98,7 +98,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace."); "It must use CUDAPlace.");
auto* x_tensor = ctx.Input<framework::Tensor>("X"); auto* x_tensor = ctx.Input<framework::Tensor>("X");
auto* c_prev_tensor = ctx.Input<framework::Tensor>("C_prev"); auto* c_prev_tensor = ctx.Input<framework::Tensor>("C_prev");
...@@ -129,7 +129,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> { ...@@ -129,7 +129,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace."); "It must use CUDAPlace.");
auto x_tensor = ctx.Input<Tensor>("X"); auto x_tensor = ctx.Input<Tensor>("X");
auto c_prev_tensor = ctx.Input<Tensor>("C_prev"); auto c_prev_tensor = ctx.Input<Tensor>("C_prev");
......
...@@ -159,6 +159,7 @@ void testIm2col() { ...@@ -159,6 +159,7 @@ void testIm2col() {
TEST(math, im2col) { TEST(math, im2col) {
testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>(); testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
testIm2col<paddle::platform::CUDADeviceContext, paddle::platform::GPUPlace>(); testIm2col<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>();
#endif #endif
} }
...@@ -277,14 +277,6 @@ void set_constant_with_place<platform::CPUPlace>( ...@@ -277,14 +277,6 @@ void set_constant_with_place<platform::CPUPlace>(
TensorSetConstantCPU(tensor, value)); TensorSetConstantCPU(tensor, value));
} }
template <>
void set_constant_with_place<platform::MKLDNNPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
framework::VisitDataType(framework::ToDataType(tensor->type()),
TensorSetConstantCPU(tensor, value));
}
struct TensorSetConstantWithPlace : public boost::static_visitor<void> { struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
TensorSetConstantWithPlace(const platform::DeviceContext& context, TensorSetConstantWithPlace(const platform::DeviceContext& context,
framework::Tensor* tensor, float value) framework::Tensor* tensor, float value)
......
...@@ -105,7 +105,7 @@ void matmul<platform::CUDADeviceContext, float>( ...@@ -105,7 +105,7 @@ void matmul<platform::CUDADeviceContext, float>(
PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
platform::is_gpu_place(matrix_b.place()) && platform::is_gpu_place(matrix_b.place()) &&
platform::is_gpu_place(matrix_out->place()), platform::is_gpu_place(matrix_out->place()),
"Matrix must all be in GPUPlace"); "Matrix must all be in CUDAPlace");
int M = dim_out[0]; int M = dim_out[0];
int N = dim_out[1]; int N = dim_out[1];
...@@ -134,7 +134,7 @@ void matmul<platform::CUDADeviceContext, double>( ...@@ -134,7 +134,7 @@ void matmul<platform::CUDADeviceContext, double>(
PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
platform::is_gpu_place(matrix_b.place()) && platform::is_gpu_place(matrix_b.place()) &&
platform::is_gpu_place(matrix_out->place()), platform::is_gpu_place(matrix_out->place()),
"Matrix must all be in GPUPlace"); "Matrix must all be in CUDAPlace");
int M = dim_out[0]; int M = dim_out[0];
int N = dim_out[1]; int N = dim_out[1];
...@@ -266,20 +266,13 @@ struct TensorSetConstantGPU { ...@@ -266,20 +266,13 @@ struct TensorSetConstantGPU {
}; };
template <> template <>
void set_constant_with_place<platform::GPUPlace>( void set_constant_with_place<platform::CUDAPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor, const platform::DeviceContext& context, framework::Tensor* tensor,
float value) { float value) {
framework::VisitDataType(framework::ToDataType(tensor->type()), framework::VisitDataType(framework::ToDataType(tensor->type()),
TensorSetConstantGPU(context, tensor, value)); TensorSetConstantGPU(context, tensor, value));
} }
template <>
void set_constant_with_place<platform::CUDNNPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
set_constant_with_place<platform::GPUPlace>(context, tensor, value);
}
template struct RowwiseAdd<platform::CUDADeviceContext, float>; template struct RowwiseAdd<platform::CUDADeviceContext, float>;
template struct RowwiseAdd<platform::CUDADeviceContext, double>; template struct RowwiseAdd<platform::CUDADeviceContext, double>;
template struct ColwiseSum<platform::CUDADeviceContext, float>; template struct ColwiseSum<platform::CUDADeviceContext, float>;
......
...@@ -94,8 +94,8 @@ class ColwiseSum<platform::CPUDeviceContext, T> { ...@@ -94,8 +94,8 @@ class ColwiseSum<platform::CPUDeviceContext, T> {
T* out_buf = out->mutable_data<T>(out->place()); T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
for (size_t i = 0; i < height; ++i) { for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
for (size_t j = 0; j < size; ++j) { for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
if (i == 0) { if (i == 0) {
out_buf[j] = in_buf[i * size + j]; out_buf[j] = in_buf[i * size + j];
} else { } else {
......
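The ColwiseSum change is unrelated to places: the loop counters are size_t while the extents are presumably signed here, so the casts silence the signed/unsigned comparison warning without changing behaviour:

// Presumed intent of the casts above: keep size_t counters comparable
// with signed extents such as int64_t dims.
for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
  for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
    // ... accumulate in_buf[i * size + j] into out_buf[j] ...
  }
}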
...@@ -13,7 +13,7 @@ TEST(math_function, notrans_mul_trans) { ...@@ -13,7 +13,7 @@ TEST(math_function, notrans_mul_trans) {
float arr[6] = {0, 1, 2, 3, 4, 5}; float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float)); memcpy(input1_ptr, arr, 6 * sizeof(float));
auto* gpu_place = new paddle::platform::GPUPlace(0); auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place); paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
...@@ -47,7 +47,7 @@ TEST(math_function, trans_mul_notrans) { ...@@ -47,7 +47,7 @@ TEST(math_function, trans_mul_notrans) {
float arr[6] = {0, 1, 2, 3, 4, 5}; float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float)); memcpy(input1_ptr, arr, 6 * sizeof(float));
auto* gpu_place = new paddle::platform::GPUPlace(0); auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place); paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
...@@ -96,7 +96,7 @@ TEST(math_function, gemm_notrans_cublas) { ...@@ -96,7 +96,7 @@ TEST(math_function, gemm_notrans_cublas) {
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float)); memcpy(input3_ptr, arr3, 8 * sizeof(float));
auto* gpu_place = new paddle::platform::GPUPlace(0); auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place); paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
...@@ -151,7 +151,7 @@ TEST(math_function, gemm_trans_cublas) { ...@@ -151,7 +151,7 @@ TEST(math_function, gemm_trans_cublas) {
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float)); memcpy(input3_ptr, arr3, 8 * sizeof(float));
auto* gpu_place = new paddle::platform::GPUPlace(0); auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place); paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
...@@ -189,7 +189,7 @@ void GemvTest(int m, int n, bool trans) { ...@@ -189,7 +189,7 @@ void GemvTest(int m, int n, bool trans) {
T* data_b = vec_b.mutable_data<T>({trans ? m : n}, *cpu_place); T* data_b = vec_b.mutable_data<T>({trans ? m : n}, *cpu_place);
T* data_c = vec_c.mutable_data<T>({trans ? n : m}, *cpu_place); T* data_c = vec_c.mutable_data<T>({trans ? n : m}, *cpu_place);
auto* gpu_place = new paddle::platform::GPUPlace(0); auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::framework::Tensor g_mat_a; paddle::framework::Tensor g_mat_a;
paddle::framework::Tensor g_vec_b; paddle::framework::Tensor g_vec_b;
paddle::framework::Tensor g_vec_c; paddle::framework::Tensor g_vec_c;
......
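The GPU math tests only swap the place type; creating a device context from the renamed place and staging inputs onto it is otherwise unchanged. Condensed from the calls visible above:

// Sketch: stage a CPU tensor onto device 0 for a cuBLAS-backed test.
auto* gpu_place = new paddle::platform::CUDAPlace(0);        // was GPUPlace(0)
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);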
...@@ -58,15 +58,15 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> { ...@@ -58,15 +58,15 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
PADDLE_ENFORCE(platform::is_gpu_place(out_place)); PADDLE_ENFORCE(platform::is_gpu_place(out_place));
memory::Copy( memory::Copy(
boost::get<platform::GPUPlace>(out_place), out_data, boost::get<platform::CUDAPlace>(out_place), out_data,
boost::get<platform::GPUPlace>(in1_place), in1_data, boost::get<platform::CUDAPlace>(in1_place), in1_data,
in1_value.numel() * sizeof(T), in1_value.numel() * sizeof(T),
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream()); reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
auto* in2_data = in2_value.data<T>(); auto* in2_data = in2_value.data<T>();
memory::Copy(boost::get<platform::GPUPlace>(out_place), memory::Copy(boost::get<platform::CUDAPlace>(out_place),
out_data + in1_value.numel(), out_data + in1_value.numel(),
boost::get<platform::GPUPlace>(in2_place), in2_data, boost::get<platform::CUDAPlace>(in2_place), in2_data,
in2_value.numel() * sizeof(T), context.stream()); in2_value.numel() * sizeof(T), context.stream());
} }
}; };
...@@ -160,9 +160,9 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> { ...@@ -160,9 +160,9 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->data<T>(); auto* in2_data = in2_value->data<T>();
memory::Copy(boost::get<platform::GPUPlace>(in2_place), memory::Copy(boost::get<platform::CUDAPlace>(in2_place),
in2_data + input2_offset, in2_data + input2_offset,
boost::get<platform::GPUPlace>(in1_place), in1_data, boost::get<platform::CUDAPlace>(in1_place), in1_data,
in1_value.numel() * sizeof(T), context.stream()); in1_value.numel() * sizeof(T), context.stream());
} }
}; };
......
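memory::Copy keeps its signature; only the place type inside the boost::get changes. The device-to-device form used by SelectedRowsAdd above, annotated:

// Copy(dst_place, dst_ptr, src_place, src_ptr, num_bytes, cuda_stream)
memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data,
             boost::get<platform::CUDAPlace>(in1_place), in1_data,
             in1_value.numel() * sizeof(T),
             reinterpret_cast<const platform::CUDADeviceContext&>(context)
                 .stream());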
...@@ -21,7 +21,7 @@ TEST(selected_rows_functor, gpu_add) { ...@@ -21,7 +21,7 @@ TEST(selected_rows_functor, gpu_add) {
using namespace paddle::platform; using namespace paddle::platform;
using namespace paddle::operators::math; using namespace paddle::operators::math;
GPUPlace gpu_place(0); CUDAPlace gpu_place(0);
CPUPlace cpu_place; CPUPlace cpu_place;
CUDADeviceContext ctx(gpu_place); CUDADeviceContext ctx(gpu_place);
SetConstant<CUDADeviceContext, float> functor; SetConstant<CUDADeviceContext, float> functor;
...@@ -119,7 +119,7 @@ TEST(selected_rows_functor, gpu_add_to) { ...@@ -119,7 +119,7 @@ TEST(selected_rows_functor, gpu_add_to) {
using namespace paddle::platform; using namespace paddle::platform;
using namespace paddle::operators::math; using namespace paddle::operators::math;
GPUPlace gpu_place(0); CUDAPlace gpu_place(0);
CPUPlace cpu_place; CPUPlace cpu_place;
CUDADeviceContext ctx(gpu_place); CUDADeviceContext ctx(gpu_place);
SetConstant<CUDADeviceContext, float> functor; SetConstant<CUDADeviceContext, float> functor;
......
...@@ -122,6 +122,6 @@ TEST(math, vol2col) { ...@@ -122,6 +122,6 @@ TEST(math, vol2col) {
testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>(); testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
testVol2col<paddle::platform::CUDADeviceContext, testVol2col<paddle::platform::CUDADeviceContext,
paddle::platform::GPUPlace>(); paddle::platform::CUDAPlace>();
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
} }
...@@ -28,7 +28,7 @@ class MaxSeqenceLenOp : public framework::OperatorBase { ...@@ -28,7 +28,7 @@ class MaxSeqenceLenOp : public framework::OperatorBase {
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &dev_place) const override {
auto &rank_table = auto &rank_table =
scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>(); scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
auto *out = auto *out =
......
...@@ -28,7 +28,11 @@ class MergeLoDTensorOp : public framework::OperatorBase { ...@@ -28,7 +28,11 @@ class MergeLoDTensorOp : public framework::OperatorBase {
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &dev_place) const override {
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(dev_place);
auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>(); auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>(); auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
auto &in_true = scope.FindVar(Input("InTrue"))->Get<framework::LoDTensor>(); auto &in_true = scope.FindVar(Input("InTrue"))->Get<framework::LoDTensor>();
......
...@@ -113,7 +113,7 @@ This operator is used to perform matrix multiplication for input $X$ and $Y$. ...@@ -113,7 +113,7 @@ This operator is used to perform matrix multiplication for input $X$ and $Y$.
The equation is: The equation is:
$$Out = X * Y$$ $$Out = X * Y$$
Both the input $X$ and $Y$ can carry the LoD (Level of Details) information, Both the input $X$ and $Y$ can carry the LoD (Level of Details) information,
or not. But the output only shares the LoD information with input $X$. or not. But the output only shares the LoD information with input $X$.
......
...@@ -51,7 +51,7 @@ class MultiplexOp : public framework::OperatorWithKernel { ...@@ -51,7 +51,7 @@ class MultiplexOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()), framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()),
...@@ -102,7 +102,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel { ...@@ -102,7 +102,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()), framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()),
......
...@@ -36,7 +36,7 @@ class MultiplexGPUKernel : public framework::OpKernel<T> { ...@@ -36,7 +36,7 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
auto* index = index_t_cpu.data<int32_t>(); auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.cuda_device_context().stream(); auto stream = ctx.cuda_device_context().stream();
platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace()); platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
for (auto i = 0; i < rows; i++) { for (auto i = 0; i < rows; i++) {
int32_t k = index[i]; int32_t k = index[i];
PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative.");
...@@ -73,7 +73,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> { ...@@ -73,7 +73,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
auto* index = index_t_cpu.data<int32_t>(); auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.cuda_device_context().stream(); auto stream = ctx.cuda_device_context().stream();
platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace()); platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
for (auto i = 0; i < rows; i++) { for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]); size_t k = static_cast<size_t>(index[i]);
if (d_ins[k]) { if (d_ins[k]) {
......
...@@ -24,7 +24,7 @@ class NCCLInitOp : public framework::OperatorBase { ...@@ -24,7 +24,7 @@ class NCCLInitOp : public framework::OperatorBase {
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &place) const override {
const auto &name = Output("Communicator"); const auto &name = Output("Communicator");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name), PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
"Can not find variable '%s' in the scope.", name); "Can not find variable '%s' in the scope.", name);
......
...@@ -67,7 +67,7 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> { ...@@ -67,7 +67,7 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
auto stream = ctx.cuda_device_context().stream(); auto stream = ctx.cuda_device_context().stream();
// device id // device id
int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId(); int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
int idx = comm->GetCommId(gpu_id); int idx = comm->GetCommId(gpu_id);
for (size_t i = 0; i < ins.size(); ++i) { for (size_t i = 0; i < ins.size(); ++i) {
...@@ -120,7 +120,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> { ...@@ -120,7 +120,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
ctx.device_context()) ctx.device_context())
.stream(); .stream();
// device id // device id
int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId(); int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
int idx = comm->GetCommId(gpu_id); int idx = comm->GetCommId(gpu_id);
auto ins_names = ctx.Inputs("X"); auto ins_names = ctx.Inputs("X");
...@@ -164,7 +164,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> { ...@@ -164,7 +164,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
ctx.device_context()) ctx.device_context())
.stream(); .stream();
// device id // device id
int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId(); int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
int idx = comm->GetCommId(gpu_id); int idx = comm->GetCommId(gpu_id);
if (idx == root) { if (idx == root) {
......
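The NCCL kernels resolve their communicator slot from the device id, which is now read off a CUDAPlace instead of a GPUPlace:

// Sketch: map the kernel's place to its NCCL communicator entry.
int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
int idx = comm->GetCommId(gpu_id);   // position of this device in the group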
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <vector> #include <vector>
#include "paddle/framework/block_desc.h" #include "paddle/framework/block_desc.h"
#include "paddle/framework/init.h"
#include "paddle/framework/op_desc.h" #include "paddle/framework/op_desc.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/framework/program_desc.h" #include "paddle/framework/program_desc.h"
...@@ -49,9 +50,9 @@ const f::DDim kDims = {100, 100}; ...@@ -49,9 +50,9 @@ const f::DDim kDims = {100, 100};
class NCCLTester : public ::testing::Test { class NCCLTester : public ::testing::Test {
public: public:
virtual void SetUp() override { virtual void SetUp() override {
cpu_ctx = new p::CPUDeviceContext(p::CPUPlace()); paddle::platform::CPUPlace cpu_place;
for (size_t i = 0; i < gpu_list.size(); ++i) { for (size_t i = 0; i < gpu_list.size(); ++i) {
p::GPUPlace place(i); p::CUDAPlace place(i);
dev_ctxs.emplace_back(new p::CUDADeviceContext(place)); dev_ctxs.emplace_back(new p::CUDADeviceContext(place));
} }
...@@ -65,6 +66,7 @@ class NCCLTester : public ::testing::Test { ...@@ -65,6 +66,7 @@ class NCCLTester : public ::testing::Test {
} }
void NCCLInitOp() { void NCCLInitOp() {
paddle::platform::CPUPlace cpu_place;
std::unique_ptr<f::OpDesc> op1(new f::OpDesc); std::unique_ptr<f::OpDesc> op1(new f::OpDesc);
op1->SetType("ncclInit"); op1->SetType("ncclInit");
...@@ -76,7 +78,7 @@ class NCCLTester : public ::testing::Test { ...@@ -76,7 +78,7 @@ class NCCLTester : public ::testing::Test {
auto op = f::OpRegistry::CreateOp(*op1); auto op = f::OpRegistry::CreateOp(*op1);
VLOG(1) << "invoke NCCLInitOp."; VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, *cpu_ctx); op->Run(g_scope, cpu_place);
VLOG(1) << "NCCLInitOp finished."; VLOG(1) << "NCCLInitOp finished.";
} }
...@@ -85,7 +87,7 @@ class NCCLTester : public ::testing::Test { ...@@ -85,7 +87,7 @@ class NCCLTester : public ::testing::Test {
std::unique_lock<std::mutex> lk(mu); std::unique_lock<std::mutex> lk(mu);
const f::OpDesc *op1 = &op_desc; const f::OpDesc *op1 = &op_desc;
p::GPUPlace place(gpu_id); p::CUDAPlace place(gpu_id);
auto &ctx = dev_ctxs.at(gpu_id); auto &ctx = dev_ctxs.at(gpu_id);
auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>(); auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
...@@ -111,13 +113,12 @@ class NCCLTester : public ::testing::Test { ...@@ -111,13 +113,12 @@ class NCCLTester : public ::testing::Test {
VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type(); VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type();
VLOG(1) << " send_tensor : " << send_tensor->numel() VLOG(1) << " send_tensor : " << send_tensor->numel()
<< " recv_tensor : " << recv_tensor->numel(); << " recv_tensor : " << recv_tensor->numel();
op->Run(*scope, *ctx); op->Run(*scope, place);
VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type(); VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
} }
public: public:
std::vector<p::DeviceContext *> dev_ctxs; std::vector<p::DeviceContext *> dev_ctxs;
p::DeviceContext *cpu_ctx;
f::Scope g_scope; f::Scope g_scope;
std::mutex mu; std::mutex mu;
}; };
...@@ -131,14 +132,14 @@ TEST(NCCL, ncclInitOp) { ...@@ -131,14 +132,14 @@ TEST(NCCL, ncclInitOp) {
op_desc->SetAttr("gpus", {gpu_list}); op_desc->SetAttr("gpus", {gpu_list});
f::Scope g_scope; f::Scope g_scope;
std::unique_ptr<p::DeviceContext> ctx(new p::CPUDeviceContext(p::CPUPlace())); paddle::platform::CPUPlace cpu_place;
auto *var = g_scope.Var("x1"); auto *var = g_scope.Var("x1");
var->GetMutable<p::Communicator>(); var->GetMutable<p::Communicator>();
auto op = f::OpRegistry::CreateOp(*op_desc); auto op = f::OpRegistry::CreateOp(*op_desc);
VLOG(1) << "invoke NCCLInitOp."; VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, *ctx.get()); op->Run(g_scope, cpu_place);
VLOG(1) << "NCCLInitOp finished."; VLOG(1) << "NCCLInitOp finished.";
} }
...@@ -170,7 +171,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) { ...@@ -170,7 +171,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
for (size_t i = 0; i < dev_scopes.size(); ++i) { for (size_t i = 0; i < dev_scopes.size(); ++i) {
p::CPUPlace cpu_place; p::CPUPlace cpu_place;
p::GPUPlace gpu_place(gpu_list[i]); p::CUDAPlace gpu_place(gpu_list[i]);
auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>(); auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>(); auto *rt = recv_tensor.data<float>();
...@@ -179,7 +180,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) { ...@@ -179,7 +180,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place); auto *ct = result_tensor->mutable_data<float>(cpu_place);
paddle::memory::Copy( paddle::memory::Copy(
cpu_place, ct, p::GPUPlace(gpu_list[i]), rt, cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt,
recv_tensor.numel() * sizeof(float), recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream()); static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
...@@ -218,7 +219,7 @@ TEST_F(NCCLTester, ncclReduceOp) { ...@@ -218,7 +219,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
p::CPUPlace cpu_place; p::CPUPlace cpu_place;
p::GPUPlace gpu_place(gpu_list[kRoot]); p::CUDAPlace gpu_place(gpu_list[kRoot]);
auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>(); auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>(); auto *rt = recv_tensor.data<float>();
...@@ -228,7 +229,7 @@ TEST_F(NCCLTester, ncclReduceOp) { ...@@ -228,7 +229,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place); auto *ct = result_tensor->mutable_data<float>(cpu_place);
paddle::memory::Copy( paddle::memory::Copy(
cpu_place, ct, p::GPUPlace(gpu_list[kRoot]), rt, cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt,
recv_tensor.numel() * sizeof(float), recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream()); static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());
...@@ -267,7 +268,7 @@ TEST_F(NCCLTester, ncclBcastOp) { ...@@ -267,7 +268,7 @@ TEST_F(NCCLTester, ncclBcastOp) {
float result = kRoot; float result = kRoot;
p::CPUPlace cpu_place; p::CPUPlace cpu_place;
p::GPUPlace gpu_place(gpu_list[idx]); p::CUDAPlace gpu_place(gpu_list[idx]);
auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>(); auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>(); auto *rt = recv_tensor.data<float>();
...@@ -276,7 +277,7 @@ TEST_F(NCCLTester, ncclBcastOp) { ...@@ -276,7 +277,7 @@ TEST_F(NCCLTester, ncclBcastOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place); auto *ct = result_tensor->mutable_data<float>(cpu_place);
paddle::memory::Copy( paddle::memory::Copy(
cpu_place, ct, p::GPUPlace(gpu_list[idx]), rt, cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt,
recv_tensor.numel() * sizeof(float), recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream()); static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream());
...@@ -294,9 +295,18 @@ int main(int argc, char **argv) { ...@@ -294,9 +295,18 @@ int main(int argc, char **argv) {
return 0; return 0;
} }
for (int i = 0; i < dev_count; ++i) { std::vector<paddle::platform::Place> places;
places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::CUDAPlace(i));
gpu_list.emplace_back(i); gpu_list.emplace_back(i);
} }
VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Create(places);
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
// device context should be release before scope. // device context should be release before scope.
......
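The test's main() now has to build the global DeviceContextPool before any operator runs, listing one place per device it will use. A condensed sketch of that setup, using only the calls added above:

// Sketch of the pool setup added to main() above.
std::vector<paddle::platform::Place> places;
places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
  places.emplace_back(paddle::platform::CUDAPlace(i));
}
paddle::platform::DeviceContextPool::Create(places);   // must precede op->Run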
...@@ -63,7 +63,7 @@ class NCEOp : public framework::OperatorWithKernel { ...@@ -63,7 +63,7 @@ class NCEOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
...@@ -166,7 +166,7 @@ class NCEOpGrad : public framework::OperatorWithKernel { ...@@ -166,7 +166,7 @@ class NCEOpGrad : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
......
...@@ -65,9 +65,9 @@ class NetOp : public framework::OperatorBase { ...@@ -65,9 +65,9 @@ class NetOp : public framework::OperatorBase {
* will be used. * will be used.
*/ */
void Run(const framework::Scope& scope, void Run(const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) const override { const platform::Place& place) const override {
for (auto& op : ops_) { for (auto& op : ops_) {
op->Run(scope, dev_ctx); op->Run(scope, place);
} }
} }
......
...@@ -13,8 +13,7 @@ class TestOp : public framework::OperatorBase { ...@@ -13,8 +13,7 @@ class TestOp : public framework::OperatorBase {
public: public:
using framework::OperatorBase::OperatorBase; using framework::OperatorBase::OperatorBase;
DEFINE_OP_CLONE_METHOD(TestOp); DEFINE_OP_CLONE_METHOD(TestOp);
void Run(const Scope& scope, void Run(const Scope& scope, const platform::Place& place) const override {
const platform::DeviceContext& dev_ctx) const override {
++run_cnt; ++run_cnt;
} }
}; };
......
...@@ -29,7 +29,7 @@ class PoolCudnnOpKernel : public framework::OpKernel<T> { ...@@ -29,7 +29,7 @@ class PoolCudnnOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace."); "It must use CUDAPlace.");
const Tensor *input = ctx.Input<Tensor>("X"); const Tensor *input = ctx.Input<Tensor>("X");
Tensor *output = ctx.Output<Tensor>("Out"); Tensor *output = ctx.Output<Tensor>("Out");
...@@ -90,7 +90,7 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> { ...@@ -90,7 +90,7 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace."); "It must use CUDAPlace.");
const Tensor *input = ctx.Input<Tensor>("X"); const Tensor *input = ctx.Input<Tensor>("X");
const Tensor *output = ctx.Input<Tensor>("Out"); const Tensor *output = ctx.Input<Tensor>("Out");
......
...@@ -69,7 +69,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { ...@@ -69,7 +69,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
...@@ -90,7 +90,7 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { ...@@ -90,7 +90,7 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
......
...@@ -85,7 +85,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { ...@@ -85,7 +85,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Score")->type()), framework::ToDataType(ctx.Input<Tensor>("Score")->type()),
...@@ -154,13 +154,14 @@ class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -154,13 +154,14 @@ class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker {
"Noting that reducing on the first dim will make the LoD info lost.") "Noting that reducing on the first dim will make the LoD info lost.")
.SetDefault(0); .SetDefault(0);
AddComment(R"DOC( AddComment(R"DOC(
PositiveNegativePairOp can be used to evaluate Learning To Rank(LTR) PositiveNegativePairOp can be used to evaluate Learning To Rank(LTR) model's
model performance. performance.
Within some context, e.g. the "query", a LTR model generates scores
for a list of items, which gives a partial order of the items. Within some context, e.g. the "query", a LTR model generates scores for a list
PositiveNegativePairOp takes a list of reference rank order of items, which gives a partial order of the items. PositiveNegativePairOp
(Input("Label")) and the model generated scores (Input(Score)) as takes a list of reference rank order (Input("Label")) and the model generated
inputs and counts the pairs that ranked correctly and incorrectly. scores (Input(Score)) as inputs and counts the pairs that ranked correctly
and incorrectly.
)DOC"); )DOC");
} }
}; };
......
...@@ -80,7 +80,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel { ...@@ -80,7 +80,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("MaxProbs")->type()), framework::ToDataType(ctx.Input<Tensor>("MaxProbs")->type()),
......
...@@ -227,14 +227,15 @@ class RecurrentOp : public RecurrentBase { ...@@ -227,14 +227,15 @@ class RecurrentOp : public RecurrentBase {
: RecurrentBase(type, inputs, outputs, attrs) {} : RecurrentBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &place) const override {
auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope)); auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
VLOG(3) << "Static RNN input sequence length = " << seq_len; VLOG(3) << "Static RNN input sequence length = " << seq_len;
StepScopes scopes = CreateStepScopes(scope, seq_len); StepScopes scopes = CreateStepScopes(scope, seq_len);
auto reverse = Attr<bool>(kReverse); auto reverse = Attr<bool>(kReverse);
framework::Executor executor(dev_ctx); framework::Executor executor(place);
auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program(); auto *program = block->Program();
for (size_t i = 0; i < seq_len; ++i) { for (size_t i = 0; i < seq_len; ++i) {
...@@ -270,6 +271,10 @@ class RecurrentOp : public RecurrentBase { ...@@ -270,6 +271,10 @@ class RecurrentOp : public RecurrentBase {
executor.Run(*program, &cur_scope, block->ID(), executor.Run(*program, &cur_scope, block->ID(),
false /*create_local_scope*/); false /*create_local_scope*/);
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
// Copy inside::output -> outside::output // Copy inside::output -> outside::output
// outside::output[seq_offset: seq_offset + 1] = inside::output // outside::output[seq_offset: seq_offset + 1] = inside::output
this->LinkTensorWithCallback( this->LinkTensorWithCallback(
...@@ -278,14 +283,13 @@ class RecurrentOp : public RecurrentBase { ...@@ -278,14 +283,13 @@ class RecurrentOp : public RecurrentBase {
framework::LoDTensor *dst_tensor) { framework::LoDTensor *dst_tensor) {
if (i == 0) { // create output tensor at begin if (i == 0) { // create output tensor at begin
dst_tensor->Resize(PrependDims(seq_len, src_tensor.dims())); dst_tensor->Resize(PrependDims(seq_len, src_tensor.dims()));
dst_tensor->mutable_data(dev_ctx.GetPlace(), src_tensor.type()); dst_tensor->mutable_data(place, src_tensor.type());
} }
auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1); auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
// Explicit copy output since the local RNN scope can be destroyed // Explicit copy output since the local RNN scope can be destroyed
// early. // early.
framework::CopyFrom(src_tensor, dev_ctx.GetPlace(), dev_ctx, framework::CopyFrom(src_tensor, place, dev_ctx, &dst_out);
&dst_out);
}); });
scopes.Next(); scopes.Next();
...@@ -311,15 +315,20 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -311,15 +315,20 @@ class RecurrentGradOp : public RecurrentBase {
: RecurrentBase(type, inputs, outputs, attrs) {} : RecurrentBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &place) const override {
auto seq_len = static_cast<size_t>(GetSequenceLength(scope)); auto seq_len = static_cast<size_t>(GetSequenceLength(scope));
StepScopes scopes = CreateStepScopes(scope, seq_len); StepScopes scopes = CreateStepScopes(scope, seq_len);
auto reverse = Attr<bool>(kReverse); auto reverse = Attr<bool>(kReverse);
framework::Executor executor(dev_ctx); framework::Executor executor(place);
auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program(); auto *program = block->Program();
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
for (size_t step_id = 0; step_id < seq_len; ++step_id) { for (size_t step_id = 0; step_id < seq_len; ++step_id) {
size_t seq_offset = reverse ? step_id : seq_len - step_id - 1; size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
VLOG(3) << "Recurrent backward operate at the time step " << seq_offset; VLOG(3) << "Recurrent backward operate at the time step " << seq_offset;
...@@ -366,8 +375,7 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -366,8 +375,7 @@ class RecurrentGradOp : public RecurrentBase {
auto *cur_grad_var = cur_scope.Var(cur_grad); auto *cur_grad_var = cur_scope.Var(cur_grad);
auto cur_grad_tensor = auto cur_grad_tensor =
cur_grad_var->GetMutable<framework::LoDTensor>(); cur_grad_var->GetMutable<framework::LoDTensor>();
framework::CopyFrom(ex_tensor, dev_ctx.GetPlace(), dev_ctx, framework::CopyFrom(ex_tensor, place, dev_ctx, cur_grad_tensor);
cur_grad_tensor);
} }
} }
...@@ -410,7 +418,7 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -410,7 +418,7 @@ class RecurrentGradOp : public RecurrentBase {
auto zero_op = framework::OpRegistry::CreateOp( auto zero_op = framework::OpRegistry::CreateOp(
"fill_constant", framework::VariableNameMap{}, "fill_constant", framework::VariableNameMap{},
{{"Out", {pg_names[param_id]}}}, attrs); {{"Out", {pg_names[param_id]}}}, attrs);
zero_op->Run(scope, dev_ctx); zero_op->Run(scope, place);
} }
auto new_inside_name = cur_scope.Rename(inside_grad_name); auto new_inside_name = cur_scope.Rename(inside_grad_name);
...@@ -419,7 +427,7 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -419,7 +427,7 @@ class RecurrentGradOp : public RecurrentBase {
auto sum_op = framework::OpRegistry::CreateOp( auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {pg_names[param_id], new_inside_name}}}, "sum", {{"X", {pg_names[param_id], new_inside_name}}},
{{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
sum_op->Run(cur_scope, dev_ctx); sum_op->Run(cur_scope, place);
cur_scope.Rename(new_inside_name, inside_grad_name); cur_scope.Rename(new_inside_name, inside_grad_name);
} }
...@@ -437,11 +445,11 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -437,11 +445,11 @@ class RecurrentGradOp : public RecurrentBase {
} }
if (step_id == 0) { // alloc memory if (step_id == 0) { // alloc memory
outside->Resize(PrependDims(seq_len, inside.dims())); outside->Resize(PrependDims(seq_len, inside.dims()));
outside->mutable_data(dev_ctx.GetPlace(), inside.type()); outside->mutable_data(place, inside.type());
} }
auto dst = outside->Slice(seq_offset, seq_offset + 1); auto dst = outside->Slice(seq_offset, seq_offset + 1);
framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, &dst); framework::CopyFrom(inside, place, dev_ctx, &dst);
}); });
VLOG(5) << "Link outside gradient finished "; VLOG(5) << "Link outside gradient finished ";
...@@ -453,8 +461,8 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -453,8 +461,8 @@ class RecurrentGradOp : public RecurrentBase {
[&](const framework::LoDTensor &inside, [&](const framework::LoDTensor &inside,
framework::LoDTensor *outside) { framework::LoDTensor *outside) {
outside->Resize(inside.dims()); outside->Resize(inside.dims());
outside->mutable_data(dev_ctx.GetPlace(), inside.type()); outside->mutable_data(place, inside.type());
framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, outside); framework::CopyFrom(inside, place, dev_ctx, outside);
}); });
VLOG(5) << "Link initialize state gradient finished "; VLOG(5) << "Link initialize state gradient finished ";
} }
...@@ -570,7 +578,7 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -570,7 +578,7 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
for (auto &input_param : this->InputNames()) { for (auto &input_param : this->InputNames()) {
grad->SetInput(input_param, this->Input(input_param)); grad->SetInput(input_param, this->Input(input_param));
grad->SetOutput(framework::GradVarName(input_param), grad->SetOutput(framework::GradVarName(input_param),
this->InputGrad(input_param)); this->InputGrad(input_param, false));
} }
for (auto &output_param : this->OutputNames()) { for (auto &output_param : this->OutputNames()) {
......
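The recurring change in the hunks above (and in several operators below) is that OperatorBase::Run now receives a platform::Place instead of a platform::DeviceContext, and the operator borrows the matching context from the global DeviceContextPool before copying tensors. A minimal fragment of that pattern, mirroring the diff and not standalone (the surrounding operator class and includes are assumed):

// Fragment: new Run signature plus pooled device context, as in RecurrentOp above.
void Run(const framework::Scope &scope,
         const platform::Place &place) const override {
  // get device context from pool
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
  auto &dev_ctx = *pool.Borrow(place);
  // copies now take the place plus the borrowed context, e.g.
  // framework::CopyFrom(src_tensor, place, dev_ctx, &dst_tensor);
}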
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include "paddle/framework/framework.pb.h" #include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/framework/proto_desc.h"
#include "paddle/operators/detail/send_recv_impl.h" #include "paddle/operators/detail/send_recv_impl.h"
#include "paddle/operators/detail/simple_block_queue.h" #include "paddle/operators/detail/simple_block_queue.h"
...@@ -61,29 +62,78 @@ class RecvOp : public framework::OperatorBase { ...@@ -61,29 +62,78 @@ class RecvOp : public framework::OperatorBase {
server_thread_->join(); server_thread_->join();
} }
std::string GetGradVarNameForTrainer(const std::string &varname) const {
if (grads_counter_.find(varname) == grads_counter_.end()) {
grads_counter_[varname] = 0;
}
char ret[256];
snprintf(ret, sizeof(ret), "%s.trainer_%d", varname.c_str(),
grads_counter_[varname]++);
return std::string(ret);
}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &dev_place) const override {
// blocking get one var from client. // FIXME(typhoonzero): no new scopes for every run.
const framework::LoDTensor &t = rpc_service_->Get();
framework::Scope &recv_scope = scope.NewScope(); framework::Scope &recv_scope = scope.NewScope();
// set graph input var rpc_service_->SetScope(&recv_scope);
auto *var = recv_scope.Var(Input("RX")); auto param_list = Attr<std::vector<std::string>>("ParamList");
auto *tensor = var->GetMutable<framework::LoDTensor>(); auto grad_list = Attr<std::vector<std::string>>("GradList");
// FIXME(typhoonzero): do not copy auto trainer_count = Attr<int>("Trainers");
framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor); size_t param_count = param_list.size();
rpc_service_->Reset();
std::string program_str = Attr<std::string>("OptimizeProgram"); // TODO(typhoonzero): change this to a while_op for every cluster-batch.
framework::ProgramDesc program_desc; while (true) {
program_desc.ParseFromString(program_str); // Get from multiple trainers, we don't care about order in which
framework::ProgramDescBind program(program_desc); // the gradient arrives, just add suffix 0~n then average the gradient.
framework::Executor executor(dev_ctx); for (size_t i = 0; i < param_count * trainer_count; ++i) {
// Run sub graph to get optimized tensor // blocking get one var from client.
executor.Run(program, &recv_scope, 0, /*global_block*/ const detail::TensorWithName &v = rpc_service_->Get();
false /*create_local_scope*/); auto grad_var_name = v.first;
auto it = std::find(grad_list.begin(), grad_list.end(), grad_var_name);
auto *out_var = recv_scope.FindVar("Out"); std::string param_var_name;
// push back if (it != grad_list.end()) {
rpc_service_->Push(out_var->Get<framework::LoDTensor>()); param_var_name = param_list[it - grad_list.begin()];
} else {
LOG(ERROR) << "grad has no paired param!";
}
VLOG(3) << "recved grad: " << grad_var_name
<< " updating param: " << param_var_name;
auto *merged_grad = recv_scope.FindVar(grad_var_name);
if (merged_grad == nullptr) {
// create output of merged var.
auto merged_var = recv_scope.Var(grad_var_name);
merged_var->GetMutable<framework::LoDTensor>();
}
if (trainer_count > 1) {
grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
}
auto *var = recv_scope.Var(grad_var_name);
auto *tensor = var->GetMutable<framework::LoDTensor>();
// FIXME(typhoonzero): do not copy
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(dev_place);
framework::CopyFrom(v.second, dev_place, dev_ctx, tensor);
}
rpc_service_->Reset();
std::string program_str = Attr<std::string>("OptimizeProgram");
framework::proto::ProgramDesc program_desc;
program_desc.ParseFromString(program_str);
framework::ProgramDesc program(program_desc);
framework::Executor executor(dev_place);
// Run sub graph to get optimized tensor
try {
executor.Run(program, &recv_scope, 0, /*global_block*/
false /*create_local_scope*/, false /*create_vars*/);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
rpc_service_->Done();
grads_counter_.clear();
} // while(true)
} }
protected: protected:
...@@ -93,13 +143,14 @@ class RecvOp : public framework::OperatorBase { ...@@ -93,13 +143,14 @@ class RecvOp : public framework::OperatorBase {
// grpc send/recv service implement to register. // grpc send/recv service implement to register.
std::shared_ptr<detail::SendRecvServerImpl> rpc_service_; std::shared_ptr<detail::SendRecvServerImpl> rpc_service_;
std::shared_ptr<std::thread> server_thread_; std::shared_ptr<std::thread> server_thread_;
mutable std::unordered_map<std::string, int> grads_counter_;
}; };
class RecvOpMaker : public framework::OpProtoAndCheckerMaker { class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker) RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("RX", "(Tensor) Input tensor to be saved"); AddInput("RX", "(Tensor) Input tensor to be optimized").AsDuplicable();
AddComment(R"DOC( AddComment(R"DOC(
Recv operator Recv operator
...@@ -112,6 +163,17 @@ This operator will recv tensor from send_op ...@@ -112,6 +163,17 @@ This operator will recv tensor from send_op
.AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
AddAttr<std::string>("OptimizeProgram", "type string", AddAttr<std::string>("OptimizeProgram", "type string",
"Serialized ProgramDesc string for recv to run."); "Serialized ProgramDesc string for recv to run.");
AddAttr<std::vector<std::string>>(
"ParamList", "type list of string",
"grad->param name mapping to find which param to optimize.")
.SetDefault({});
AddAttr<std::vector<std::string>>(
"GradList", "type list of string",
"grad->param name mapping to find which param to optimize.")
.SetDefault({});
AddAttr<int>("Trainers", "type int",
"Number of trainers in the current cluster job")
.SetDefault(1);
} }
}; };
......
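RecvOp above receives param_count * trainer_count gradients per step and gives each arriving copy a ".trainer_<n>" suffix so copies from different trainers do not overwrite each other before they are averaged. A standalone sketch of that naming scheme (plain C++; the gradient variable name is illustrative):

#include <cstdio>
#include <map>
#include <string>

// Sketch of the per-trainer gradient naming used by RecvOp::GetGradVarNameForTrainer.
std::string GradVarNameForTrainer(std::map<std::string, int>* counters,
                                  const std::string& varname) {
  char ret[256];
  std::snprintf(ret, sizeof(ret), "%s.trainer_%d", varname.c_str(),
                (*counters)[varname]++);
  return std::string(ret);
}

int main() {
  std::map<std::string, int> counters;
  // "w@GRAD" is a hypothetical gradient variable name, used only for illustration.
  std::printf("%s\n", GradVarNameForTrainer(&counters, "w@GRAD").c_str());  // w@GRAD.trainer_0
  std::printf("%s\n", GradVarNameForTrainer(&counters, "w@GRAD").c_str());  // w@GRAD.trainer_1
  return 0;
}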
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/detail/safe_ref.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace operators {
class ReorderLoDTensorByRankTableOpProtoMaker
: public framework::OpProtoAndCheckerMaker {
public:
ReorderLoDTensorByRankTableOpProtoMaker(OpProto *proto,
OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(LoDTensor) the input lod tensor need to be reordered.");
AddInput("RankTable",
"(LoDRankTable) the rank table that input need follow");
AddOutput("Out", "(LoDTensor) reordered lod tensor");
AddComment(R"DOC(ReorderLoDTensorByRankTable
Reorder the input X by the rank of `RankTable`. If `RankTable` is ordered by
index [3, 0, 2, 1], input X will reorder its sequences: the third sequence of
X becomes the first sequence of the output.
NOTE: The RankTable does not need to be computed from X.
For example:
The X = [Seq0, Seq1, Seq2, Seq3]. The indices of RankTable are [3, 0, 2, 1].
The Out = [Seq3, Seq0, Seq2, Seq1] with correct LoD information.
)DOC");
}
};
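A standalone sketch of the reordering the DOC above describes: the i-th output sequence is the rank_table[i]-th input sequence. Names here are illustrative, not the operator's API:

#include <cassert>
#include <string>
#include <vector>

// Hypothetical helper illustrating the forward reorder: out[i] = x[indices[i]].
std::vector<std::string> ReorderByRank(const std::vector<std::string>& x,
                                       const std::vector<size_t>& indices) {
  std::vector<std::string> out;
  out.reserve(indices.size());
  for (size_t idx : indices) {
    out.push_back(x[idx]);
  }
  return out;
}

int main() {
  // X = [Seq0, Seq1, Seq2, Seq3], RankTable indices = [3, 0, 2, 1]
  auto out = ReorderByRank({"Seq0", "Seq1", "Seq2", "Seq3"}, {3, 0, 2, 1});
  assert(out[0] == "Seq3" && out[1] == "Seq0" && out[2] == "Seq2" && out[3] == "Seq1");
  return 0;
}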
class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
public:
ReorderLoDTensorByRankTableBase(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::Place &place) const override {
auto &x =
detail::Ref(scope.FindVar(Input("X")),
"Cannot find input lod tensor variable %s", Input("X"))
.Get<framework::LoDTensor>();
auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")),
"Cannot find input rank table variable %s",
Input("RankTable"))
.Get<framework::LoDRankTable>();
auto &out =
*detail::Ref(scope.FindVar(Output("Out")),
"Cannot find output lod tensor variable %s", Output("Out"))
.GetMutable<framework::LoDTensor>();
out.Resize(x.dims());
out.mutable_data(x.place(), x.type());
this->process(place, x, rank_table, &out);
}
protected:
virtual void process(const platform::Place &place,
const framework::LoDTensor &x,
const framework::LoDRankTable &rank_table,
framework::LoDTensor *out) const = 0;
struct AbsoluteRankTableItem {
size_t offset; // the absolute/accumulated offset.
size_t length; // the length
framework::LoD lod;
};
std::vector<AbsoluteRankTableItem> GetAbsoluteOffsetAndLengthByLoDRankTable(
const framework::LoDTensor &x) const {
std::vector<AbsoluteRankTableItem> absolute_table;
size_t level = 0;
size_t size = x.lod()[level].size();
for (size_t i = 0; i < size - 1; ++i) {
auto lod_offset =
framework::GetSubLoDAndAbsoluteOffset(x.lod(), i, i + 1, level);
auto &offset = lod_offset.second;
absolute_table.emplace_back();
absolute_table.back().length = offset.second - offset.first;
absolute_table.back().offset = offset.first;
absolute_table.back().lod = lod_offset.first;
}
return absolute_table;
}
size_t CopyTensorAndLod(const platform::Place &place,
const AbsoluteRankTableItem &item,
const framework::LoDTensor &x,
framework::LoDTensor *out, size_t out_offset) const {
auto &out_lod = *out->mutable_lod();
auto len = item.length;
auto x_offset = item.offset;
if (out_lod.empty()) {
for (size_t i = 0; i < item.lod.size(); ++i) {
out_lod.push_back(std::vector<size_t>({0}));
}
}
for (size_t i = 0; i < out_lod.size(); ++i) {
auto &out_v = out_lod[i];
auto &new_lod_v = item.lod[i];
for (auto &detail : new_lod_v) {
out_v.push_back(out_v.back() + detail);
}
}
auto x_sliced = x.Slice(x_offset, x_offset + len);
auto out_sliced = out->Slice(out_offset, out_offset + len);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
out_offset += len;
return out_offset;
}
};
class ReorderLoDTensorByRankTableOp : public ReorderLoDTensorByRankTableBase {
public:
ReorderLoDTensorByRankTableOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {}
protected:
void process(const platform::Place &place, const framework::LoDTensor &x,
const framework::LoDRankTable &rank_table,
framework::LoDTensor *out) const override {
auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x);
size_t out_offset = 0;
out->mutable_lod()->clear();
for (auto &item : rank_table.items()) {
PADDLE_ENFORCE_LT(item.index, absolute_table.size());
out_offset = CopyTensorAndLod(place, absolute_table[item.index], x, out,
out_offset);
}
}
};
class IdentityInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
context->SetOutputDim("Out", context->GetInputDim("X"));
}
};
class ReorderLodTensorByRankGradOpMaker
: public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("reorder_lod_tensor_by_rank_grad");
grad_op->SetInput("X", OutputGrad("Out"));
grad_op->SetOutput("Out", InputGrad("X"));
grad_op->SetInput("RankTable", Input("RankTable"));
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class ReorderLoDTensorByRankGradOp : public ReorderLoDTensorByRankTableBase {
public:
ReorderLoDTensorByRankGradOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {}
protected:
void process(const platform::Place &place, const framework::LoDTensor &x,
const framework::LoDRankTable &rank_table,
framework::LoDTensor *out) const override {
auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x);
// offsets = enumerate([item.index for item in rank_table.items()])
std::vector<std::pair<size_t, size_t>> offsets;
offsets.reserve(rank_table.items().size());
for (size_t i = 0; i < rank_table.items().size(); ++i) {
offsets.push_back({i, rank_table.items()[i].index});
}
// offsets.sort(key=lambda x: x[1])
std::sort(
offsets.begin(), offsets.end(),
[](const std::pair<size_t, size_t> &a,
const std::pair<size_t, size_t> &b) { return a.second < b.second; });
// Copy TensorAndLod
size_t out_offset = 0;
for (auto &offset : offsets) {
out_offset = this->CopyTensorAndLod(place, absolute_table[offset.first],
x, out, out_offset);
}
}
};
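The grad op above restores the original order by enumerating (position, index) pairs and sorting them by index, which inverts the permutation applied in the forward pass. A standalone sketch of that trick:

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// Sketch of the inverse-permutation step: enumerate (output_position, source_index)
// pairs, then sort by source_index so gradient slices are copied back in X's order.
std::vector<size_t> InvertPermutation(const std::vector<size_t>& indices) {
  std::vector<std::pair<size_t, size_t>> offsets;  // (position, index)
  offsets.reserve(indices.size());
  for (size_t i = 0; i < indices.size(); ++i) offsets.push_back({i, indices[i]});
  std::sort(offsets.begin(), offsets.end(),
            [](const std::pair<size_t, size_t>& a,
               const std::pair<size_t, size_t>& b) { return a.second < b.second; });
  std::vector<size_t> order;
  for (auto& p : offsets) order.push_back(p.first);
  return order;  // for indices [3, 0, 2, 1] this yields [1, 3, 2, 0]
}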
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(reorder_lod_tensor_by_rank,
ops::ReorderLoDTensorByRankTableOp,
ops::ReorderLodTensorByRankGradOpMaker,
ops::ReorderLoDTensorByRankTableOpProtoMaker,
ops::IdentityInferShape);
REGISTER_OPERATOR(reorder_lod_tensor_by_rank_grad,
ops::ReorderLoDTensorByRankGradOp, ops::IdentityInferShape);
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
reshape, reshape,
paddle::operators::ReshapeKernel<paddle::platform::GPUPlace, float>); paddle::operators::ReshapeKernel<paddle::platform::CUDAPlace, float>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
reshape_grad, reshape_grad,
paddle::operators::ReshapeGradKernel<paddle::platform::GPUPlace, float>); paddle::operators::ReshapeGradKernel<paddle::platform::CUDAPlace, float>);
...@@ -25,7 +25,7 @@ class RNNMemoryHelperOp : public framework::OperatorBase { ...@@ -25,7 +25,7 @@ class RNNMemoryHelperOp : public framework::OperatorBase {
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &dev_place) const override {
auto mem_var_name = Input("X"); auto mem_var_name = Input("X");
auto *mem_var = scope.FindVar(mem_var_name); auto *mem_var = scope.FindVar(mem_var_name);
PADDLE_ENFORCE(mem_var != nullptr, PADDLE_ENFORCE(mem_var != nullptr,
...@@ -77,7 +77,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { ...@@ -77,7 +77,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &dev_place) const override {
auto out_grad_var_name = Input(framework::GradVarName("Out")); auto out_grad_var_name = Input(framework::GradVarName("Out"));
auto *out_grad_var = scope.FindVar(out_grad_var_name); auto *out_grad_var = scope.FindVar(out_grad_var_name);
...@@ -100,7 +100,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { ...@@ -100,7 +100,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
auto zero_op = framework::OpRegistry::CreateOp( auto zero_op = framework::OpRegistry::CreateOp(
"fill_constant", {}, {{"Out", {in_grad_var_name}}}, attrs); "fill_constant", {}, {{"Out", {in_grad_var_name}}}, attrs);
zero_op->Run(scope, dev_ctx); zero_op->Run(scope, dev_place);
} else { } else {
auto &out_grad_tensor = out_grad_var->Get<framework::LoDTensor>(); auto &out_grad_tensor = out_grad_var->Get<framework::LoDTensor>();
auto *in_grad_tensor = in_grad_var->GetMutable<framework::LoDTensor>(); auto *in_grad_tensor = in_grad_var->GetMutable<framework::LoDTensor>();
......
...@@ -68,7 +68,7 @@ class ROIPoolOp : public framework::OperatorWithKernel { ...@@ -68,7 +68,7 @@ class ROIPoolOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
...@@ -89,7 +89,7 @@ class ROIPoolGradOp : public framework::OperatorWithKernel { ...@@ -89,7 +89,7 @@ class ROIPoolGradOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
......
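This hunk, like several below (scatter, sequence_pool, sequence_slice, softmax_with_cross_entropy, sum, uniform_random, unpool), only renames the kernel-selection hook from GetKernelType to GetActualKernelType. A fragment of the new override; the second constructor argument is elided in the diff, so ctx.device_context() below is an assumption:

// Fragment, not standalone: assumes the surrounding OperatorWithKernel subclass.
framework::OpKernelType GetActualKernelType(
    const framework::ExecutionContext& ctx) const override {
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
      ctx.device_context());  // second argument assumed; it is elided ("...") above
}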
...@@ -21,7 +21,7 @@ USE_NO_KERNEL_OP(load); ...@@ -21,7 +21,7 @@ USE_NO_KERNEL_OP(load);
TEST(SaveLoadOp, CPU) { TEST(SaveLoadOp, CPU) {
paddle::framework::Scope scope; paddle::framework::Scope scope;
paddle::platform::CPUPlace place; paddle::platform::CPUPlace place;
paddle::platform::CPUDeviceContext ctx(place);
auto var = scope.Var("test_var"); auto var = scope.Var("test_var");
auto tensor = var->GetMutable<paddle::framework::LoDTensor>(); auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize({10, 10}); tensor->Resize({10, 10});
...@@ -42,13 +42,13 @@ TEST(SaveLoadOp, CPU) { ...@@ -42,13 +42,13 @@ TEST(SaveLoadOp, CPU) {
auto save_op = paddle::framework::OpRegistry::CreateOp( auto save_op = paddle::framework::OpRegistry::CreateOp(
"save", {{"X", {"test_var"}}}, {}, attrs); "save", {{"X", {"test_var"}}}, {}, attrs);
save_op->Run(scope, ctx); save_op->Run(scope, place);
auto load_var = scope.Var("out_var"); auto load_var = scope.Var("out_var");
auto target = load_var->GetMutable<paddle::framework::LoDTensor>(); auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
auto load_op = paddle::framework::OpRegistry::CreateOp( auto load_op = paddle::framework::OpRegistry::CreateOp(
"load", {}, {{"Out", {"out_var"}}}, attrs); "load", {}, {{"Out", {"out_var"}}}, attrs);
load_op->Run(scope, ctx); load_op->Run(scope, place);
int* actual = target->data<int>(); int* actual = target->data<int>();
for (int64_t i = 0; i < tensor->numel(); ++i) { for (int64_t i = 0; i < tensor->numel(); ++i) {
EXPECT_EQ(expect[i], actual[i]); EXPECT_EQ(expect[i], actual[i]);
......
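The test change above is the caller side of the same Run refactor: the CPUDeviceContext local is gone and the op is run with the Place directly. A reduced fragment of the pattern; the file_path value is only an illustrative placeholder, the real one is elided in the diff:

// Fragment of the test pattern above (Paddle test environment assumed).
paddle::framework::Scope scope;
paddle::platform::CPUPlace place;
paddle::framework::AttributeMap attrs;
attrs.insert({"file_path", std::string("tensor.save")});  // placeholder value
auto save_op = paddle::framework::OpRegistry::CreateOp(
    "save", {{"X", {"test_var"}}}, {}, attrs);
save_op->Run(scope, place);  // Place instead of DeviceContext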
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "paddle/framework/framework.pb.h" #include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/platform/device_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -62,7 +63,7 @@ class SaveOp : public framework::OperatorBase { ...@@ -62,7 +63,7 @@ class SaveOp : public framework::OperatorBase {
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &place) const override {
auto filename = Attr<std::string>("file_path"); auto filename = Attr<std::string>("file_path");
auto overwrite = Attr<bool>("overwrite"); auto overwrite = Attr<bool>("overwrite");
...@@ -88,6 +89,11 @@ class SaveOp : public framework::OperatorBase { ...@@ -88,6 +89,11 @@ class SaveOp : public framework::OperatorBase {
"SaveOp only support LoDTensor, %s has wrong type", iname); "SaveOp only support LoDTensor, %s has wrong type", iname);
auto &tensor = var->Get<framework::LoDTensor>(); auto &tensor = var->Get<framework::LoDTensor>();
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
framework::SerializeToStream(fout, tensor, dev_ctx); framework::SerializeToStream(fout, tensor, dev_ctx);
} }
}; };
......
...@@ -49,7 +49,7 @@ class ScatterOp : public framework::OperatorWithKernel { ...@@ -49,7 +49,7 @@ class ScatterOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Ref")->type()), framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
...@@ -68,7 +68,7 @@ class ScatterGradOp : public framework::OperatorWithKernel { ...@@ -68,7 +68,7 @@ class ScatterGradOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Ref")->type()), framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
......
...@@ -34,45 +34,56 @@ class SendOp : public framework::OperatorBase { ...@@ -34,45 +34,56 @@ class SendOp : public framework::OperatorBase {
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) { : OperatorBase(type, inputs, outputs, attrs) {
// init client when the operator is created at runtime. // init client when the operator is created at runtime.
if (!client_) { std::vector<std::string> endpoints =
std::string endpoint = Attr<std::string>("endpoint"); Attr<std::vector<std::string>>("endpoints");
client_.reset(new detail::RPCClient( for (auto ep : endpoints) {
grpc::CreateChannel(endpoint, grpc::InsecureChannelCredentials()))); client_map_[ep].reset(new detail::RPCClient(
// TODO(typhoonzero): how to call InitVariables grpc::CreateChannel(ep, grpc::InsecureChannelCredentials())));
} }
} }
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::DeviceContext &dev_ctx) const override {
auto iname = Input("X"); auto ins = Inputs("X");
auto oname = Output("Out"); std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
// TODO(typhoonzero): currently it's non-blocking, // TODO(typhoonzero): use async calls to send multiple variables asynchronously.
// should block until server responds. for (size_t i = 0; i < ins.size(); ++i) {
bool ret = client_->SendVariable(scope, iname, oname); bool ret = client_map_[epmap[i]]->SendVariable(scope, ins[i]);
if (!ret) { if (!ret) {
LOG(ERROR) << "send variable error"; LOG(ERROR) << "send variable error: " << ins[i];
}
}
// TODO(typhoonzero): support async optimization
client_map_[epmap[0]]->Wait();
for (size_t i = 0; i < ins.size(); ++i) {
bool ret = client_map_[epmap[i]]->GetVariable(scope, ins[i]);
if (!ret) {
LOG(ERROR) << "GetVariable error: " << ins[i];
}
} }
} }
protected: protected:
std::shared_ptr<detail::RPCClient> client_{nullptr}; mutable std::unordered_map<std::string, std::shared_ptr<detail::RPCClient>>
client_map_;
}; };
class SendOpMaker : public framework::OpProtoAndCheckerMaker { class SendOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
SendOpMaker(OpProto *proto, OpAttrChecker *op_checker) SendOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor) Input tensor to be saved"); AddInput("X", "(Tensor) Input tensor to be send").AsDuplicable();
AddOutput("Out", "(Tensor) Output fetched from server");
AddComment(R"DOC( AddComment(R"DOC(
Send operator Send operator
This operator will send tensor to recv_op This operator will send tensor to recv_op
)DOC"); )DOC");
AddAttr<std::string>("endpoint", AddAttr<std::vector<std::string>>("endpoints",
"(string, default 127.0.0.1:6164)" "(string vector, default 127.0.0.1:6164)"
"IP address to listen on.") "Server endpoints to send variables to.");
.SetDefault("127.0.0.1:6164") AddAttr<std::vector<std::string>>("epmap",
.AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); "(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input "
"variables for mapping");
} }
}; };
......
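The new SendOp above keeps one RPC client per endpoint and pairs the i-th input variable with the i-th entry of the epmap attribute. A standalone sketch of just that mapping, with FakeClient standing in for detail::RPCClient:

#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Stand-in for detail::RPCClient; only the input-to-endpoint mapping is illustrated.
struct FakeClient {
  explicit FakeClient(std::string ep) : endpoint(std::move(ep)) {}
  void Send(const std::string& var) {
    std::cout << "send " << var << " -> " << endpoint << "\n";
  }
  std::string endpoint;
};

int main() {
  // One client per endpoint, as built in the SendOp constructor above.
  std::vector<std::string> endpoints = {"127.0.0.1:6174", "127.0.0.1:6175"};
  std::unordered_map<std::string, std::shared_ptr<FakeClient>> client_map;
  for (const auto& ep : endpoints) client_map[ep] = std::make_shared<FakeClient>(ep);

  // ins[i] is sent through the client selected by epmap[i], as in SendOp::Run.
  std::vector<std::string> ins = {"x0", "x1"};
  std::vector<std::string> epmap = {"127.0.0.1:6174", "127.0.0.1:6175"};
  for (size_t i = 0; i < ins.size(); ++i) client_map[epmap[i]]->Send(ins[i]);
  return 0;
}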
...@@ -16,12 +16,14 @@ ...@@ -16,12 +16,14 @@
// a RemoteOptimizer. // a RemoteOptimizer.
#include <unistd.h> #include <unistd.h>
#include <string>
#include <thread> #include <thread>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h" #include "paddle/framework/operator.h"
#include "paddle/framework/program_desc.h" #include "paddle/framework/program_desc.h"
#include "paddle/string/printf.h"
USE_NO_KERNEL_OP(send); USE_NO_KERNEL_OP(send);
USE_NO_KERNEL_OP(recv); USE_NO_KERNEL_OP(recv);
...@@ -33,30 +35,33 @@ std::unique_ptr<paddle::framework::OperatorBase> recv_op; ...@@ -33,30 +35,33 @@ std::unique_ptr<paddle::framework::OperatorBase> recv_op;
void InitTensorsInScope(paddle::framework::Scope &scope, void InitTensorsInScope(paddle::framework::Scope &scope,
paddle::platform::CPUPlace &place) { paddle::platform::CPUPlace &place) {
paddle::platform::CPUDeviceContext ctx(place); paddle::platform::CPUDeviceContext ctx(place);
auto var = scope.Var("X"); for (int i = 0; i < 2; ++i) {
auto tensor = var->GetMutable<paddle::framework::LoDTensor>(); auto var_name = paddle::string::Sprintf("x%d", i);
tensor->Resize({10, 10}); auto var = scope.Var(var_name);
float *expect = tensor->mutable_data<float>(place); auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
for (int64_t i = 0; i < tensor->numel(); ++i) { tensor->Resize({10, 10});
expect[i] = static_cast<float>(i); float *expect = tensor->mutable_data<float>(place);
for (int64_t i = 0; i < tensor->numel(); ++i) {
expect[i] = static_cast<float>(i);
}
} }
auto out_var = scope.Var("Out"); auto out_var = scope.Var("Out");
auto out_tensor = out_var->GetMutable<paddle::framework::LoDTensor>(); auto out_tensor = out_var->GetMutable<paddle::framework::LoDTensor>();
out_tensor->Resize({10, 10}); out_tensor->Resize({10, 10});
tensor->mutable_data<float>(place); // allocate out_tensor->mutable_data<float>(place); // allocate
} }
void AddOp(const std::string &type, void AddOp(const std::string &type,
const paddle::framework::VariableNameMap &inputs, const paddle::framework::VariableNameMap &inputs,
const paddle::framework::VariableNameMap &outputs, const paddle::framework::VariableNameMap &outputs,
paddle::framework::AttributeMap attrs, paddle::framework::AttributeMap attrs,
paddle::framework::BlockDescBind *block) { paddle::framework::BlockDesc *block) {
// insert output // insert output
for (auto kv : outputs) { for (auto kv : outputs) {
for (auto v : kv.second) { for (auto v : kv.second) {
auto var = block->Var(v); auto var = block->Var(v);
var->SetDataType(paddle::framework::DataType::FP32); var->SetDataType(paddle::framework::proto::DataType::FP32);
} }
} }
...@@ -78,10 +83,10 @@ void StartServerNet() { ...@@ -78,10 +83,10 @@ void StartServerNet() {
InitTensorsInScope(scope, place); InitTensorsInScope(scope, place);
// sub program run in recv_op, for simple test we use sum // sub program run in recv_op, for simple test we use sum
paddle::framework::ProgramDescBind program; paddle::framework::ProgramDesc program;
paddle::framework::BlockDescBind *block = program.MutableBlock(0); paddle::framework::BlockDesc *block = program.MutableBlock(0);
// X for server side tensors, RX for received tensors, must be of the same shape. // X for server side tensors, RX for received tensors, must be of the same shape.
AddOp("sum", {{"X", {"X", "RX"}}}, {{"Out", {"Out"}}}, {}, block); AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, block);
paddle::framework::AttributeMap attrs; paddle::framework::AttributeMap attrs;
attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
...@@ -89,8 +94,8 @@ void StartServerNet() { ...@@ -89,8 +94,8 @@ void StartServerNet() {
PADDLE_ENFORCE(program.Proto()->SerializeToString(&program_proto)); PADDLE_ENFORCE(program.Proto()->SerializeToString(&program_proto));
attrs.insert({"OptimizeProgram", program_proto}); attrs.insert({"OptimizeProgram", program_proto});
recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}}, recv_op = paddle::framework::OpRegistry::CreateOp(
{{"Out", {"Out"}}}, attrs); "recv", {{"RX", {"x0", "x1"}}}, {{"Out", {"Out"}}}, attrs);
paddle::platform::CPUDeviceContext ctx(place); paddle::platform::CPUDeviceContext ctx(place);
recv_op->Run(scope, ctx); recv_op->Run(scope, ctx);
} }
...@@ -107,11 +112,11 @@ TEST(SendRecvOp, CPU) { ...@@ -107,11 +112,11 @@ TEST(SendRecvOp, CPU) {
attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
auto send_op = paddle::framework::OpRegistry::CreateOp( auto send_op = paddle::framework::OpRegistry::CreateOp(
"send", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); "send", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, attrs);
paddle::platform::CPUDeviceContext ctx(place); paddle::platform::CPUDeviceContext ctx(place);
send_op->Run(scope, ctx); send_op->Run(scope, ctx);
auto in_var = scope.Var("X"); auto in_var = scope.Var("x0");
auto tensor = in_var->GetMutable<paddle::framework::LoDTensor>(); auto tensor = in_var->GetMutable<paddle::framework::LoDTensor>();
float *expected = tensor->data<float>(); float *expected = tensor->data<float>();
......
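The test above builds the server-side optimize program by hand: a ProgramDesc whose block 0 holds a single sum op over the two received gradients, serialized into the OptimizeProgram attribute. A reduced fragment of that setup (AddOp is the helper defined in the test):

// Fragment of the test setup shown above (Paddle test helpers assumed).
paddle::framework::ProgramDesc program;
paddle::framework::BlockDesc *block = program.MutableBlock(0);
AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, block);

std::string program_proto;
PADDLE_ENFORCE(program.Proto()->SerializeToString(&program_proto));
paddle::framework::AttributeMap attrs;
attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
attrs.insert({"OptimizeProgram", program_proto});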
...@@ -67,12 +67,12 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -67,12 +67,12 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
"The level should be less than the level number of inputs.") "The level should be less than the level number of inputs.")
.SetDefault(0); .SetDefault(0);
AddComment(R"DOC( AddComment(R"DOC(
The sequence_concat operator concatenates multiple LoDTensors. The sequence_concat operator concatenates multiple LoDTensors.
It only supports sequence (LoD Tensor with level number is 1) It only supports sequence (LoD Tensor with level number is 1)
or a nested sequence (LoD tensor with level number is 2) as its input. or a nested sequence (LoD tensor with level number is 2) as its input.
- Case1: - Case1:
If the axis is other than 0 (here, axis is 1 and level is 1), If the axis is other than 0 (here, axis is 1 and level is 1),
each input should have the same LoD information and the LoD each input should have the same LoD information and the LoD
information of the output keeps the same as the input. information of the output keeps the same as the input.
LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
...@@ -80,7 +80,7 @@ or a nested sequence (LoD tensor with level number is 2) as its input. ...@@ -80,7 +80,7 @@ or a nested sequence (LoD tensor with level number is 2) as its input.
LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4) LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4)
- Case2: - Case2:
If the axis is 0 (here, level is 0), the inputs are concatenated along If the axis is 0 (here, level is 0), the inputs are concatenated along
time steps, and the LoD information of the output needs to be re-computed. time steps, and the LoD information of the output needs to be re-computed.
The LoD information of level-1 should be the same. The LoD information of level-1 should be the same.
...@@ -124,8 +124,9 @@ class SequenceConcatGradOp : public framework::OperatorWithKernel { ...@@ -124,8 +124,9 @@ class SequenceConcatGradOp : public framework::OperatorWithKernel {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(sequence_concat, ops::SequenceConcatOp, ops::SequenceConcatOpMaker, REGISTER_OP_EX(sequence_concat, ops::SequenceConcatOp,
sequence_concat_grad, ops::SequenceConcatGradOp); ops::SequenceConcatOpMaker, sequence_concat_grad,
ops::SequenceConcatGradOp, false);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
sequence_concat, sequence_concat,
ops::SequenceConcatOpKernel<paddle::platform::CPUDeviceContext, float>); ops::SequenceConcatOpKernel<paddle::platform::CPUDeviceContext, float>);
......
...@@ -49,7 +49,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -49,7 +49,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
.AsIntermediate(); .AsIntermediate();
AddAttr<std::string>( AddAttr<std::string>(
"pooltype", "pooltype",
"(int, default AVERAGE) the pooling pooltype of SequencePoolOp.") "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.")
.SetDefault("AVERAGE") .SetDefault("AVERAGE")
.InEnum({"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"}); .InEnum({"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"});
AddComment(R"DOC( AddComment(R"DOC(
...@@ -107,7 +107,7 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { ...@@ -107,7 +107,7 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), framework::ToDataType(ctx.Input<Tensor>("X")->type()),
......
...@@ -48,7 +48,7 @@ class SequenceSliceOp : public framework::OperatorWithKernel { ...@@ -48,7 +48,7 @@ class SequenceSliceOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()), framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
...@@ -69,7 +69,7 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel { ...@@ -69,7 +69,7 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()), framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
......
...@@ -27,11 +27,11 @@ class ShrinkRNNMemoryOp : public ArrayOp { ...@@ -27,11 +27,11 @@ class ShrinkRNNMemoryOp : public ArrayOp {
: ArrayOp(type, inputs, outputs, attrs) {} : ArrayOp(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &place) const override {
auto *x_var = scope.FindVar(Input("X")); auto *x_var = scope.FindVar(Input("X"));
PADDLE_ENFORCE(x_var != nullptr, "Input X must be set"); PADDLE_ENFORCE(x_var != nullptr, "Input X must be set");
auto &x_tensor = x_var->Get<framework::LoDTensor>(); auto &x_tensor = x_var->Get<framework::LoDTensor>();
size_t offset = this->GetOffset(scope, dev_ctx); size_t offset = this->GetOffset(scope, place);
auto *rank_table_var = scope.FindVar(Input("RankTable")); auto *rank_table_var = scope.FindVar(Input("RankTable"));
PADDLE_ENFORCE(rank_table_var != nullptr, "RankTable must be set"); PADDLE_ENFORCE(rank_table_var != nullptr, "RankTable must be set");
auto &rank_table = rank_table_var->Get<framework::LoDRankTable>(); auto &rank_table = rank_table_var->Get<framework::LoDRankTable>();
...@@ -93,7 +93,7 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { ...@@ -93,7 +93,7 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
: ArrayOp(type, inputs, outputs, attrs) {} : ArrayOp(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &place) const override {
auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out"))); auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out")));
auto *dx_var = scope.FindVar(Output(framework::GradVarName("X"))); auto *dx_var = scope.FindVar(Output(framework::GradVarName("X")));
PADDLE_ENFORCE(dx_var != nullptr, "Input Gradient should not be nullptr"); PADDLE_ENFORCE(dx_var != nullptr, "Input Gradient should not be nullptr");
...@@ -105,6 +105,10 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { ...@@ -105,6 +105,10 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
dx_tensor.Resize(x_tensor.dims()); dx_tensor.Resize(x_tensor.dims());
dx_tensor.mutable_data(x_tensor.place(), x_tensor.type()); dx_tensor.mutable_data(x_tensor.place(), x_tensor.type());
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
if (dout_var == nullptr) { // dx_tensor fill zero if (dout_var == nullptr) { // dx_tensor fill zero
math::set_constant(dev_ctx, &dx_tensor, 0.0f); math::set_constant(dev_ctx, &dx_tensor, 0.0f);
} else { } else {
......
...@@ -118,7 +118,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { ...@@ -118,7 +118,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Logits")->type()), framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
...@@ -159,7 +159,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { ...@@ -159,7 +159,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType( framework::ToDataType(
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/memory/memcpy.h" #include "paddle/memory/memcpy.h"
#include "paddle/platform/device_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -33,7 +34,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { ...@@ -33,7 +34,7 @@ class SplitLoDTensorOp : public framework::OperatorBase {
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &dev_place) const override {
auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>(); auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>(); auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
auto *out_true = auto *out_true =
...@@ -44,6 +45,9 @@ class SplitLoDTensorOp : public framework::OperatorBase { ...@@ -44,6 +45,9 @@ class SplitLoDTensorOp : public framework::OperatorBase {
auto &x_lod = x.lod(); auto &x_lod = x.lod();
auto &mask_dim = mask.dims(); auto &mask_dim = mask.dims();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(dev_place);
std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()}; std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
if (platform::is_cpu_place(mask.place())) { if (platform::is_cpu_place(mask.place())) {
cpu_mask->ShareDataWith(mask); cpu_mask->ShareDataWith(mask);
......
...@@ -82,11 +82,13 @@ TEST(StridedMemcpy, GPUCrop) { ...@@ -82,11 +82,13 @@ TEST(StridedMemcpy, GPUCrop) {
}; };
// clang-format on // clang-format on
platform::GPUPlace gpu0(0); platform::CUDAPlace gpu0(0);
platform::CPUPlace cpu; platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0);
int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src))); int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src)); memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
framework::DDim src_stride({5, 1}); framework::DDim src_stride({5, 1});
...@@ -96,7 +98,6 @@ TEST(StridedMemcpy, GPUCrop) { ...@@ -96,7 +98,6 @@ TEST(StridedMemcpy, GPUCrop) {
framework::DDim dst_dim({2, 2}); framework::DDim dst_dim({2, 2});
framework::DDim dst_stride({2, 1}); framework::DDim dst_stride({2, 1});
platform::CUDADeviceContext ctx(gpu0);
StridedMemcpy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, StridedMemcpy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride,
gpu_dst); gpu_dst);
...@@ -120,11 +121,12 @@ TEST(StridedMemcpy, GPUConcat) { ...@@ -120,11 +121,12 @@ TEST(StridedMemcpy, GPUConcat) {
}; };
// clang-format on // clang-format on
platform::GPUPlace gpu0(0); platform::CUDAPlace gpu0(0);
platform::CPUPlace cpu; platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0);
int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src))); int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src)); memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
int dst[8]; int dst[8];
int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst))); int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
...@@ -132,7 +134,6 @@ TEST(StridedMemcpy, GPUConcat) { ...@@ -132,7 +134,6 @@ TEST(StridedMemcpy, GPUConcat) {
framework::DDim src_stride({2, 1}); framework::DDim src_stride({2, 1});
framework::DDim dst_dim({2, 2}); framework::DDim dst_dim({2, 2});
framework::DDim dst_stride({4, 1}); framework::DDim dst_stride({4, 1});
platform::CUDADeviceContext ctx(gpu0);
StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst); StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride,
......
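Besides the GPUPlace-to-CUDAPlace rename, the test above moves the CUDADeviceContext construction before the first copy because the GPU overload of memory::Copy now takes the context's CUDA stream. A reduced fragment of the updated call sequence (CUDA build of Paddle assumed):

// Fragment of the updated GPU copy pattern shown above, not standalone.
platform::CUDAPlace gpu0(0);
platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0);  // construct the context before any copy
int src[4] = {0, 1, 2, 3};
int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());  // stream argument is new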
...@@ -53,7 +53,7 @@ class SumOp : public framework::OperatorWithKernel { ...@@ -53,7 +53,7 @@ class SumOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
auto x_vars = ctx.MultiInputVar("X"); auto x_vars = ctx.MultiInputVar("X");
if (x_vars[0]->IsType<framework::LoDTensor>()) { if (x_vars[0]->IsType<framework::LoDTensor>()) {
...@@ -106,8 +106,8 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -106,8 +106,8 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
Sum operator. Sum operator.
This operator sums the input tensors. All the inputs can carry the This operator sums the input tensors. All the inputs can carry the
LoD (Level of Details) information. However, the output only shares LoD (Level of Details) information. However, the output only shares
the LoD information with the first input. the LoD information with the first input.
)DOC"); )DOC");
} }
...@@ -170,7 +170,7 @@ class SumGradMaker : public framework::GradOpDescMakerBase { ...@@ -170,7 +170,7 @@ class SumGradMaker : public framework::GradOpDescMakerBase {
using framework::GradOpDescMakerBase::GradOpDescMakerBase; using framework::GradOpDescMakerBase::GradOpDescMakerBase;
std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override { std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
auto x_grads = InputGrad("X"); auto x_grads = InputGrad("X", false);
std::vector<std::unique_ptr<framework::OpDesc>> grad_ops; std::vector<std::unique_ptr<framework::OpDesc>> grad_ops;
grad_ops.reserve(x_grads.size()); grad_ops.reserve(x_grads.size());
auto og = OutputGrad("Out"); auto og = OutputGrad("Out");
......
...@@ -25,11 +25,11 @@ class WriteToArrayOp : public ArrayOp { ...@@ -25,11 +25,11 @@ class WriteToArrayOp : public ArrayOp {
: ArrayOp(type, inputs, outputs, attrs) {} : ArrayOp(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &place) const override {
auto *x = scope.FindVar(Input("X")); auto *x = scope.FindVar(Input("X"));
if (x == nullptr) return; if (x == nullptr) return;
auto &x_tensor = x->Get<framework::LoDTensor>(); auto &x_tensor = x->Get<framework::LoDTensor>();
size_t offset = GetOffset(scope, dev_ctx); size_t offset = GetOffset(scope, place);
auto *out = auto *out =
scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>(); scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
if (offset >= out->size()) { if (offset >= out->size()) {
...@@ -39,7 +39,11 @@ class WriteToArrayOp : public ArrayOp { ...@@ -39,7 +39,11 @@ class WriteToArrayOp : public ArrayOp {
} }
if (x_tensor.memory_size() > 0) { if (x_tensor.memory_size() > 0) {
auto *out_tensor = &out->at(offset); auto *out_tensor = &out->at(offset);
CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx, out_tensor);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
CopyFrom(x_tensor, place, dev_ctx, out_tensor);
out_tensor->set_lod(x_tensor.lod()); out_tensor->set_lod(x_tensor.lod());
} else { } else {
VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
...@@ -119,17 +123,18 @@ class ReadFromArrayOp : public ArrayOp { ...@@ -119,17 +123,18 @@ class ReadFromArrayOp : public ArrayOp {
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: ArrayOp(type, inputs, outputs, attrs) {} : ArrayOp(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &place) const override {
auto *x = scope.FindVar(Input("X")); auto *x = scope.FindVar(Input("X"));
PADDLE_ENFORCE(x != nullptr, "X must be set"); PADDLE_ENFORCE(x != nullptr, "X must be set");
auto &x_array = x->Get<framework::LoDTensorArray>(); auto &x_array = x->Get<framework::LoDTensorArray>();
auto *out = scope.FindVar(Output("Out")); auto *out = scope.FindVar(Output("Out"));
PADDLE_ENFORCE(out != nullptr, "Out must be set"); PADDLE_ENFORCE(out != nullptr, "Out must be set");
auto *out_tensor = out->GetMutable<framework::LoDTensor>(); auto *out_tensor = out->GetMutable<framework::LoDTensor>();
size_t offset = GetOffset(scope, dev_ctx); size_t offset = GetOffset(scope, place);
if (offset < x_array.size()) { if (offset < x_array.size()) {
framework::CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx, platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
out_tensor); auto &dev_ctx = *pool.Borrow(place);
framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor);
out_tensor->set_lod(x_array[offset].lod()); out_tensor->set_lod(x_array[offset].lod());
} else { } else {
VLOG(10) << "offset " << offset << " >= " << x_array.size(); VLOG(10) << "offset " << offset << " >= " << x_array.size();
......
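WriteToArrayOp above only borrows a device context when the source tensor actually holds memory; ReadFromArrayOp does the same before copying the selected slot out. A reduced fragment of the write-side guard, collected from the diff (framework types assumed, not standalone):

// Fragment of WriteToArrayOp::Run shown above.
if (x_tensor.memory_size() > 0) {
  auto *out_tensor = &out->at(offset);
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
  auto &dev_ctx = *pool.Borrow(place);
  CopyFrom(x_tensor, place, dev_ctx, out_tensor);
  out_tensor->set_lod(x_tensor.lod());
}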
...@@ -283,7 +283,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> { ...@@ -283,7 +283,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace."); "It must use CUDAPlace.");
auto* input = ctx.Input<Tensor>("X"); auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out"); auto* output = ctx.Output<Tensor>("Out");
auto* indices = ctx.Output<Tensor>("Indices"); auto* indices = ctx.Output<Tensor>("Indices");
......
...@@ -70,18 +70,19 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -70,18 +70,19 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
Transpose Operator. Transpose Operator.
The input tensor will be permuted according to the axis values given. The input tensor will be permuted according to the axis values given.
The op functions similar to how numpy.transpose works in python. The op is similar to how numpy.transpose works in Python.
For example:
>> input = numpy.arange(6).reshape((2,3)) For example: input = numpy.arange(6).reshape((2,3))
>> input the input is:
array([[0, 1, 2], array([[0, 1, 2],
[3, 4, 5]]) [3, 4, 5]])
>> axis = [1, 0] given axis is: [1, 0]
>> output = input.transpose(axis)
>> output output = input.transpose(axis)
array([[0, 3], then the output is:
[1, 4], array([[0, 3],
[2, 5]]) [1, 4],
[2, 5]])
So, given an input tensor of shape (N, C, H, W) and axis {0, 2, 3, 1}, So, given an input tensor of shape (N, C, H, W) and axis {0, 2, 3, 1},
the output tensor shape will be (N, H, W, C) the output tensor shape will be (N, H, W, C)
......
...@@ -63,7 +63,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { ...@@ -63,7 +63,7 @@ class UniformRandomOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")), static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
......
...@@ -53,16 +53,14 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -53,16 +53,14 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
"(string), unpooling type, can be \"max\" for max-unpooling ") "(string), unpooling type, can be \"max\" for max-unpooling ")
.InEnum({"max"}); .InEnum({"max"});
AddComment(R"DOC( AddComment(R"DOC(
"Input shape: $(N, C_{in}, H_{in}, W_{in})$ Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is:
Output shape: $(N, C_{out}, H_{out}, W_{out})$ $(N, C_{out}, H_{out}, W_{out})$, where
Where $$
$$ H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\
H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\ W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1]
W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1] $$
$$ Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf
Paper: http://www.matthewzeiler.com/wp-content/uploads/2017 )DOC");
/07/iccv2011.pdf
)DOC");
} }
}; };
...@@ -73,7 +71,7 @@ int OutputSize(int input_size, int ksize, int padding, int stride) { ...@@ -73,7 +71,7 @@ int OutputSize(int input_size, int ksize, int padding, int stride) {
class UnpoolOp : public framework::OperatorWithKernel { class UnpoolOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
...@@ -112,7 +110,7 @@ class UnpoolOp : public framework::OperatorWithKernel { ...@@ -112,7 +110,7 @@ class UnpoolOp : public framework::OperatorWithKernel {
class UnpoolOpGrad : public framework::OperatorWithKernel { class UnpoolOpGrad : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetKernelType( framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
......
...@@ -40,13 +40,14 @@ class WhileOp : public framework::OperatorBase { ...@@ -40,13 +40,14 @@ class WhileOp : public framework::OperatorBase {
: framework::OperatorBase(type, inputs, outputs, attrs) {} : framework::OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &dev_place) const override {
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition))); PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>(); auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1})); PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
framework::Executor executor(dev_ctx); framework::Executor executor(dev_place);
auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program(); auto *program = block->Program();
auto step_scopes = auto step_scopes =
...@@ -97,8 +98,8 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -97,8 +98,8 @@ class WhileGradOp : public framework::OperatorBase {
: framework::OperatorBase(type, inputs, outputs, attrs) {} : framework::OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::Place &dev_place) const override {
framework::Executor executor(dev_ctx); framework::Executor executor(dev_place);
auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program(); auto *program = block->Program();
...@@ -189,7 +190,7 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -189,7 +190,7 @@ class WhileGradOp : public framework::OperatorBase {
auto zero_op = framework::OpRegistry::CreateOp( auto zero_op = framework::OpRegistry::CreateOp(
"fill_constant", framework::VariableNameMap{}, "fill_constant", framework::VariableNameMap{},
{{"Out", {pg_names[param_id]}}}, attrs); {{"Out", {pg_names[param_id]}}}, attrs);
zero_op->Run(scope, dev_ctx); zero_op->Run(scope, dev_place);
} }
} }
...@@ -197,7 +198,7 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -197,7 +198,7 @@ class WhileGradOp : public framework::OperatorBase {
auto sum_op = framework::OpRegistry::CreateOp( auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {pg_names[param_id], new_inside_name}}}, "sum", {{"X", {pg_names[param_id], new_inside_name}}},
{{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
sum_op->Run(cur_scope, dev_ctx); sum_op->Run(cur_scope, dev_place);
cur_scope.Rename(new_inside_name, inside_grad_name); cur_scope.Rename(new_inside_name, inside_grad_name);
} }
} }
......
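For readers following the refactor through this diff, the change in this hunk is the same one applied to the other operators: Run and the Executor now take a platform::Place and resolve the device context themselves. A minimal, non-runnable sketch of the new call pattern, using only names that appear in the diff:

// Illustrative fragment only, not a complete program.
framework::Scope scope;
platform::Place place = platform::CPUPlace();  // or platform::CUDAPlace(0)
framework::Executor executor(place);           // previously: Executor(dev_ctx)
// For any OperatorBase* op created through OpRegistry::CreateOp(...):
// op->Run(scope, place);                      // previously: op->Run(scope, dev_ctx)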
...@@ -25,7 +25,7 @@ ENDIF() ...@@ -25,7 +25,7 @@ ENDIF()
# avoiding cycle dependencies # avoiding cycle dependencies
cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS}) system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS})
nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
......
...@@ -22,23 +22,7 @@ namespace paddle { ...@@ -22,23 +22,7 @@ namespace paddle {
namespace platform { namespace platform {
void CudaProfilerInit(std::string output_file, std::string output_mode, void CudaProfilerInit(std::string output_file, std::string output_mode,
std::vector<std::string> config_flags) { std::string config_file) {
std::array<char, 128> buf;
std::string tmpl = "/tmp/cuda_profile_config.XXXXXX";
PADDLE_ENFORCE_LT(tmpl.size(), buf.size());
memcpy(buf.data(), tmpl.data(), tmpl.size());
auto result = mktemp(buf.data());
PADDLE_ENFORCE(strlen(result) != 0);
std::string config_file = result;
{
std::ofstream ofs(config_file, std::ios::out | std::ios::trunc);
PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate());
for (const auto& line : config_flags) {
ofs << line << std::endl;
}
}
PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv"); PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair; cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
PADDLE_ENFORCE( PADDLE_ENFORCE(
......
...@@ -15,6 +15,59 @@ limitations under the License. */ ...@@ -15,6 +15,59 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
DeviceContextPool* DeviceContextPool::pool = nullptr;
const platform::DeviceContext* DeviceContextPool::Borrow(
const platform::Place& place) {
auto it = device_contexts_.find(place);
if (it == device_contexts_.end()) {
PADDLE_THROW(
"'Place' is not supported, Please re-compile with WITH_GPU "
"option");
}
return it->second;
}
std::vector<const platform::DeviceContext*> DeviceContextPool::Borrow(
const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0);
PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
std::vector<const platform::DeviceContext*> borrowed_contexts;
for (auto& place : places) {
auto it = device_contexts_.find(place);
if (it != device_contexts_.end()) {
borrowed_contexts.emplace_back(it->second);
} else {
PADDLE_THROW(
"'Place' is not supported, Please re-compile with WITH_GPU "
"option");
}
}
return borrowed_contexts;
}
DeviceContextPool::DeviceContextPool(
const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0);
for (size_t i = 0; i < places.size(); i++) {
if (platform::is_cpu_place(places[i])) {
device_contexts_.emplace(places[i],
new platform::CPUDeviceContext(
boost::get<platform::CPUPlace>(places[i])));
} else if (platform::is_gpu_place(places[i])) {
#ifdef PADDLE_WITH_CUDA
device_contexts_.emplace(places[i],
new platform::CUDADeviceContext(
boost::get<platform::CUDAPlace>(places[i])));
#else
PADDLE_THROW(
"'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
"option");
#endif
}
}
}
CPUDeviceContext::CPUDeviceContext() { CPUDeviceContext::CPUDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice()); eigen_device_.reset(new Eigen::DefaultDevice());
} }
...@@ -38,7 +91,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { ...@@ -38,7 +91,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
} }
~EigenCudaStreamDevice() override {} ~EigenCudaStreamDevice() override {}
void Reinitialize(const cudaStream_t* cuda_stream, GPUPlace place) { void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) {
stream_ = cuda_stream; stream_ = cuda_stream;
place_ = place; place_ = place;
device_prop_ = &Eigen::m_deviceProperties[place.device]; device_prop_ = &Eigen::m_deviceProperties[place.device];
...@@ -77,14 +130,14 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { ...@@ -77,14 +130,14 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
} }
private: private:
GPUPlace place_; CUDAPlace place_;
const cudaStream_t* stream_; // not owned; const cudaStream_t* stream_; // not owned;
const cudaDeviceProp* device_prop_; // not owned; const cudaDeviceProp* device_prop_; // not owned;
mutable void* scratch_; mutable void* scratch_;
mutable unsigned int* semaphore_; mutable unsigned int* semaphore_;
}; };
CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) { CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
SetDeviceId(place_.device); SetDeviceId(place_.device);
PADDLE_ENFORCE(cudaStreamCreate(&stream_)); PADDLE_ENFORCE(cudaStreamCreate(&stream_));
eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_.reset(new EigenCudaStreamDevice());
...@@ -125,20 +178,18 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; } ...@@ -125,20 +178,18 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
cudaStream_t CUDADeviceContext::stream() const { return stream_; } cudaStream_t CUDADeviceContext::stream() const { return stream_; }
CUDNNDeviceContext::CUDNNDeviceContext(CUDNNPlace place) CUDNNDeviceContext::CUDNNDeviceContext(CUDAPlace place)
: CUDADeviceContext(place), place_(place) { : CUDADeviceContext(place) {
PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream())); PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream()));
} }
CUDNNDeviceContext::~CUDNNDeviceContext() { CUDNNDeviceContext::~CUDNNDeviceContext() {
SetDeviceId(place_.device); SetDeviceId(boost::get<CUDAPlace>(GetPlace()).device);
Wait(); Wait();
PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
} }
Place CUDNNDeviceContext::GetPlace() const { return CUDNNPlace(); }
cudnnHandle_t CUDNNDeviceContext::cudnn_handle() const { return cudnn_handle_; } cudnnHandle_t CUDNNDeviceContext::cudnn_handle() const { return cudnn_handle_; }
#endif #endif
......
...@@ -11,8 +11,8 @@ limitations under the License. */ ...@@ -11,8 +11,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/platform/enforce.h" #include <memory>
#include "paddle/platform/place.h" #include <unordered_map>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/platform/dynload/cublas.h" #include "paddle/platform/dynload/cublas.h"
...@@ -20,10 +20,13 @@ limitations under the License. */ ...@@ -20,10 +20,13 @@ limitations under the License. */
#include "paddle/platform/gpu_info.h" #include "paddle/platform/gpu_info.h"
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#endif #endif
#include <memory>
#include "paddle/platform/enforce.h"
#include "paddle/platform/place.h" #include "paddle/platform/place.h"
#include "unsupported/Eigen/CXX11/Tensor" #include "unsupported/Eigen/CXX11/Tensor"
#include "glog/logging.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -55,7 +58,7 @@ class EigenCudaStreamDevice; ...@@ -55,7 +58,7 @@ class EigenCudaStreamDevice;
class CUDADeviceContext : public DeviceContext { class CUDADeviceContext : public DeviceContext {
public: public:
explicit CUDADeviceContext(GPUPlace place); explicit CUDADeviceContext(CUDAPlace place);
virtual ~CUDADeviceContext(); virtual ~CUDADeviceContext();
/*! \brief Wait for all operations completion in the stream. */ /*! \brief Wait for all operations completion in the stream. */
...@@ -77,7 +80,7 @@ class CUDADeviceContext : public DeviceContext { ...@@ -77,7 +80,7 @@ class CUDADeviceContext : public DeviceContext {
cudaStream_t stream() const; cudaStream_t stream() const;
private: private:
GPUPlace place_; CUDAPlace place_;
std::unique_ptr<Eigen::GpuDevice> eigen_device_; std::unique_ptr<Eigen::GpuDevice> eigen_device_;
std::unique_ptr<EigenCudaStreamDevice> eigen_stream_; std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
...@@ -89,21 +92,63 @@ class CUDADeviceContext : public DeviceContext { ...@@ -89,21 +92,63 @@ class CUDADeviceContext : public DeviceContext {
class CUDNNDeviceContext : public CUDADeviceContext { class CUDNNDeviceContext : public CUDADeviceContext {
public: public:
explicit CUDNNDeviceContext(CUDNNPlace place); explicit CUDNNDeviceContext(CUDAPlace place);
virtual ~CUDNNDeviceContext(); virtual ~CUDNNDeviceContext();
/*! \brief Return place in the device context. */
Place GetPlace() const final;
/*! \brief Return cudnn handle in the device context. */ /*! \brief Return cudnn handle in the device context. */
cudnnHandle_t cudnn_handle() const; cudnnHandle_t cudnn_handle() const;
private: private:
cudnnHandle_t cudnn_handle_; cudnnHandle_t cudnn_handle_;
CUDNNPlace place_;
}; };
#endif #endif
/*! \brief device context pool singleton */
class DeviceContextPool {
public:
explicit DeviceContextPool(const std::vector<platform::Place>& places);
static DeviceContextPool& Get() {
PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
return *pool;
}
/*! \brief Create should only be called by the Init function */
static DeviceContextPool& Create(const std::vector<platform::Place>& places) {
if (pool == nullptr) {
pool = new DeviceContextPool(places);
}
return *pool;
}
/*! \brief Return handle of single device context. */
const platform::DeviceContext* Borrow(const platform::Place& place);
/*! \brief Return handle of multi-device context. */
std::vector<const platform::DeviceContext*> Borrow(
const std::vector<platform::Place>& places);
~DeviceContextPool() {}
private:
static DeviceContextPool* pool;
constexpr static int LEFT_SHIFT = 8;
struct Hash {
std::hash<int> hash_;
size_t operator()(const platform::Place& place) const {
int pre_hash = place.which() + (1 << LEFT_SHIFT);
if (platform::is_gpu_place(place)) {
pre_hash += boost::get<platform::CUDAPlace>(place).GetDeviceId();
}
return hash_(pre_hash);
}
};
std::unordered_map<const platform::Place, const platform::DeviceContext*,
Hash>
device_contexts_;
DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
};
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
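A minimal usage sketch of the pool declared above, assuming the same flow as the DeviceContextPool test added later in this change: Create() is called once with all places at startup, after which Get()/Borrow() hand out the shared, non-owned contexts.

// Illustrative only; mirrors the DeviceContextPool test further down.
std::vector<paddle::platform::Place> places;
places.emplace_back(paddle::platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
places.emplace_back(paddle::platform::CUDAPlace(0));
#endif
paddle::platform::DeviceContextPool::Create(places);  // build the singleton once

auto& pool = paddle::platform::DeviceContextPool::Get();
const paddle::platform::DeviceContext* cpu_ctx =
    pool.Borrow(paddle::platform::CPUPlace());
// Borrowing the same Place again returns the same (non-owned) context pointer.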
...@@ -12,17 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,17 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/platform/device_context.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/platform/device_context.h"
#include "glog/logging.h"
TEST(Device, Init) { TEST(Device, Init) {
using paddle::platform::DeviceContext; using paddle::platform::DeviceContext;
using paddle::platform::CUDADeviceContext; using paddle::platform::CUDADeviceContext;
using paddle::platform::GPUPlace; using paddle::platform::CUDAPlace;
int count = paddle::platform::GetCUDADeviceCount(); int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
Eigen::GpuDevice* gpu_device = device_context->eigen_device(); Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device); ASSERT_NE(nullptr, gpu_device);
delete device_context; delete device_context;
...@@ -31,11 +33,11 @@ TEST(Device, Init) { ...@@ -31,11 +33,11 @@ TEST(Device, Init) {
TEST(Device, CUDADeviceContext) { TEST(Device, CUDADeviceContext) {
using paddle::platform::CUDADeviceContext; using paddle::platform::CUDADeviceContext;
using paddle::platform::GPUPlace; using paddle::platform::CUDAPlace;
int count = paddle::platform::GetCUDADeviceCount(); int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
Eigen::GpuDevice* gpu_device = device_context->eigen_device(); Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device); ASSERT_NE(nullptr, gpu_device);
cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
...@@ -49,12 +51,11 @@ TEST(Device, CUDADeviceContext) { ...@@ -49,12 +51,11 @@ TEST(Device, CUDADeviceContext) {
TEST(Device, CUDNNDeviceContext) { TEST(Device, CUDNNDeviceContext) {
using paddle::platform::CUDNNDeviceContext; using paddle::platform::CUDNNDeviceContext;
using paddle::platform::CUDNNPlace; using paddle::platform::CUDAPlace;
if (paddle::platform::dynload::HasCUDNN()) { if (paddle::platform::dynload::HasCUDNN()) {
int count = paddle::platform::GetCUDADeviceCount(); int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
CUDNNDeviceContext* device_context = CUDNNDeviceContext* device_context = new CUDNNDeviceContext(CUDAPlace(i));
new CUDNNDeviceContext(CUDNNPlace(i));
cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
ASSERT_NE(nullptr, cudnn_handle); ASSERT_NE(nullptr, cudnn_handle);
ASSERT_NE(nullptr, device_context->stream()); ASSERT_NE(nullptr, device_context->stream());
...@@ -62,3 +63,54 @@ TEST(Device, CUDNNDeviceContext) { ...@@ -62,3 +63,54 @@ TEST(Device, CUDNNDeviceContext) {
} }
} }
} }
TEST(Device, DeviceContextPool) {
using paddle::platform::DeviceContextPool;
using paddle::platform::CUDADeviceContext;
using paddle::platform::Place;
using paddle::platform::CPUPlace;
using paddle::platform::CUDAPlace;
DeviceContextPool& pool = DeviceContextPool::Get();
auto cpu_dev_ctx1 = pool.Borrow(CPUPlace());
auto cpu_dev_ctx2 = pool.Borrow(CPUPlace());
EXPECT_TRUE(cpu_dev_ctx2 == cpu_dev_ctx1);
std::vector<Place> gpu_places;
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
gpu_places.emplace_back(CUDAPlace(i));
}
auto dev_ctxs = pool.Borrow(gpu_places);
for (size_t i = 0; i < dev_ctxs.size(); ++i) {
auto* dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctxs[i]);
// check same as CUDAPlace(i)
CUDAPlace place = boost::get<CUDAPlace>(dev_ctx->GetPlace());
EXPECT_EQ(place.GetDeviceId(), static_cast<int>(i));
}
}
int main(int argc, char** argv) {
int dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu DeviceContextPool, because the CUDA "
"device count is "
<< dev_count;
return 0;
}
std::vector<paddle::platform::Place> places;
places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::CUDAPlace(i));
}
VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Create(places);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
...@@ -63,6 +63,8 @@ extern void LoadNCCLDSO(); ...@@ -63,6 +63,8 @@ extern void LoadNCCLDSO();
__macro(ncclAllReduce); \ __macro(ncclAllReduce); \
__macro(ncclBcast); \ __macro(ncclBcast); \
__macro(ncclAllGather); \ __macro(ncclAllGather); \
__macro(ncclGroupStart); \
__macro(ncclGroupEnd); \
__macro(ncclReduce); \ __macro(ncclReduce); \
__macro(ncclGetErrorString); __macro(ncclGetErrorString);
......
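ncclGroupStart and ncclGroupEnd are standard NCCL 2 calls for batching one collective per device from a single thread, which is presumably why they are added to the dynload list here. The sketch below shows the usual pattern they enable; send_buf, recv_buf, streams, comms, count and dev_count are placeholders for per-device buffers, CUDA streams and NCCL communicators set up elsewhere, not names from this diff.

// Hedged sketch of single-thread, multi-GPU all-reduce using grouped calls.
PADDLE_ENFORCE(paddle::platform::dynload::ncclGroupStart());
for (int i = 0; i < dev_count; ++i) {
  PADDLE_ENFORCE(paddle::platform::dynload::ncclAllReduce(
      send_buf[i], recv_buf[i], count, ncclFloat, ncclSum, comms[i],
      streams[i]));
}
PADDLE_ENFORCE(paddle::platform::dynload::ncclGroupEnd());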
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
#include "paddle/platform/macros.h"
#include "paddle/string/printf.h" #include "paddle/string/printf.h"
#include "paddle/string/to_string.h" #include "paddle/string/to_string.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/platform/device_context.h"
namespace paddle {
namespace platform {
template <typename DeviceContext>
struct ForRange {
ForRange(const DeviceContext& dev_ctx, size_t limit);
template <typename Function>
void operator()(Function func) const;
};
template <>
struct ForRange<CPUDeviceContext> {
ForRange(const CPUDeviceContext& dev_ctx, size_t limit) : limit_(limit) {}
template <typename Function>
void operator()(Function func) const {
for (size_t i = 0; i < limit_; ++i) {
func(i);
}
}
size_t limit_;
};
#ifdef __NVCC__
template <typename Function>
__global__ static void ForRangeElemwiseOpGridIsOne(Function func) {
size_t idx = static_cast<size_t>(threadIdx.x);
func(idx);
}
template <typename Function>
__global__ static void ForRangeElemwiseOp(Function func, int limit) {
size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
if (idx < limit) {
func(idx);
}
}
template <>
struct ForRange<CUDADeviceContext> {
ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
: dev_ctx_(dev_ctx), limit_(static_cast<int>(limit)) {}
template <typename Function>
inline void operator()(Function func) const {
constexpr size_t num_threads = 1024;
int block_size = limit_ <= num_threads ? limit_ : num_threads;
int grid_size = (limit_ + num_threads - 1) / num_threads;
if (grid_size == 1) {
ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
func);
} else {
ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
func, limit_);
}
}
const CUDADeviceContext& dev_ctx_;
int limit_;
};
#endif
} // namespace platform
} // namespace paddle
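A minimal sketch of how the ForRange helper above is typically used: define a functor over a flat index and apply it to every element in [0, limit). ScaleFunctor is invented for this example; with the CUDA specialization the same functor would additionally need to be callable on the device (e.g. via a host/device qualifier).

// Illustrative CPU-only usage of ForRange.
struct ScaleFunctor {
  ScaleFunctor(const float* x, float* y, float a) : x_(x), y_(y), a_(a) {}
  void operator()(size_t i) const { y_[i] = a_ * x_[i]; }
  const float* x_;
  float* y_;
  float a_;
};

paddle::platform::CPUDeviceContext dev_ctx;
float x[4] = {1.f, 2.f, 3.f, 4.f};
float y[4];
paddle::platform::ForRange<paddle::platform::CPUDeviceContext> for_range(dev_ctx, 4);
for_range(ScaleFunctor(x, y, 10.f));  // y becomes {10, 20, 30, 40}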
...@@ -97,17 +97,6 @@ void GpuMemcpyAsync(void *dst, const void *src, size_t count, ...@@ -97,17 +97,6 @@ void GpuMemcpyAsync(void *dst, const void *src, size_t count,
"cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync"); "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
} }
void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind) {
PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
"cudaMemcpy failed in paddle::platform::GpuMemcpySync");
// note: cudaMemcpy may actually be asynchronous with respect to the caller,
// block on stream 0 to make sure the copy has completed
PADDLE_ENFORCE(
cudaStreamSynchronize(0),
"cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync");
}
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
size_t count, cudaStream_t stream) { size_t count, cudaStream_t stream) {
PADDLE_ENFORCE( PADDLE_ENFORCE(
......
...@@ -52,10 +52,6 @@ size_t GpuMaxChunkSize(); ...@@ -52,10 +52,6 @@ size_t GpuMaxChunkSize();
void GpuMemcpyAsync(void *dst, const void *src, size_t count, void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream); enum cudaMemcpyKind kind, cudaStream_t stream);
//! Copy memory from address src to dst synchronously.
void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind);
//! Copy memory from one device to another device. //! Copy memory from one device to another device.
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
size_t count, cudaStream_t stream); size_t count, cudaStream_t stream);
......
...@@ -12,17 +12,19 @@ ...@@ -12,17 +12,19 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <thrust/device_vector.h>
#include <memory>
#include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/framework/init.h"
#include "paddle/platform/device_context.h" #include "paddle/platform/device_context.h"
#include "paddle/platform/dynload/nccl.h" #include "paddle/platform/dynload/nccl.h"
#include "paddle/platform/enforce.h" #include "paddle/platform/enforce.h"
#include "paddle/platform/gpu_info.h" #include "paddle/platform/gpu_info.h"
#include <thrust/device_vector.h>
#include <memory>
#include <vector>
static int dev_count = 0; static int dev_count = 0;
namespace paddle { namespace paddle {
...@@ -31,7 +33,8 @@ namespace platform { ...@@ -31,7 +33,8 @@ namespace platform {
TEST(NCCL, init) { TEST(NCCL, init) {
std::vector<ncclComm_t> comms; std::vector<ncclComm_t> comms;
comms.resize(dev_count); comms.resize(dev_count);
dynload::ncclCommInitAll(comms.data(), dev_count, nullptr); PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
for (int i = 0; i < dev_count; ++i) { for (int i = 0; i < dev_count; ++i) {
dynload::ncclCommDestroy(comms[i]); dynload::ncclCommDestroy(comms[i]);
} }
...@@ -47,7 +50,7 @@ struct PerThreadData { ...@@ -47,7 +50,7 @@ struct PerThreadData {
T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); } T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); }
PerThreadData(int gpu_id, size_t size) : dev_ctx(GPUPlace(gpu_id)) { PerThreadData(int gpu_id, size_t size) : dev_ctx(CUDAPlace(gpu_id)) {
send_buff.resize(size); send_buff.resize(size);
for (size_t i = 0; i < size; ++i) { for (size_t i = 0; i < size; ++i) {
send_buff[i] = static_cast<T>(i); send_buff[i] = static_cast<T>(i);
...@@ -131,6 +134,18 @@ int main(int argc, char** argv) { ...@@ -131,6 +134,18 @@ int main(int argc, char** argv) {
<< dev_count; << dev_count;
return 0; return 0;
} }
std::vector<paddle::platform::Place> places;
places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::CUDAPlace(i));
}
VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Create(places);
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
} }
...@@ -23,8 +23,9 @@ class PlacePrinter : public boost::static_visitor<> { ...@@ -23,8 +23,9 @@ class PlacePrinter : public boost::static_visitor<> {
public: public:
explicit PlacePrinter(std::ostream &os) : os_(os) {} explicit PlacePrinter(std::ostream &os) : os_(os) {}
void operator()(const CPUPlace &) { os_ << "CPUPlace"; } void operator()(const CPUPlace &) { os_ << "CPUPlace"; }
void operator()(const MKLDNNPlace &) { os_ << "MKLDNNPlace"; } void operator()(const CUDAPlace &p) {
void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; } os_ << "CUDAPlace(" << p.device << ")";
}
private: private:
std::ostream &os_; std::ostream &os_;
...@@ -37,20 +38,14 @@ static Place the_default_place; ...@@ -37,20 +38,14 @@ static Place the_default_place;
void set_place(const Place &place) { the_default_place = place; } void set_place(const Place &place) { the_default_place = place; }
const Place &get_place() { return the_default_place; } const Place &get_place() { return the_default_place; }
const GPUPlace default_gpu() { return GPUPlace(0); } const CUDAPlace default_gpu() { return CUDAPlace(0); }
const CPUPlace default_cpu() { return CPUPlace(); } const CPUPlace default_cpu() { return CPUPlace(); }
const MKLDNNPlace default_mkldnn() { return MKLDNNPlace(); }
bool is_gpu_place(const Place &p) { bool is_gpu_place(const Place &p) {
return boost::apply_visitor(IsGPUPlace(), p); return boost::apply_visitor(IsCUDAPlace(), p);
}
bool is_cpu_place(const Place &p) {
return !is_gpu_place(p) && !is_mkldnn_place(p);
} }
bool is_mkldnn_place(const Place &p) { bool is_cpu_place(const Place &p) { return !is_gpu_place(p); }
return boost::apply_visitor(IsMKLDNNPlace(), p);
}
bool places_are_same_class(const Place &p1, const Place &p2) { bool places_are_same_class(const Place &p1, const Place &p2) {
return p1.which() == p2.which(); return p1.which() == p2.which();
......
...@@ -31,65 +31,35 @@ struct CPUPlace { ...@@ -31,65 +31,35 @@ struct CPUPlace {
inline bool operator!=(const CPUPlace &) const { return false; } inline bool operator!=(const CPUPlace &) const { return false; }
}; };
struct MKLDNNPlace { struct CUDAPlace {
MKLDNNPlace() {} CUDAPlace() : CUDAPlace(0) {}
explicit CUDAPlace(int d) : device(d) {}
// needed for variant equality comparison
inline bool operator==(const MKLDNNPlace &) const { return true; }
inline bool operator!=(const MKLDNNPlace &) const { return false; }
};
struct GPUPlace {
GPUPlace() : GPUPlace(0) {}
explicit GPUPlace(int d) : device(d) {}
inline int GetDeviceId() const { return device; } inline int GetDeviceId() const { return device; }
// needed for variant equality comparison // needed for variant equality comparison
inline bool operator==(const GPUPlace &o) const { return device == o.device; } inline bool operator==(const CUDAPlace &o) const {
inline bool operator!=(const GPUPlace &o) const { return !(*this == o); } return device == o.device;
}
inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); }
int device; int device;
}; };
struct CUDNNPlace : public GPUPlace { struct IsCUDAPlace : public boost::static_visitor<bool> {
CUDNNPlace() : GPUPlace() {}
explicit CUDNNPlace(int d) : GPUPlace(d) {}
};
struct IsGPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; } bool operator()(const CPUPlace &) const { return false; }
bool operator()(const MKLDNNPlace &) const { return false; } bool operator()(const CUDAPlace &gpu) const { return true; }
bool operator()(const GPUPlace &gpu) const { return true; }
}; };
struct IsMKLDNNPlace : public boost::static_visitor<bool> { typedef boost::variant<CUDAPlace, CPUPlace> Place;
bool operator()(const MKLDNNPlace &) const { return true; }
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const GPUPlace &) const { return false; }
};
// Define the max number of Place in bit length. i.e., the max number of places
// should be less equal than 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT)
#define NUM_PLACE_TYPE_LIMIT_IN_BIT 4
typedef boost::variant<CUDNNPlace, GPUPlace, CPUPlace, MKLDNNPlace> Place;
// static check number of place types is less equal than
// 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT)
BOOST_MPL_ASSERT((boost::mpl::less_equal<
Place::types::size,
boost::mpl::long_<1 << NUM_PLACE_TYPE_LIMIT_IN_BIT>>));
void set_place(const Place &); void set_place(const Place &);
const Place &get_place(); const Place &get_place();
const GPUPlace default_gpu(); const CUDAPlace default_gpu();
const CPUPlace default_cpu(); const CPUPlace default_cpu();
const MKLDNNPlace default_mkldnn();
bool is_gpu_place(const Place &); bool is_gpu_place(const Place &);
bool is_cpu_place(const Place &); bool is_cpu_place(const Place &);
bool is_mkldnn_place(const Place &);
bool places_are_same_class(const Place &, const Place &); bool places_are_same_class(const Place &, const Place &);
std::ostream &operator<<(std::ostream &, const Place &); std::ostream &operator<<(std::ostream &, const Place &);
......
...@@ -4,45 +4,34 @@ ...@@ -4,45 +4,34 @@
TEST(Place, Equality) { TEST(Place, Equality) {
paddle::platform::CPUPlace cpu; paddle::platform::CPUPlace cpu;
paddle::platform::GPUPlace g0(0), g1(1), gg0(0); paddle::platform::CUDAPlace g0(0), g1(1), gg0(0);
paddle::platform::CUDNNPlace d0(0), d1(1), dd0(0);
EXPECT_EQ(cpu, cpu); EXPECT_EQ(cpu, cpu);
EXPECT_EQ(g0, g0); EXPECT_EQ(g0, g0);
EXPECT_EQ(g1, g1); EXPECT_EQ(g1, g1);
EXPECT_EQ(g0, gg0); EXPECT_EQ(g0, gg0);
EXPECT_EQ(d0, dd0);
EXPECT_NE(g0, g1); EXPECT_NE(g0, g1);
EXPECT_NE(d0, d1);
EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0)); EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0));
EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu)); EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu));
EXPECT_TRUE(paddle::platform::is_gpu_place(d0));
EXPECT_FALSE(paddle::platform::places_are_same_class(g0, d0));
} }
TEST(Place, Default) { TEST(Place, Default) {
EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::get_place())); EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::get_place()));
EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu())); EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu()));
EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu())); EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu()));
EXPECT_TRUE(
paddle::platform::is_mkldnn_place(paddle::platform::default_mkldnn()));
EXPECT_FALSE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
paddle::platform::set_place(paddle::platform::CPUPlace()); paddle::platform::set_place(paddle::platform::CPUPlace());
EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place())); EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
paddle::platform::set_place(paddle::platform::MKLDNNPlace());
EXPECT_FALSE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
EXPECT_TRUE(paddle::platform::is_mkldnn_place(paddle::platform::get_place()));
} }
TEST(Place, Print) { TEST(Place, Print) {
{ {
std::stringstream ss; std::stringstream ss;
ss << paddle::platform::GPUPlace(1); ss << paddle::platform::CUDAPlace(1);
EXPECT_EQ("GPUPlace(1)", ss.str()); EXPECT_EQ("CUDAPlace(1)", ss.str());
} }
{ {
std::stringstream ss; std::stringstream ss;
......
...@@ -49,15 +49,15 @@ TEST(Transform, CPUUnary) { ...@@ -49,15 +49,15 @@ TEST(Transform, CPUUnary) {
TEST(Transform, GPUUnary) { TEST(Transform, GPUUnary) {
using namespace paddle::platform; using namespace paddle::platform;
using namespace paddle::memory; using namespace paddle::memory;
GPUPlace gpu0(0); CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0); CUDADeviceContext ctx(gpu0);
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4)); float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf)); Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
Transform<paddle::platform::CUDADeviceContext> trans; Transform<paddle::platform::CUDADeviceContext> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10)); trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
ctx.Wait(); ctx.Wait();
Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf)); Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
Free(gpu0, gpu_buf); Free(gpu0, gpu_buf);
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5); ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
...@@ -80,14 +80,14 @@ TEST(Transform, GPUBinary) { ...@@ -80,14 +80,14 @@ TEST(Transform, GPUBinary) {
using namespace paddle::platform; using namespace paddle::platform;
using namespace paddle::memory; using namespace paddle::memory;
int buf[4] = {1, 2, 3, 4}; int buf[4] = {1, 2, 3, 4};
GPUPlace gpu0(0); CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0); CUDADeviceContext ctx(gpu0);
int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf))); int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf)); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
Transform<paddle::platform::CUDADeviceContext> trans; Transform<paddle::platform::CUDADeviceContext> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>()); trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
ctx.Wait(); ctx.Wait();
Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf)); Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
Free(gpu0, gpu_buf); Free(gpu0, gpu_buf);
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
ASSERT_EQ((i + 1) * (i + 1), buf[i]); ASSERT_EQ((i + 1) * (i + 1), buf[i]);
......
...@@ -23,6 +23,11 @@ void BindConstValue(pybind11::module& m) { ...@@ -23,6 +23,11 @@ void BindConstValue(pybind11::module& m) {
m.def("kTempVarName", [] { return framework::kTempVarName; }); m.def("kTempVarName", [] { return framework::kTempVarName; });
m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; }); m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; }); m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
// for kernel_hint key
m.def("kUseCPU", [] { return framework::kUseCPU; });
m.def("kUseCUDNN", [] { return framework::kUseCUDNN; });
m.def("kUseMKLDNN", [] { return framework::kUseMKLDNN; });
} }
} // namespace pybind } // namespace pybind
......
...@@ -159,6 +159,7 @@ void BindBlockDesc(py::module &m) { ...@@ -159,6 +159,7 @@ void BindBlockDesc(py::module &m) {
py::return_value_policy::reference) py::return_value_policy::reference)
.def("prepend_op", &BlockDesc::PrependOp, .def("prepend_op", &BlockDesc::PrependOp,
py::return_value_policy::reference) py::return_value_policy::reference)
.def("remove_op", &BlockDesc::RemoveOp)
.def("var", .def("var",
[](BlockDesc &self, py::bytes byte_name) { [](BlockDesc &self, py::bytes byte_name) {
std::string name = byte_name; std::string name = byte_name;
...@@ -249,6 +250,12 @@ void BindOpDesc(py::module &m) { ...@@ -249,6 +250,12 @@ void BindOpDesc(py::module &m) {
.def("set_attr", &OpDesc::SetAttr) .def("set_attr", &OpDesc::SetAttr)
.def("attr", &OpDesc::GetAttr) .def("attr", &OpDesc::GetAttr)
.def("set_block_attr", &OpDesc::SetBlockAttr) .def("set_block_attr", &OpDesc::SetBlockAttr)
.def("set_serialized_attr",
[](OpDesc &self, const std::string &name,
const py::bytes &seriralized) {
std::string ser(seriralized);
self.SetAttr(name, ser);
})
.def("block_attr", &OpDesc::GetBlockAttr) .def("block_attr", &OpDesc::GetBlockAttr)
.def("check_attrs", &OpDesc::CheckAttrs) .def("check_attrs", &OpDesc::CheckAttrs)
.def("infer_shape", &OpDesc::InferShape) .def("infer_shape", &OpDesc::InferShape)
......
...@@ -79,7 +79,7 @@ PYBIND11_PLUGIN(core) { ...@@ -79,7 +79,7 @@ PYBIND11_PLUGIN(core) {
self.Resize(make_ddim(dim)); self.Resize(make_ddim(dim));
}) })
.def("alloc_float", .def("alloc_float",
[](Tensor &self, paddle::platform::GPUPlace &place) { [](Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<float>(place); self.mutable_data<float>(place);
}) })
.def("alloc_float", .def("alloc_float",
...@@ -91,7 +91,7 @@ PYBIND11_PLUGIN(core) { ...@@ -91,7 +91,7 @@ PYBIND11_PLUGIN(core) {
self.mutable_data<int>(place); self.mutable_data<int>(place);
}) })
.def("alloc_int", .def("alloc_int",
[](Tensor &self, paddle::platform::GPUPlace &place) { [](Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<int>(place); self.mutable_data<int>(place);
}) })
.def("set", PyCPUTensorSetFromArray<float>) .def("set", PyCPUTensorSetFromArray<float>)
...@@ -310,10 +310,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -310,10 +310,10 @@ All parameter, weight, gradient are variables in Paddle.
return new paddle::platform::CPUDeviceContext(); return new paddle::platform::CPUDeviceContext();
}) })
.def_static("create", .def_static("create",
[](paddle::platform::GPUPlace& place) [](paddle::platform::CUDAPlace& place)
-> paddle::platform::DeviceContext* { -> paddle::platform::DeviceContext* {
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
PADDLE_THROW("GPUPlace is not supported in CPU device."); PADDLE_THROW("CUDAPlace is not supported in CPU device.");
#else #else
return new paddle::platform::CUDADeviceContext(place); return new paddle::platform::CUDADeviceContext(place);
#endif #endif
...@@ -323,9 +323,9 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -323,9 +323,9 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
py::class_<platform::Communicator>(m, "Communicator").def(py::init<>()); py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
#endif #endif
py::class_<platform::GPUPlace>(m, "GPUPlace") py::class_<platform::CUDAPlace>(m, "CUDAPlace")
.def(py::init<int>()) .def(py::init<int>())
.def("__str__", string::to_string<const platform::GPUPlace &>); .def("__str__", string::to_string<const platform::CUDAPlace &>);
py::class_<paddle::platform::CPUPlace>(m, "CPUPlace") py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
.def(py::init<>()) .def(py::init<>())
...@@ -338,7 +338,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -338,7 +338,7 @@ All parameter, weight, gradient are variables in Paddle.
self = cpu_place; self = cpu_place;
}) })
.def("set_place", .def("set_place",
[](platform::Place &self, const platform::GPUPlace &gpu_place) { [](platform::Place &self, const platform::CUDAPlace &gpu_place) {
self = gpu_place; self = gpu_place;
}); });
...@@ -360,10 +360,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -360,10 +360,10 @@ All parameter, weight, gradient are variables in Paddle.
}) })
.def("run", .def("run",
[](OperatorBase &self, const Scope &scope, [](OperatorBase &self, const Scope &scope,
const platform::DeviceContext &dev_ctx) { const platform::CPUPlace &place) { self.Run(scope, place); })
self.Run(scope, dev_ctx); .def("run",
dev_ctx.Wait(); [](OperatorBase &self, const Scope &scope,
}) const platform::CUDAPlace &place) { self.Run(scope, place); })
.def("type", .def("type",
[](const OperatorBase &op) -> std::string { return op.Type(); }) [](const OperatorBase &op) -> std::string { return op.Type(); })
.def("outputs", .def("outputs",
...@@ -417,7 +417,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -417,7 +417,7 @@ All parameter, weight, gradient are variables in Paddle.
}); });
py::class_<framework::Executor>(m, "Executor") py::class_<framework::Executor>(m, "Executor")
.def(py::init<std::vector<platform::Place> &>()) .def(py::init<const platform::Place &>())
.def("run", &Executor::Run); .def("run", &Executor::Run);
m.def("unique_integer", UniqueIntegerGenerator); m.def("unique_integer", UniqueIntegerGenerator);
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <string> #include <string>
#include "paddle/framework/tensor.h" #include "paddle/framework/tensor.h"
#include "paddle/memory/memcpy.h" #include "paddle/memory/memcpy.h"
#include "paddle/platform/device_context.h"
#include "pybind11/numpy.h" #include "pybind11/numpy.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
...@@ -61,13 +62,16 @@ struct CastToPyBufferImpl<true, I, ARGS...> { ...@@ -61,13 +62,16 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>()); auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>( auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
tensor.dims(), platform::CPUPlace())); tensor.dims(), platform::CPUPlace()));
// TODO(qijun): Here we use default CUDA stream to set GPU Tensor to
// a Python numpy array. It's better to manage CDUA stream unifiedly. platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
paddle::platform::GpuMemcpySync(dst_ptr, src_ptr, auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
sizeof(CUR_TYPE) * tensor.numel(), pool.Borrow(tensor.place()));
cudaMemcpyDeviceToHost);
paddle::platform::GpuMemcpyAsync(
dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
cudaMemcpyDeviceToHost, dev_ctx->stream());
#else #else
PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif #endif
} else if (paddle::platform::is_cpu_place(tensor.place())) { } else if (paddle::platform::is_cpu_place(tensor.place())) {
dst_tensor = tensor; dst_tensor = tensor;
...@@ -123,7 +127,7 @@ template <typename T> ...@@ -123,7 +127,7 @@ template <typename T>
void PyCUDATensorSetFromArray( void PyCUDATensorSetFromArray(
framework::Tensor &self, framework::Tensor &self,
py::array_t<T, py::array::c_style | py::array::forcecast> array, py::array_t<T, py::array::c_style | py::array::forcecast> array,
paddle::platform::GPUPlace &place) { paddle::platform::CUDAPlace &place) {
std::vector<int64_t> dims; std::vector<int64_t> dims;
dims.reserve(array.ndim()); dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) { for (size_t i = 0; i < array.ndim(); ++i) {
...@@ -132,10 +136,12 @@ void PyCUDATensorSetFromArray( ...@@ -132,10 +136,12 @@ void PyCUDATensorSetFromArray(
self.Resize(framework::make_ddim(dims)); self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<T>(place); auto *dst = self.mutable_data<T>(place);
// TODO(qijun): Here we use default CUDA stream to set a Python numpy
// array to a GPU Tensor. It's better to manage CDUA stream unifiedly. platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(), auto dev_ctx =
cudaMemcpyHostToDevice); static_cast<const platform::CUDADeviceContext *>(pool.Borrow(place));
paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream());
} }
#endif #endif
......
...@@ -5,11 +5,3 @@ configure_file(submit_local.sh.in ...@@ -5,11 +5,3 @@ configure_file(submit_local.sh.in
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle DESTINATION bin install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle DESTINATION bin
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ) GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
configure_file(tools/usage_stat/usage.sh
paddle_usage
@ONLY)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle_usage DESTINATION opt/paddle/bin
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
...@@ -165,9 +165,6 @@ case "$1" in ...@@ -165,9 +165,6 @@ case "$1" in
"make_diagram") "make_diagram")
python -m paddle.utils.make_model_diagram ${@:2} python -m paddle.utils.make_model_diagram ${@:2}
;; ;;
"usage")
$PADDLE_BIN_PATH/paddle_usage ${@:2}
;;
"version") "version")
version version
;; ;;
......
#!/bin/bash
ARGPARSE=`getopt -o u:vin:l:e: --long git-user:,help,dry-run,task-name:,log-file:,exit-code: -- "$@"`
KEEP_ANONYMOUS="A_USER_DOES_NOT_TELL_US"
# paddle config home dir, same as paddle
PADDLE_CONF_HOME="$HOME/.config/paddle"
# api url, mirror url(s) will be append later
PD_URLS="http://api.paddlepaddle.org/version"
usage()
{
echo "Usage: `basename $0` [options]"
echo "Options:"
echo " -e, --exit-code=EXIT_CODE The train/predict process's exit code"
echo " -l, --log-file=LOG_FILE_PATH Read which log file to get the duration of process"
echo " -n, --task-name=TASK_NAME The name of demo or example"
echo " -u, --git-user=GITHUB_USER provide contact info, like username or email"
echo " -v, -i Verbose output and interact with user when necessary"
echo " --help display this help message"
}
eval set -- "${ARGPARSE}"
while true; do
case "$1" in
-l|--log-file)
log_file=$2
shift 2
;;
-e|--exit-code)
exit_code=$2
shift 2
;;
-u|--git-user)
github_user=$2
shift 2
;;
-n|--task-name)
task=$2
shift 2
;;
-v|-i)
v=1
shift
;;
--dry-run)
dry_run=1
shift
;;
--)
shift
break
;;
--help)
usage
exit 0
;;
*)
echo "Invalid option $1"
usage
exit 1
;;
esac
done
# parse the log_file to get the time costs
if [ -s "${log_file}" ]; then
duration=`awk 'BEGIN{day=0;last_sec=0;min_sec=0;max_sec=0;}
{if(index($2,":")==3){
t=substr($2,1,8);
sec=day*86400+substr(t,1,2)*3600+substr(t,4,2)*60+substr(t,7,2);
if(sec<last_sec-600){day+=1;sec+=86400;}
last_sec=sec;
if(min_sec==0 || min_sec>sec){min_sec=sec;}
if(max_sec==0 || max_sec<sec){max_sec=sec;}
}}
END{print max_sec-min_sec}' ${log_file}`
else
duration=-1
fi
if [ "${v}" = "1" ]; then echo "duration: ${duration}"; fi
# try find the user/email if not given
if [ -z "${github_user}" ]; then
# search for cached username
if [ -s "${PADDLE_CONF_HOME}/github_user" ]; then
if [ "${v}" = "1" ]; then echo "read github_user from cache..."; fi
github_user=`cat ${PADDLE_CONF_HOME}/github_user`
else
# search the github-user from git config
if [ "${v}" = "1" ]; then echo "read github_user from git..."; fi
git_username=`git config --get user.name 2>/dev/null`
git_url=`git config --get remote.origin.url 2>/dev/null`
if [ "`echo ${git_url} | cut -b 1-19`" = "https://github.com/" ]; then
# under a git url, like https://github.com/user_xxx/proj_yyy.git
if [ "${v}" = "1" ]; then echo " from github url..."; fi
github_user=`echo ${git_url} | cut -d "/" -f 4`
if [ "${github_user}" = "PaddlePaddle" ]; then
github_user=
fi
fi
if [ -n "${git_username}" -a -z "${github_user}" ]; then
if [ "${v}" = "1" ]; then echo " from global git username..."; fi
github_user=${git_username}
fi
fi
fi
# allow user to set the user name, if it's not found
if [ -z "${github_user}" -a "${v}" = "1" ]; then
read -p "Please input your github username or email, or just return to keep this feedback anonymous:"
github_user=${REPLY}
if [ -z "${github_user}" ]; then
# empty input, consider as one anonymous user
github_user="${KEEP_ANONYMOUS}"
fi
fi
if [ -n "${github_user}" -a -z "${dry_run}" ]; then
# valid user and not in dry-run mode, then save to cache
mkdir -p ${PADDLE_CONF_HOME}
echo "${github_user}" >${PADDLE_CONF_HOME}/github_user
fi
if [ "${v}" = "1" ]; then echo "username: ${github_user}"; fi
if [ "${github_user}" = "${KEEP_ANONYMOUS}" ]; then
# anonymous user should keep the var empty.
github_user=
fi
# read local paddle version
paddle_version=`paddle version | grep PaddlePaddle | head -n1 | cut -d " " -f 2 | cut -d "," -f 1`
if [ "${v}" = "1" ]; then echo "version:${paddle_version}"; fi
# read local system time
system_time=`date "+%Y%m%d%H%M%S"`
if [ "${v}" = "1" ]; then echo "system time:${system_time}"; fi
# make empty job_name as default value.
if [ -z "${task}" ]; then
task="(unknown_task)"
fi
if [ "${v}" = "1" ]; then echo "task: ${task}"; fi
# concat the curl command
params="content={\"data_type\":\"usage\",\
\"system_time\":${system_time},\"paddle_version\":\"${paddle_version}\",\
\"github_user\":\"${github_user}\",\"job_name\":\"${task}\",\
\"duration\":${duration},\"exit_code\":\"${exit_code}\"\
}&type=1"
curl_cmd_prefix="curl -m 5 -X POST -d ${params}\
-b ${PADDLE_CONF_HOME}/paddle.cookie -c ${PADDLE_CONF_HOME}/paddle.cookie "
if [ "${dry_run}" = "1" ]; then
first_url=`echo ${PD_URLS} | cut -d " " -f 1`
echo "(dry-run mode)curl command: ${curl_cmd_prefix} ${first_url}"
exit 0
else
for u in ${PD_URLS}; do
curl_cmd="${curl_cmd_prefix} ${u}"
if [ "${v}" = "1" ]; then echo "run: ${curl_cmd}"; fi
${curl_cmd} >/dev/null 2>&1
if [ $? -eq 0 ]; then
if [ "${v}" = "1" ]; then echo "upload OK!"; fi
exit 0
else
if [ "${v}" = "1" ]; then echo "upload failed...try next"; fi
fi
done
if [ "${v}" = "1" ]; then echo "all urls tried but all failed...exit"; fi
exit 1
fi
...@@ -6,7 +6,6 @@ if(WITH_TESTING) ...@@ -6,7 +6,6 @@ if(WITH_TESTING)
add_library(paddle_test_util STATIC TestUtil.cpp) add_library(paddle_test_util STATIC TestUtil.cpp)
add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
if(NOT MOBILE_INFERENCE) if(NOT MOBILE_INFERENCE)
add_library(paddle_gtest_main STATIC paddle_gtest_main.cc) cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init paddle_memory gtest gflags)
add_dependencies(paddle_gtest_main paddle_memory gtest gflags)
endif() endif()
endif() endif()
...@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cstring> #include <cstring>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/framework/init.h"
#include "paddle/memory/memory.h" #include "paddle/memory/memory.h"
int main(int argc, char** argv) { int main(int argc, char** argv) {
...@@ -32,8 +34,11 @@ int main(int argc, char** argv) { ...@@ -32,8 +34,11 @@ int main(int argc, char** argv) {
google::ParseCommandLineFlags(&new_argc, &new_argv_address, false); google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
paddle::memory::Used(paddle::platform::CPUPlace()); paddle::memory::Used(paddle::platform::CPUPlace());
std::vector<std::string> devs = {"CPU"};
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
paddle::memory::Used(paddle::platform::GPUPlace(0)); paddle::memory::Used(paddle::platform::CUDAPlace(0));
devs.push_back("GPU:0");
#endif #endif
paddle::framework::InitDevices(devs);
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
} }
...@@ -270,7 +270,7 @@ class LayerType(object): ...@@ -270,7 +270,7 @@ class LayerType(object):
@staticmethod @staticmethod
def is_layer_type(type_name): def is_layer_type(type_name):
""" """
If type_name is a layer type. Whether type_name is a layer type.
:param type_name: layer type name. Because layer type enumerations are :param type_name: layer type name. Because layer type enumerations are
strings. strings.
...@@ -441,7 +441,7 @@ def full_matrix_projection(input, size=0, param_attr=None): ...@@ -441,7 +441,7 @@ def full_matrix_projection(input, size=0, param_attr=None):
with mixed_layer(size=100) as m: with mixed_layer(size=100) as m:
m += full_matrix_projection(input=layer) m += full_matrix_projection(input=layer)
2. When used as an independant object like this, you must set the size: 2. When used as an independent object like this, you must set the size:
.. code-block:: python .. code-block:: python
...@@ -451,11 +451,11 @@ def full_matrix_projection(input, size=0, param_attr=None): ...@@ -451,11 +451,11 @@ def full_matrix_projection(input, size=0, param_attr=None):
:param input: The input of this layer. :param input: The input of this layer.
:type input: LayerOutput :type input: LayerOutput
:param size: The parameter size. Means the width of parameter. :param size: The dimension of this layer.
:type size: int :type size: int
:param param_attr: Parameter config, None if use default. :param param_attr: The parameter attribute. See ParameterAttribute for details.
:type param_attr: ParameterAttribute :type param_attr: ParameterAttribute
:return: A FullMatrixProjection Object. :return: FullMatrixProjection Object.
:rtype: FullMatrixProjection :rtype: FullMatrixProjection
""" """
proj = FullMatrixProjection( proj = FullMatrixProjection(
...@@ -468,12 +468,12 @@ def full_matrix_projection(input, size=0, param_attr=None): ...@@ -468,12 +468,12 @@ def full_matrix_projection(input, size=0, param_attr=None):
def trans_full_matrix_projection(input, size=0, param_attr=None): def trans_full_matrix_projection(input, size=0, param_attr=None):
""" """
Different from full_matrix_projection, this projection performs matrix Different from full_matrix_projection, this projection performs matrix
multiplication, using transpose of weight. multiplication, using the transpose of weight.
.. math:: .. math::
out.row[i] += in.row[i] * w^\mathrm{T} out.row[i] += in.row[i] * w^\mathrm{T}
:math:`w^\mathrm{T}` means transpose of weight. :math:`w^\mathrm{T}` means the transpose of weight.
The simple usage is: The simple usage is:
.. code-block:: python .. code-block:: python
...@@ -489,9 +489,9 @@ def trans_full_matrix_projection(input, size=0, param_attr=None): ...@@ -489,9 +489,9 @@ def trans_full_matrix_projection(input, size=0, param_attr=None):
:type input: LayerOutput :type input: LayerOutput
:param size: The parameter size. Means the width of parameter. :param size: The parameter size. Means the width of parameter.
:type size: int :type size: int
:param param_attr: Parameter config, None if use default. :param param_attr: The parameter attribute. See ParameterAttribute for details.
:type param_attr: ParameterAttribute :type param_attr: ParameterAttribute
:return: A TransposedFullMatrixProjection Object. :return: TransposedFullMatrixProjection Object.
:rtype: TransposedFullMatrixProjection :rtype: TransposedFullMatrixProjection
""" """
proj = TransposedFullMatrixProjection( proj = TransposedFullMatrixProjection(
...@@ -521,7 +521,7 @@ def table_projection(input, size=0, param_attr=None): ...@@ -521,7 +521,7 @@ def table_projection(input, size=0, param_attr=None):
with mixed_layer(size=100) as m: with mixed_layer(size=100) as m:
m += table_projection(input=layer) m += table_projection(input=layer)
2. When used as an independant object like this, you must set the size: 2. When used as an independent object like this, you must set the size:
.. code-block:: python .. code-block:: python
...@@ -532,11 +532,11 @@ def table_projection(input, size=0, param_attr=None): ...@@ -532,11 +532,11 @@ def table_projection(input, size=0, param_attr=None):
:param input: The input of this layer, which must contain id fields. :param input: The input of this layer, which must contain id fields.
:type input: LayerOutput :type input: LayerOutput
:param size: The parameter size. Means the width of parameter. :param size: The dimension of the output.
:type size: int :type size: int
:param param_attr: Parameter config, None if use default. :param param_attr: The parameter attribute. See ParameterAttribute for details.
:type param_attr: ParameterAttribute :type param_attr: ParameterAttribute
:return: A TableProjection Object. :return: TableProjection Object.
:rtype: TableProjection :rtype: TableProjection
""" """
proj = TableProjection( proj = TableProjection(
...@@ -547,7 +547,7 @@ def table_projection(input, size=0, param_attr=None): ...@@ -547,7 +547,7 @@ def table_projection(input, size=0, param_attr=None):
def identity_projection(input, offset=None, size=None): def identity_projection(input, offset=None, size=None):
""" """
1. IdentityProjection if offset=None. It performs: 1. If offset=None, it performs IdentityProjection as follows:
.. math:: .. math::
out.row[i] += in.row[i] out.row[i] += in.row[i]
...@@ -559,9 +559,8 @@ def identity_projection(input, offset=None, size=None): ...@@ -559,9 +559,8 @@ def identity_projection(input, offset=None, size=None):
proj = identity_projection(input=layer) proj = identity_projection(input=layer)
2. IdentityOffsetProjection if offset!=None. It likes IdentityProjection, 2. If offset!=None, it executes IdentityOffsetProjection and takes the
but layer size may be smaller than input size. elements of the input in the range [offset, offset+size) as output.
It select dimesions [offset, offset+layer_size) from input:
.. math:: .. math::
out.row[i] += in.row[i + \\textrm{offset}] out.row[i] += in.row[i + \\textrm{offset}]
...@@ -573,14 +572,20 @@ def identity_projection(input, offset=None, size=None): ...@@ -573,14 +572,20 @@ def identity_projection(input, offset=None, size=None):
proj = identity_projection(input=layer, proj = identity_projection(input=layer,
offset=10) offset=10)
Note that both of two projections should not have any parameter. Note that neither of the projections has any trainable parameters.
:param input: The input of this layer. :param input: The input of this layer.
:type input: LayerOutput :type input: LayerOutput
:param offset: Offset, None if use default. :param offset: The offset from the start of the input. The input's
elements in the range [offset, offset+size) will be
taken as output. If this parameter is not set or set
to None, the output will be the same as the input.
:type offset: int :type offset: int
:return: A IdentityProjection or IdentityOffsetProjection object :param size: The dimension of this layer. It will be ignored
:rtype: IdentityProjection or IdentityOffsetProjection when offset is None or not set.
:type size: int
:return: IdentityProjection or IdentityOffsetProjection object
:rtype: IdentityProjection | IdentityOffsetProjection
""" """
if offset is None: if offset is None:
proj = IdentityProjection(input_layer_name=input.name) proj = IdentityProjection(input_layer_name=input.name)
...@@ -596,8 +601,8 @@ def identity_projection(input, offset=None, size=None): ...@@ -596,8 +601,8 @@ def identity_projection(input, offset=None, size=None):
def slice_projection(input, slices): def slice_projection(input, slices):
""" """
slice_projection can slice the input value into multiple parts, slice_projection slices the input value into multiple parts,
and then select some of them to merge into a new output. then selects and merges some of them into a new output.
.. math:: .. math::
output = [input.slices()] output = [input.slices()]
...@@ -608,15 +613,13 @@ def slice_projection(input, slices): ...@@ -608,15 +613,13 @@ def slice_projection(input, slices):
proj = slice_projection(input=layer, slices=[(0, 10), (20, 30)]) proj = slice_projection(input=layer, slices=[(0, 10), (20, 30)])
Note that slice_projection should not have any parameter. Note that slice_projection has no trainable parameter.
:param input: The input of this layer. :param input: The input of this layer.
:type input: LayerOutput :type input: LayerOutput
:param slices: An array of slice parameters. :param slices: A list of start and end offsets of each slice.
Each slice contains the start and end offsets based :type slices: list of tuple
on the input. :return: SliceProjection object.
:type slices: pair of int
:return: A SliceProjection object
:rtype: SliceProjection :rtype: SliceProjection
""" """
assert len(slices) >= 1 assert len(slices) >= 1
...@@ -636,8 +639,7 @@ def slice_projection(input, slices): ...@@ -636,8 +639,7 @@ def slice_projection(input, slices):
@wrap_param_attr_default() @wrap_param_attr_default()
def scaling_projection(input, param_attr=None): def scaling_projection(input, param_attr=None):
""" """
scaling_projection multiplies the input with a scalar parameter and add to scaling_projection multiplies the input with a scalar parameter.
the output.
.. math:: .. math::
out += w * in out += w * in
...@@ -650,9 +652,9 @@ def scaling_projection(input, param_attr=None): ...@@ -650,9 +652,9 @@ def scaling_projection(input, param_attr=None):
:param input: The input of this layer. :param input: The input of this layer.
:type input: LayerOutput :type input: LayerOutput
:param param_attr: Parameter config, None if use default. :param param_attr: The parameter attribute. See ParameterAttribute for details.
:type param_attr: ParameterAttribute :type param_attr: ParameterAttribute
:return: A ScalingProjection object :return: ScalingProjection object.
:rtype: ScalingProjection :rtype: ScalingProjection
""" """
proj = ScalingProjection(input_layer_name=input.name, **param_attr.attr) proj = ScalingProjection(input_layer_name=input.name, **param_attr.attr)
...@@ -663,8 +665,8 @@ def scaling_projection(input, param_attr=None): ...@@ -663,8 +665,8 @@ def scaling_projection(input, param_attr=None):
@wrap_param_attr_default() @wrap_param_attr_default()
def dotmul_projection(input, param_attr=None): def dotmul_projection(input, param_attr=None):
""" """
DotMulProjection with a layer as input. DotMulProjection takes a layer as input and performs
It performs element-wise multiplication with weight. element-wise multiplication with weight.
.. math:: .. math::
out.row[i] += in.row[i] .* weight out.row[i] += in.row[i] .* weight
...@@ -679,9 +681,9 @@ def dotmul_projection(input, param_attr=None): ...@@ -679,9 +681,9 @@ def dotmul_projection(input, param_attr=None):
:param input: The input of this layer. :param input: The input of this layer.
:type input: LayerOutput :type input: LayerOutput
:param param_attr: Parameter config, None if use default. :param param_attr: The parameter attribute. See ParameterAttribute for details.
:type param_attr: ParameterAttribute :type param_attr: ParameterAttribute
:return: A DotMulProjection Object. :return: DotMulProjection object.
:rtype: DotMulProjection :rtype: DotMulProjection
""" """
proj = DotMulProjection( proj = DotMulProjection(
...@@ -698,7 +700,7 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs): ...@@ -698,7 +700,7 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs):
out.row[i] += scale * (a.row[i] .* b.row[i]) out.row[i] += scale * (a.row[i] .* b.row[i])
where :math:`.*` means element-wise multiplication, and where :math:`.*` means element-wise multiplication, and
scale is a config scalar, its default value is one. scale is a config scalar, its default value is 1.
The example usage is: The example usage is:
...@@ -706,13 +708,13 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs): ...@@ -706,13 +708,13 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs):
op = dotmul_operator(a=layer1, b=layer2, scale=0.5) op = dotmul_operator(a=layer1, b=layer2, scale=0.5)
:param a: Input layer1 :param a: The first input of this layer.
:type a: LayerOutput :type a: LayerOutput
:param b: Input layer2 :param b: The second input of this layer.
:type b: LayerOutput :type b: LayerOutput
:param scale: config scalar, default value is one. :param scale: A scalar to scale the product. Its default value is 1.
:type scale: float :type scale: float
:return: A DotMulOperator Object. :return: DotMulOperator object.
:rtype: DotMulOperator :rtype: DotMulOperator
""" """
if 'x' in kwargs or 'y' in kwargs: if 'x' in kwargs or 'y' in kwargs:
...@@ -738,28 +740,29 @@ def context_projection(input, ...@@ -738,28 +740,29 @@ def context_projection(input,
""" """
Context Projection. Context Projection.
It just simply reorganizes input sequence, combines "context_len" sequence It simply reorganizes the input sequence, combining "context_len" elements of
to one context from context_start. "context_start" will be set to the sequence into one context starting from context_start. "context_start" will be set to
-(context_len - 1) / 2 by default. If context position out of sequence -(context_len - 1) / 2 by default. When context position is out of sequence
length, padding will be filled as zero if padding_attr = False, otherwise length, padding will be filled as zero if padding_attr = False, otherwise
it is trainable. it is trainable.
For example, origin sequence is [A B C D E F G], context len is 3, then For example, if the original sequence is [A B C D E F G], context len is 3 and padding_attr
after context projection and not set padding_attr, sequence will is not set, then after context projection the sequence will
be [ 0AB ABC BCD CDE DEF EFG FG0 ]. be [ 0AB ABC BCD CDE DEF EFG FG0 ].
:param input: The input of this layer, which should be a sequence. :param input: The input of this layer, which should be a sequence.
:type input: LayerOutput :type input: LayerOutput
:param context_len: context length. :param context_len: The length of the context.
:type context_len: int :type context_len: int
:param context_start: context start position. Default is :param context_start: The start position of the context. The default value is
-(context_len - 1)/2 -(context_len - 1)/2
:type context_start: int :type context_start: int
:param padding_attr: Padding Parameter Attribute. If false, it means padding :param padding_attr: Parameter attribute of the padding. If the parameter is
always be zero. Otherwise Padding is learnable, and set to False, padding will be zero. In other cases, the
parameter attribute is set by this parameter. padding is trainable, and its parameter attribute is set
by this parameter.
:type padding_attr: bool | ParameterAttribute :type padding_attr: bool | ParameterAttribute
:return: Projection :return: Projection object.
:rtype: Projection :rtype: Projection
""" """
context_start = -( context_start = -(
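A minimal usage sketch of the projection documented above (layer names and sizes are illustrative, not part of this patch); the mixed_layer size here is set to the input width times context_len:

    seq = data_layer(name='seq_in', size=128)
    with mixed_layer(size=128 * 3) as ctx:
        ctx += context_projection(input=seq, context_len=3)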
...@@ -791,10 +794,9 @@ class MixedLayerType(LayerOutput): ...@@ -791,10 +794,9 @@ class MixedLayerType(LayerOutput):
def __init__(self, name, size, act, bias_attr, layer_attr, parents=None): def __init__(self, name, size, act, bias_attr, layer_attr, parents=None):
""" """
Ctor. :param name: The name of this layer.
:param name: layer name.
:type name: basestring :type name: basestring
:param size: layer size. :param size: The dimension of this layer.
:type size: int :type size: int
:param act: Activation type. :param act: Activation type.
:type act: BaseActivation :type act: BaseActivation
...@@ -802,8 +804,9 @@ class MixedLayerType(LayerOutput): ...@@ -802,8 +804,9 @@ class MixedLayerType(LayerOutput):
whose type is not ParameterAttribute, no bias is defined. If the whose type is not ParameterAttribute, no bias is defined. If the
parameter is set to True, the bias is initialized to zero. parameter is set to True, the bias is initialized to zero.
:type bias_attr: ParameterAttribute | None | bool | Any :type bias_attr: ParameterAttribute | None | bool | Any
:param layer_attr: Extra Layer Attribute. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
:type layer_attr: ExtraLayerAttribute or None details.
:type layer_attr: ExtraLayerAttribute | None
""" """
LayerOutput.__init__( LayerOutput.__init__(
self, self,
...@@ -868,12 +871,12 @@ def mixed_layer(size=0, ...@@ -868,12 +871,12 @@ def mixed_layer(size=0,
bias_attr=False, bias_attr=False,
layer_attr=None): layer_attr=None):
""" """
Mixed Layer. A mixed layer will add all inputs together, then activate. Mixed Layer. A mixed layer will add all inputs together, then activate the sum.
Each inputs is a projection or operator. Each input is a projection or operator.
There are two styles of usages. There are two styles of usages.
1. When not set inputs parameter, use mixed_layer like this: 1. When the parameter input is not set, use mixed_layer like this:
.. code-block:: python .. code-block:: python
...@@ -889,21 +892,21 @@ def mixed_layer(size=0, ...@@ -889,21 +892,21 @@ def mixed_layer(size=0,
input=[full_matrix_projection(input=layer1), input=[full_matrix_projection(input=layer1),
full_matrix_projection(input=layer2)]) full_matrix_projection(input=layer2)])
:param name: mixed layer name. Can be referenced by other layer. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param size: layer size. :param size: The dimension of this layer.
:type size: int :type size: int
:param input: The input of this layer. It is an optional parameter. If set, :param input: The input of this layer. It is an optional parameter.
then this function will just return layer's name.
:param act: Activation Type. LinearActivation is the default activation. :param act: Activation Type. LinearActivation is the default activation.
:type act: BaseActivation :type act: BaseActivation
:param bias_attr: The bias attribute. If the parameter is set to False or an object :param bias_attr: The bias attribute. If the parameter is set to False or an object
whose type is not ParameterAttribute, no bias is defined. If the whose type is not ParameterAttribute, no bias is defined. If the
parameter is set to True, the bias is initialized to zero. parameter is set to True, the bias is initialized to zero.
:type bias_attr: ParameterAttribute | None | bool | Any :type bias_attr: ParameterAttribute | None | bool | Any
:param layer_attr: The extra layer config. Default is None. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute :type layer_attr: ExtraLayerAttribute
:return: MixedLayerType object can add inputs or layer name. :return: MixedLayerType object.
:rtype: MixedLayerType :rtype: MixedLayerType
""" """
...@@ -938,14 +941,15 @@ def data_layer(name, size, depth=None, height=None, width=None, ...@@ -938,14 +941,15 @@ def data_layer(name, size, depth=None, height=None, width=None,
:param name: The name of this layer. :param name: The name of this layer.
:type name: basestring :type name: basestring
:param size: Size of this data layer. :param size: The dimension of this data layer.
:type size: int :type size: int
:param height: Height of this data layer, used for image :param height: The height of the input image data.
:type height: int | None :type height: int | None
:param width: Width of this data layer, used for image :param width: The width of the input image data.
:type width: int | None :type width: int | None
:param layer_attr: Extra Layer Attribute. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
:type layer_attr: ExtraLayerAttribute. details.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
""" """
...@@ -978,14 +982,15 @@ def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None): ...@@ -978,14 +982,15 @@ def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None):
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param input: The input of this layer, which must be Index Data. :param input: The input of this layer, whose type must be Index Data.
:type input: LayerOutput :type input: LayerOutput
:param size: The embedding dimension. :param size: The dimension of the embedding vector.
:type size: int :type size: int
:param param_attr: The embedding parameter attribute. See ParameterAttribute :param param_attr: The embedding parameter attribute. See ParameterAttribute
for details. for details.
:type param_attr: ParameterAttribute | None :type param_attr: ParameterAttribute
:param layer_attr: Extra layer Config. Default is None. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute | None :type layer_attr: ExtraLayerAttribute | None
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
...@@ -1013,7 +1018,7 @@ def fc_layer(input, ...@@ -1013,7 +1018,7 @@ def fc_layer(input,
bias_attr=None, bias_attr=None,
layer_attr=None): layer_attr=None):
""" """
Helper for declare fully connected layer. The fully connected layer.
The example usage is: The example usage is:
...@@ -1035,17 +1040,18 @@ def fc_layer(input, ...@@ -1035,17 +1040,18 @@ def fc_layer(input,
:type name: basestring :type name: basestring
:param input: The input of this layer. :param input: The input of this layer.
:type input: LayerOutput | list | tuple :type input: LayerOutput | list | tuple
:param size: The layer dimension. :param size: The dimension of this layer.
:type size: int :type size: int
:param act: Activation Type. TanhActivation is the default activation. :param act: Activation Type. TanhActivation is the default activation.
:type act: BaseActivation :type act: BaseActivation
:param param_attr: The Parameter Attribute|list. :param param_attr: The parameter attribute. See ParameterAttribute for details.
:type param_attr: ParameterAttribute :type param_attr: ParameterAttribute
:param bias_attr: The bias attribute. If the parameter is set to False or an object :param bias_attr: The bias attribute. If the parameter is set to False or an object
whose type is not ParameterAttribute, no bias is defined. If the whose type is not ParameterAttribute, no bias is defined. If the
parameter is set to True, the bias is initialized to zero. parameter is set to True, the bias is initialized to zero.
:type bias_attr: ParameterAttribute | None | bool | Any :type bias_attr: ParameterAttribute | None | bool | Any
:param layer_attr: Extra Layer config. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute | None :type layer_attr: ExtraLayerAttribute | None
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
...@@ -1086,13 +1092,15 @@ def fc_layer(input, ...@@ -1086,13 +1092,15 @@ def fc_layer(input,
@wrap_name_default("print") @wrap_name_default("print")
def printer_layer(input, format=None, name=None): def printer_layer(input, format=None, name=None):
""" """
Print the output value of input layers. This layer is useful for debugging. Print the output value of the layers specified by the parameter input.
This layer is useful for debugging.
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param input: The input of this layer. :param input: The input of this layer.
:type input: LayerOutput | list | tuple :type input: LayerOutput | list | tuple
:return: LayerOutput :return: LayerOutput object.
:rtype: LayerOutput
""" """
if isinstance(input, LayerOutput): if isinstance(input, LayerOutput):
input = [input] input = [input]
...@@ -1135,11 +1143,12 @@ def priorbox_layer(input, ...@@ -1135,11 +1143,12 @@ def priorbox_layer(input,
:param aspect_ratio: The aspect ratio. :param aspect_ratio: The aspect ratio.
:type aspect_ratio: list :type aspect_ratio: list
:param variance: The bounding box variance. :param variance: The bounding box variance.
:type min_size: The min size of the priorbox width/height. :param min_size: The minimum size of the priorbox width/height.
:param min_size: list :type min_size: list
:type max_size: The max size of the priorbox width/height. Could be NULL. :param max_size: The maximum size of the priorbox width/height. It could be NULL.
:param max_size: list :type max_size: list
:return: LayerOutput :return: LayerOutput object.
:rtype: LayerOutput
""" """
# plus one for ratio 1. # plus one for ratio 1.
num_filters = (len(aspect_ratio) * 2 + 1 + len(max_size)) * 4 num_filters = (len(aspect_ratio) * 2 + 1 + len(max_size)) * 4
...@@ -1177,7 +1186,7 @@ def multibox_loss_layer(input_loc, ...@@ -1177,7 +1186,7 @@ def multibox_loss_layer(input_loc,
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param input_loc: The input predict locations. :param input_loc: The input predicted locations.
:type input_loc: LayerOutput | List of LayerOutput :type input_loc: LayerOutput | List of LayerOutput
:param input_conf: The input priorbox confidence. :param input_conf: The input priorbox confidence.
:type input_conf: LayerOutput | List of LayerOutput :type input_conf: LayerOutput | List of LayerOutput
...@@ -1189,13 +1198,15 @@ def multibox_loss_layer(input_loc, ...@@ -1189,13 +1198,15 @@ def multibox_loss_layer(input_loc,
:type num_classes: int :type num_classes: int
:param overlap_threshold: The threshold of the overlap. :param overlap_threshold: The threshold of the overlap.
:type overlap_threshold: float :type overlap_threshold: float
:param neg_pos_ratio: The ratio of the negative bbox to the positive bbox. :param neg_pos_ratio: The ratio of the negative bounding box to
the positive bounding box.
:type neg_pos_ratio: float :type neg_pos_ratio: float
:param neg_overlap: The negative bbox overlap threshold. :param neg_overlap: The negative bounding box overlap threshold.
:type neg_overlap: float :type neg_overlap: float
:param background_id: The background class index. :param background_id: The background class index.
:type background_id: int :type background_id: int
:return: LayerOutput :return: LayerOutput object.
:rtype: LayerOutput
""" """
if isinstance(input_loc, LayerOutput): if isinstance(input_loc, LayerOutput):
input_loc = [input_loc] input_loc = [input_loc]
...@@ -1258,19 +1269,20 @@ def detection_output_layer(input_loc, ...@@ -1258,19 +1269,20 @@ def detection_output_layer(input_loc,
:type input_conf: LayerOutput | List of LayerOutput. :type input_conf: LayerOutput | List of LayerOutput.
:param priorbox: The input priorbox location and the variance. :param priorbox: The input priorbox location and the variance.
:type priorbox: LayerOutput :type priorbox: LayerOutput
:param num_classes: The number of the classification. :param num_classes: The number of the classes.
:type num_classes: int :type num_classes: int
:param nms_threshold: The Non-maximum suppression threshold. :param nms_threshold: The Non-maximum suppression threshold.
:type nms_threshold: float :type nms_threshold: float
:param nms_top_k: The bbox number kept of the NMS's output :param nms_top_k: The bounding boxes number kept of the NMS's output.
:type nms_top_k: int :type nms_top_k: int
:param keep_top_k: The bbox number kept of the layer's output :param keep_top_k: The bounding boxes number kept of the layer's output.
:type keep_top_k: int :type keep_top_k: int
:param confidence_threshold: The classification confidence threshold :param confidence_threshold: The classification confidence threshold.
:type confidence_threshold: float :type confidence_threshold: float
:param background_id: The background class index. :param background_id: The background class index.
:type background_id: int :type background_id: int
:return: LayerOutput :return: LayerOutput object.
:rtype: LayerOutput
""" """
if isinstance(input_loc, LayerOutput): if isinstance(input_loc, LayerOutput):
input_loc = [input_loc] input_loc = [input_loc]
...@@ -1326,7 +1338,7 @@ def roi_pool_layer(input, ...@@ -1326,7 +1338,7 @@ def roi_pool_layer(input,
A layer used by Fast R-CNN to extract feature maps of ROIs from the last A layer used by Fast R-CNN to extract feature maps of ROIs from the last
feature map. feature map.
:param name: The Layer Name. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param input: The input layer. :param input: The input layer.
:type input: LayerOutput. :type input: LayerOutput.
...@@ -1338,9 +1350,10 @@ def roi_pool_layer(input, ...@@ -1338,9 +1350,10 @@ def roi_pool_layer(input,
:type pooled_height: int :type pooled_height: int
:param spatial_scale: The spatial scale between the image and feature map. :param spatial_scale: The spatial scale between the image and feature map.
:type spatial_scale: float :type spatial_scale: float
:param num_channels: number of input channel. :param num_channels: The number of the input channels.
:type num_channels: int :type num_channels: int
:return: LayerOutput :return: LayerOutput object.
:rtype: LayerOutput
""" """
if num_channels is None: if num_channels is None:
assert input.num_filters is not None assert input.num_filters is not None
...@@ -1361,18 +1374,19 @@ def roi_pool_layer(input, ...@@ -1361,18 +1374,19 @@ def roi_pool_layer(input,
@wrap_name_default("cross_channel_norm") @wrap_name_default("cross_channel_norm")
def cross_channel_norm_layer(input, name=None, param_attr=None): def cross_channel_norm_layer(input, name=None, param_attr=None):
""" """
Normalize a layer's output. This layer is necessary for ssd. Normalize a layer's output. This layer is necessary for ssd. This
This layer applys normalize across the channels of each sample to layer applies normalization across the channels of each sample to
a conv layer's output and scale the output by a group of trainable a convolutional layer's output and scales the output by a group of
factors which dimensions equal to the channel's number. trainable factors whose dimension equals the number of channels.
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param input: The input of this layer. :param input: The input of this layer.
:type input: LayerOutput :type input: LayerOutput
:param param_attr: The Parameter Attribute|list. :param param_attr: The parameter attribute. See ParameterAttribute for details.
:type param_attr: ParameterAttribute :type param_attr: ParameterAttribute
:return: LayerOutput :return: LayerOutput object.
:rtype: LayerOutput
""" """
assert input.num_filters is not None assert input.num_filters is not None
Layer( Layer(
...@@ -1413,12 +1427,9 @@ def pooling_layer(input, ...@@ -1413,12 +1427,9 @@ def pooling_layer(input,
Pooling layer for sequence inputs, not used for Image. Pooling layer for sequence inputs, not used for Image.
If stride > 0, this layer slides a window whose size is determined by stride, If stride > 0, this layer slides a window whose size is determined by stride,
and return the pooling value of the window as the output. Thus, a long sequence and returns the pooling value of the sequence in the window as the output. Thus,
will be shorten. a long sequence will be shortened. Note that for sequences with sub-sequences, the
default value of stride is -1.
The parameter stride specifies the intervals at which to apply the pooling
operation. Note that for sequence with sub-sequence, the default value
of stride is -1.
The example usage is: The example usage is:
...@@ -1435,16 +1446,16 @@ def pooling_layer(input, ...@@ -1435,16 +1446,16 @@ def pooling_layer(input,
:type name: basestring :type name: basestring
:param input: The input of this layer. :param input: The input of this layer.
:type input: LayerOutput :type input: LayerOutput
:param pooling_type: Type of pooling, MaxPooling(default), AvgPooling, :param pooling_type: Type of pooling. MaxPooling is the default pooling.
SumPooling, SquareRootNPooling.
:type pooling_type: BasePoolingType | None :type pooling_type: BasePoolingType | None
:param stride: The step size between successive pooling regions. :param stride: The step size between successive pooling regions.
:type stride: Int :type stride: int
:param bias_attr: The bias attribute. If the parameter is set to False or an object :param bias_attr: The bias attribute. If the parameter is set to False or an object
whose type is not ParameterAttribute, no bias is defined. If the whose type is not ParameterAttribute, no bias is defined. If the
parameter is set to True, the bias is initialized to zero. parameter is set to True, the bias is initialized to zero.
:type bias_attr: ParameterAttribute | None | bool | Any :type bias_attr: ParameterAttribute | None | bool | Any
:param layer_attr: The Extra Attributes for layer, such as dropout. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute | None :type layer_attr: ExtraLayerAttribute | None
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
...@@ -6618,7 +6629,7 @@ def row_conv_layer(input, ...@@ -6618,7 +6629,7 @@ def row_conv_layer(input,
.. math:: .. math::
r_{t,r} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}} r_{t,r} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}}
\quad \text{for} \quad (1 \leq i \leq d) \quad \\text{for} \quad (1 \leq i \leq d)
Note: Note:
The `context_len` is `k + 1`. That is to say, the lookahead step The `context_len` is `k + 1`. That is to say, the lookahead step
...@@ -6767,7 +6778,7 @@ def gated_unit_layer(input, ...@@ -6767,7 +6778,7 @@ def gated_unit_layer(input,
The gated unit layer implements a simple gating mechanism over the input. The gated unit layer implements a simple gating mechanism over the input.
The input :math:`X` is first projected into a new space :math:`X'`, and The input :math:`X` is first projected into a new space :math:`X'`, and
it is also used to produce a gate weight :math:`\sigma`. Element-wise it is also used to produce a gate weight :math:`\sigma`. Element-wise
product between :match:`X'` and :math:`\sigma` is finally returned. product between :math:`X'` and :math:`\sigma` is finally returned.
Reference: Reference:
`Language Modeling with Gated Convolutional Networks `Language Modeling with Gated Convolutional Networks
...@@ -7463,7 +7474,7 @@ def factorization_machine(input, ...@@ -7463,7 +7474,7 @@ def factorization_machine(input,
Factorization Machine with the formula: Factorization Machine with the formula:
.. math:: .. math::
y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \\rangle x_i x_j
Note: Note:
X is the input vector with size n. V is the factor matrix. Each row of V X is the input vector with size n. V is the factor matrix. Each row of V
......
...@@ -15,14 +15,15 @@ import backward ...@@ -15,14 +15,15 @@ import backward
import regularizer import regularizer
from param_attr import ParamAttr from param_attr import ParamAttr
from data_feeder import DataFeeder from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, GPUPlace from core import LoDTensor, CPUPlace, CUDAPlace
from distribute_transpiler import DistributeTranspiler
import clip import clip
Tensor = LoDTensor Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + [ __all__ = framework.__all__ + executor.__all__ + [
'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward', 'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward',
'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr' 'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor', 'ParamAttr',
'DataFeeder', 'clip' 'DataFeeder', 'clip', 'DistributeTranspiler'
] ]
...@@ -41,5 +42,10 @@ def __read_gflags_from_env__(): ...@@ -41,5 +42,10 @@ def __read_gflags_from_env__():
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)]) ["--tryfromenv=" + ",".join(read_env_flags)])
if core.is_compile_gpu():
core.init_devices(["CPU", "GPU:0"])
else:
core.init_devices(["CPU"])
__read_gflags_from_env__() __read_gflags_from_env__()
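A small sketch of what the new initialization above means for user code (assuming a CPU-only build; names are illustrative): importing fluid now calls core.init_devices itself, so an executor can be created directly on the desired place.

    import paddle.v2.fluid as fluid

    # Device initialization already happened at import time via
    # __read_gflags_from_env__(), so no explicit init call is needed.
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)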
import framework
from framework import Program, default_main_program, Parameter, Variable
import optimizer
from layer_helper import LayerHelper
def hash_name_to_server(params_grads, pserver_endpoints):
"""
:param params_grads: a list of (parameter, gradient) pairs
:return: a map of pserver endpoint ->
params -> [param list]
grads -> [grad list]
"""
def _hash_param(param_name, total):
return hash(param_name) % total
param_grad_map = dict()
for param, grad in params_grads:
if param.trainable is True and grad is not None:
server_id = _hash_param(param.name, len(pserver_endpoints))
server_for_param = pserver_endpoints[server_id]
if not param_grad_map.has_key(server_for_param):
param_grad_map[server_for_param] = {"params": [], "grads": []}
param_grad_map[server_for_param]["params"].append(param)
param_grad_map[server_for_param]["grads"].append(grad)
return param_grad_map
def round_robin(params_grads, pserver_endpoints):
assert (len(params_grads) > len(pserver_endpoints))
param_grad_map = dict()
pserver_idx = 0
for param, grad in params_grads:
if param.trainable is True:
server_for_param = pserver_endpoints[pserver_idx]
if not param_grad_map.has_key(server_for_param):
param_grad_map[server_for_param] = {"params": [], "grads": []}
param_grad_map[server_for_param]["params"].append(param)
param_grad_map[server_for_param]["grads"].append(grad)
pserver_idx += 1
if pserver_idx >= len(pserver_endpoints):
pserver_idx = 0
return param_grad_map
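# --- Illustrative sketch, not part of this module ----------------------------
# A self-contained example of how the two split methods above assign
# parameters to parameter-server endpoints. _FakeParam is a hypothetical
# stand-in exposing only the attributes the split methods read; real callers
# pass the (param, grad) pairs returned by Optimizer.minimize.
class _FakeParam(object):
    def __init__(self, name):
        self.name = name
        self.trainable = True


def _demo_split_methods():
    endpoints = ["127.0.0.1:6174", "127.0.0.1:6175"]
    pairs = [(_FakeParam("fc_0.w_0"), "fc_0.w_0@GRAD"),
             (_FakeParam("fc_0.b_0"), "fc_0.b_0@GRAD"),
             (_FakeParam("fc_1.w_0"), "fc_1.w_0@GRAD")]
    # round_robin alternates endpoints per parameter, while
    # hash_name_to_server groups by hash(param.name) % len(endpoints).
    print round_robin(pairs, endpoints)
    print hash_name_to_server(pairs, endpoints)
# ------------------------------------------------------------------------------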
class DistributeTranspiler:
def transpile(self,
optimize_ops,
params_grads,
program=None,
pservers="127.0.0.1:6174",
trainers=1,
split_method=round_robin):
"""
Transpile the program to a distributed data-parallelism programs.
The main_program will be transformed to use a remote parameter server
to do parameter optimization, and the optimization graph will be put
into a parameter server program.
Use different methods to split trainable variables across different
parameter servers.
Example to run:
exe = fluid.Executor(place)
t = fluid.DistributeTranspiler()
t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1)
pserver_endpoint = os.getenv("PSERVER")
if pserver_endpoint:
pserver_prog = t.get_pserver_program(pserver_endpoint, optimize_ops)
exe.run(fluid.default_startup_program())
exe.run(pserver_prog)
else:
feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
exe.run(fluid.default_startup_program())
for pass_id in range(PASS_NUM):
...
:param optimize_ops: op list of optimization, should be the
return value of Optimizer.minimize
:type optimize_ops: list
:param program: program to optimize, default default_main_program
:param pservers: parameter server endpoints like "m1:6174,m2:6174"
:type pservers: string
:return: return a list of programs
"""
if program is None:
program = default_main_program()
self.trainers = trainers
self._optimize_distributed(
optimize_ops,
program,
params_grads,
pservers=pservers,
trainers=trainers,
split_method=split_method)
def _clone_param(self, block, v):
assert isinstance(v, Parameter)
new_p = Parameter(
block=block,
shape=v.shape,
dtype=v.dtype,
type=v.type,
lod_level=v.lod_level,
stop_gradient=v.stop_gradient,
trainable=v.trainable,
optimize_attr=v.optimize_attr,
regularizer=v.regularizer,
name=v.name)
block.vars[new_p.name] = new_p
def _clone_var(self, block, var):
assert isinstance(var, Variable)
return block.create_var(
name=var.name,
shape=var.shape,
dtype=var.dtype,
type=var.type,
lod_level=var.lod_level,
persistable=var.persistable)
def _optimize_distributed(self, optimize_ops, program, params_and_grads,
**kwargs):
if kwargs.has_key("split_method"):
split_method = kwargs["split_method"]
else:
split_method = round_robin
assert (callable(split_method))
pserver_endpoints = kwargs["pservers"].split(",")
self.param_grad_map = split_method(params_and_grads, pserver_endpoints)
send_op_ordered_inputs = []
epmap = []
for ep, v in self.param_grad_map.iteritems():
send_op_ordered_inputs.extend(v["grads"])
for i in v["grads"]:
epmap.append(ep)
send_op = program.global_block().append_op(
type="send",
inputs={"X": send_op_ordered_inputs
}, # inputs is a list of tensors to be send
outputs={},
attrs={"endpoints": pserver_endpoints,
"epmap": epmap})
def get_trainer_program(optimize_ops, program):
# remove optimize ops and add a send op to main_program
program.global_block().delete_ops(optimize_ops)
def _create_var_for_trainers(self, block, var, trainers):
var_list = []
for i in xrange(trainers):
var_each = block.create_var(
name="%s.trainer_%d" % (var.name, i),
persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
var_list.append(var_each)
return var_list
def get_pserver_program(self, endpoint, optimize_ops):
pserver_program = Program()
for v in self.param_grad_map[endpoint]["params"]:
self._clone_param(pserver_program.global_block(), v)
optimize_sub_program = Program()
grad_var_names = [
var.name for var in self.param_grad_map[endpoint]["grads"]
]
for opt_op in optimize_ops:
for _, var in opt_op.inputs.iteritems():
# NOTE: append operators to merge gradients from multiple
# trainers. If trainers == 1, this is not needed.
if self.trainers > 1 and var.name in grad_var_names:
vars2merge = self._create_var_for_trainers(
optimize_sub_program.global_block(), var, self.trainers)
merged_var = optimize_sub_program.global_block().create_var(
name=var.name,
persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
optimize_sub_program.global_block().append_op(
type="sum",
inputs={"X": vars2merge},
outputs={"Out": merged_var})
optimize_sub_program.global_block().append_op(
type="scale",
inputs={"X": merged_var},
outputs={"Out": merged_var},
attrs={"scale": 1.0 / float(self.trainers)})
else:
optimize_sub_program.global_block().create_var(
name=var.name,
persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
if opt_op.inputs.has_key("Grad"):
if opt_op.inputs["Grad"].name in grad_var_names:
print "appending ", opt_op.type, opt_op.inputs
optimize_sub_program.global_block().append_op(
type=opt_op.type,
inputs=opt_op.inputs,
outputs=opt_op.outputs,
attrs=opt_op.attrs)
else:
optimize_sub_program.global_block().append_op(
type=opt_op.type,
inputs=opt_op.inputs,
outputs=opt_op.outputs,
attrs=opt_op.attrs)
pserver_program.global_block().append_op(
type="recv",
inputs={"RX":
self.param_grad_map[endpoint]["grads"]}, # grads to recv
outputs={},
attrs={
"OptimizeProgram": optimize_sub_program.desc,
"endpoint": endpoint,
"ParamList":
[p.name for p in self.param_grad_map[endpoint]["params"]],
"GradList":
[p.name for p in self.param_grad_map[endpoint]["grads"]],
"Trainers": self.trainers
})
pserver_program.sync_with_cpp()
return pserver_program
import numpy as np import numpy as np
from . import core from . import core
from framework import Program, default_main_program from framework import Program, default_main_program, Parameter, Variable
__all__ = ['Executor', 'g_scope'] __all__ = ['Executor', 'g_scope']
...@@ -47,13 +47,14 @@ class Executor(object): ...@@ -47,13 +47,14 @@ class Executor(object):
act_places.append(p) act_places.append(p)
# TODO(dzhwinter) : consider that our fluid tests all written in # TODO(dzhwinter) : consider that our fluid tests all written in
# GPUPlace(gpu_id), this will be changed in next PR. # CUDAPlace(gpu_id), this will be changed in the future
if core.is_compile_gpu(): if core.is_compile_gpu():
core.init_devices(["CPU", "GPU:0"]) core.init_devices(["CPU", "GPU:0"])
else: else:
core.init_devices(["CPU"]) core.init_devices(["CPU"])
self.executor = core.Executor(act_places) # TODO(dzhwinter) : only use the first place
self.executor = core.Executor(act_places[0])
self.places = places self.places = places
def aslodtensor(self, data): def aslodtensor(self, data):
...@@ -148,7 +149,7 @@ class Executor(object): ...@@ -148,7 +149,7 @@ class Executor(object):
outputs={'Out': [fetch_var]}, outputs={'Out': [fetch_var]},
attrs={'col': i}) attrs={'col': i})
self.executor.run(program.desc, scope, 0, True) self.executor.run(program.desc, scope, 0, True, True)
outs = [ outs = [
core.get_fetch_variable(scope, fetch_var_name, i) core.get_fetch_variable(scope, fetch_var_name, i)
for i in xrange(len(fetch_list)) for i in xrange(len(fetch_list))
......
...@@ -17,6 +17,10 @@ TEMP_VAR_NAME = core.kTempVarName() ...@@ -17,6 +17,10 @@ TEMP_VAR_NAME = core.kTempVarName()
GRAD_VAR_SUFFIX = core.kGradVarSuffix() GRAD_VAR_SUFFIX = core.kGradVarSuffix()
ZERO_VAR_SUFFIX = core.kZeroVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
USE_CPU = core.kUseCPU()
USE_CUDNN = core.kUseMKLDNN()
USE_MKLDNN = core.kUseMKLDNN()
def grad_var_name(var_name): def grad_var_name(var_name):
""" """
...@@ -359,6 +363,10 @@ class Operator(object): ...@@ -359,6 +363,10 @@ class Operator(object):
""" """
self.block = block self.block = block
self.desc = desc self.desc = desc
# for clone a new operator
self.inputs = inputs
self.outputs = outputs
self.attrs = attrs
if len(self.desc.type()) != 0: if len(self.desc.type()) != 0:
return return
if type is None: if type is None:
...@@ -389,7 +397,10 @@ class Operator(object): ...@@ -389,7 +397,10 @@ class Operator(object):
% (in_proto.name, len(in_args))) % (in_proto.name, len(in_args)))
in_arg_names = [] in_arg_names = []
for arg in in_args: for arg in in_args:
in_arg_names.append(arg.name) if isinstance(arg, basestring):
in_arg_names.append(arg)
else:
in_arg_names.append(arg.name)
self.desc.set_input(in_proto.name, in_arg_names) self.desc.set_input(in_proto.name, in_arg_names)
else: else:
self.desc.set_input(in_proto.name, []) self.desc.set_input(in_proto.name, [])
...@@ -430,13 +441,18 @@ class Operator(object): ...@@ -430,13 +441,18 @@ class Operator(object):
continue continue
if isinstance(attrs[attr_name], Block): if isinstance(attrs[attr_name], Block):
self.desc.set_block_attr(attr_name, attrs[attr_name].desc) self.desc.set_block_attr(attr_name, attrs[attr_name].desc)
elif isinstance(attrs[attr_name], core.BlockDesc) or \
isinstance(attrs[attr_name], core.ProgramDesc):
self.desc.set_serialized_attr(
attr_name, attrs[attr_name].serialize_to_string())
else: else:
self.desc.set_attr(attr_name, attrs[attr_name]) self.desc.set_attr(attr_name, attrs[attr_name])
self.desc.check_attrs() self.desc.check_attrs()
no_kernel_op_set = { no_kernel_op_set = {
'feed', 'fetch', 'save', 'load', 'recurrent', 'feed', 'fetch', 'save', 'load', 'recurrent',
'rnn_memory_helper_grad', 'conditional_block', 'while' 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
'recv'
} }
if type not in no_kernel_op_set: if type not in no_kernel_op_set:
self.desc.infer_var_type(self.block.desc) self.desc.infer_var_type(self.block.desc)
...@@ -582,6 +598,7 @@ class Block(object): ...@@ -582,6 +598,7 @@ class Block(object):
self.vars = dict() # var_name --> var self.vars = dict() # var_name --> var
self.ops = collections.deque() # operator list self.ops = collections.deque() # operator list
self.program = program self.program = program
self.removed_vars = dict()
def __str__(self): def __str__(self):
return self.to_string(True) return self.to_string(True)
...@@ -638,6 +655,16 @@ class Block(object): ...@@ -638,6 +655,16 @@ class Block(object):
self.ops.append(op) self.ops.append(op)
return op return op
def delete_ops(self, ops):
# remove from cpp
# FIXME(typhoonzero): remove only the first occurrence.
try:
start = list(self.ops).index(ops[0])
end = list(self.ops).index(ops[-1])
except Exception, e:
raise e
self.desc.remove_op(start, end)
def prepend_op(self, *args, **kwargs): def prepend_op(self, *args, **kwargs):
op_desc = self.desc.prepend_op() op_desc = self.desc.prepend_op()
op = Operator(self, op_desc, *args, **kwargs) op = Operator(self, op_desc, *args, **kwargs)
......
...@@ -194,3 +194,9 @@ class LayerHelper(object): ...@@ -194,3 +194,9 @@ class LayerHelper(object):
else: else:
# For integer and boolean types, initialize with all zeros # For integer and boolean types, initialize with all zeros
return Constant() return Constant()
def is_instance(self, param_name, cls):
param = self.kwargs.get(param_name, None)
if not isinstance(param, cls):
raise TypeError("The input {0} parameter of method {1} must be {2}",
param_name, self.layer_type, cls.__name__)
...@@ -3,6 +3,7 @@ from ..framework import Program, Variable, Operator ...@@ -3,6 +3,7 @@ from ..framework import Program, Variable, Operator
from .. import core from .. import core
from tensor import assign, fill_constant from tensor import assign, fill_constant
import contextlib import contextlib
from ..registry import autodoc
__all__ = [ __all__ = [
'split_lod_tensor', 'merge_lod_tensor', 'BlockGuard', 'StaticRNNGuard', 'split_lod_tensor', 'merge_lod_tensor', 'BlockGuard', 'StaticRNNGuard',
...@@ -10,7 +11,7 @@ __all__ = [ ...@@ -10,7 +11,7 @@ __all__ = [
'max_sequence_len', 'topk', 'lod_tensor_to_array', 'array_to_lod_tensor', 'max_sequence_len', 'topk', 'lod_tensor_to_array', 'array_to_lod_tensor',
'increment', 'array_write', 'create_array', 'less_than', 'array_read', 'increment', 'array_write', 'create_array', 'less_than', 'array_read',
'shrink_memory', 'array_length', 'IfElse', 'DynamicRNN', 'ConditionalBlock', 'shrink_memory', 'array_length', 'IfElse', 'DynamicRNN', 'ConditionalBlock',
'StaticRNN' 'StaticRNN', 'reorder_lod_tensor_by_rank'
] ]
...@@ -1082,3 +1083,18 @@ class DynamicRNN(object): ...@@ -1082,3 +1083,18 @@ class DynamicRNN(object):
if self.status != DynamicRNN.IN_RNN: if self.status != DynamicRNN.IN_RNN:
raise ValueError("{0} can only be invoked inside rnn block.".format( raise ValueError("{0} can only be invoked inside rnn block.".format(
method)) method))
@autodoc
def reorder_lod_tensor_by_rank(x, rank_table):
helper = LayerHelper('reorder_lod_tensor_by_rank', **locals())
helper.is_instance('x', Variable)
helper.is_instance('rank_table', Variable)
out = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op(
type='reorder_lod_tensor_by_rank',
inputs={'X': [x],
'RankTable': [rank_table]},
outputs={'Out': [out]})
return out
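A hedged usage sketch for the layer registered above (assuming lod_rank_table is also exported by fluid.layers in this version; variable names are illustrative): build a rank table from one LoD input and reorder another LoD tensor by it.

    import paddle.v2.fluid as fluid

    x = fluid.layers.data(name='x', shape=[10], dtype='float32', lod_level=1)
    y = fluid.layers.data(name='y', shape=[10], dtype='float32', lod_level=1)
    table = fluid.layers.lod_rank_table(y, level=0)
    x_sorted = fluid.layers.reorder_lod_tensor_by_rank(x=x, rank_table=table)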
...@@ -13,7 +13,8 @@ __all__ = [ ...@@ -13,7 +13,8 @@ __all__ = [
'crf_decoding', 'cos_sim', 'cross_entropy', 'square_error_cost', 'accuracy', 'crf_decoding', 'cos_sim', 'cross_entropy', 'square_error_cost', 'accuracy',
'chunk_eval', 'sequence_conv', 'conv2d', 'sequence_pool', 'pool2d', 'chunk_eval', 'sequence_conv', 'conv2d', 'sequence_pool', 'pool2d',
'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'sequence_expand', 'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'sequence_expand',
'lstm_unit', 'reduce_sum' 'lstm_unit', 'reduce_sum', 'reduce_mean', 'sequence_first_step',
'sequence_last_step'
] ]
...@@ -41,7 +42,7 @@ def fc(input, ...@@ -41,7 +42,7 @@ def fc(input,
.. math:: .. math::
Out = Act\left({\sum_{i=0}^{N-1}W_iX_i + b}\right) Out = Act({\sum_{i=0}^{N-1}W_iX_i + b})
In the above equation: In the above equation:
...@@ -162,8 +163,9 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'): ...@@ -162,8 +163,9 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'):
Examples: Examples:
.. code-block:: python .. code-block:: python
dict_size = len(dataset.ids)
data = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32') data = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32')
fc = fluid.layers.embedding(input=data, size=16) fc = fluid.layers.embedding(input=data, size=[dict_size, 16])
""" """
helper = LayerHelper('embedding', **locals()) helper = LayerHelper('embedding', **locals())
...@@ -574,9 +576,53 @@ def conv2d(input, ...@@ -574,9 +576,53 @@ def conv2d(input,
def sequence_pool(input, pool_type, **kwargs): def sequence_pool(input, pool_type, **kwargs):
""" """
This function adds the operator for sequence pooling. This function adds the operator for sequence pooling.
This is applied on top of the input using pool_type mentioned It pools features of all time-steps of each instance, and is applied
in the parameters. on top of the input using pool_type mentioned in the parameters.
It supports four pool_type:
- average: :math:`Out[i] = \\frac{\sum_jX_{ij}}{len(X_i)}`
- sum: :math:`Out[i] = \sum_jX_{ij}`
- sqrt: :math:`Out[i] = \\frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}`
- max: :math:`Out[i] = max(X_i)`
.. code-block:: text
x is a 1-level LoDTensor:
x.lod = [[0, 2, 5, 7]]
x.data = [1, 3, 2, 4, 6, 5, 1]
x.dims = [7, 1]
then output is a Tensor:
out.dim = [3, 1]
with condition len(x.lod[-1]) - 1 == out.dims[0]
for different pool_type:
average: out.data = [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
sum : out.data = [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1
sqrt : out.data = [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
max : out.data = [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
Args:
input(variable): The input variable which is a LoDTensor.
pool_type (string): The pooling type of sequence_pool.
It supports average, sum, sqrt and max.
Returns:
The sequence pooling variable which is a Tensor.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[7, 1],
dtype='float32', lod_level=1)
avg_x = fluid.layers.sequence_pool(input=x, pool_type='average')
sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum')
sqrt_x = fluid.layers.sequence_pool(input=x, pool_type='sqrt')
max_x = fluid.layers.sequence_pool(input=x, pool_type='max')
""" """
helper = LayerHelper('sequence_pool', input=input, **kwargs) helper = LayerHelper('sequence_pool', input=input, **kwargs)
dtype = helper.input_dtype() dtype = helper.input_dtype()
...@@ -593,6 +639,72 @@ def sequence_pool(input, pool_type, **kwargs): ...@@ -593,6 +639,72 @@ def sequence_pool(input, pool_type, **kwargs):
return pool_out return pool_out
def sequence_first_step(input, **kwargs):
"""
This function gets the first step of each sequence.
.. code-block:: text
x is a 1-level LoDTensor:
x.lod = [[0, 2, 5, 7]]
x.data = [1, 3, 2, 4, 6, 5, 1]
x.dims = [7, 1]
then output is a Tensor:
out.dim = [3, 1]
with condition len(x.lod[-1]) - 1 == out.dims[0]
out.data = [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
Args:
input(variable): The input variable which is a LoDTensor.
Returns:
The sequence's first step variable which is a Tensor.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[7, 1],
dtype='float32', lod_level=1)
x_first_step = fluid.layers.sequence_first_step(input=x)
"""
return sequence_pool(input=input, pool_type="first")
def sequence_last_step(input, **kwargs):
"""
This function gets the last step of each sequence.
.. code-block:: text
x is a 1-level LoDTensor:
x.lod = [[0, 2, 5, 7]]
x.data = [1, 3, 2, 4, 6, 5, 1]
x.dims = [7, 1]
then output is a Tensor:
out.dim = [3, 1]
with condition len(x.lod[-1]) - 1 == out.dims[0]
out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
Args:
input(variable): The input variable which is a LoDTensor.
Returns:
The sequence's last step variable which is a Tensor.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[7, 1],
dtype='float32', lod_level=1)
x_last_step = fluid.layers.sequence_last_step(input=x)
"""
return sequence_pool(input=input, pool_type="last")
def pool2d(input, def pool2d(input,
pool_size, pool_size,
pool_type, pool_type,
...@@ -1045,3 +1157,47 @@ def reduce_sum(input, dim=None, keep_dim=False): ...@@ -1045,3 +1157,47 @@ def reduce_sum(input, dim=None, keep_dim=False):
'reduce_all': True if dim == None else False 'reduce_all': True if dim == None else False
}) })
return out return out
def reduce_mean(input, dim=None, keep_dim=False):
"""
Computes the mean of tensor elements over the given dimension.
Args:
input (Variable): The input variable which is a Tensor or LoDTensor.
dim (int|None): The dimension along which the mean is computed. If
:attr:`None`, compute the mean over all elements of :attr:`input`
and return a Tensor variable with a single element, otherwise
must be in the range :math:`[-rank(input), rank(input))`. If
:math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
keep_dim (bool): Whether to reserve the reduced dimension in the
output Tensor. The result tensor will have one fewer dimension
than the :attr:`input` unless :attr:`keep_dim` is true.
Returns:
Variable: The reduced Tensor variable.
Examples:
.. code-block:: python
# x is a Tensor variable with following elements:
# [[0.2, 0.3, 0.5, 0.9]
# [0.1, 0.2, 0.6, 0.7]]
# Each example is followed by the corresponding output tensor.
fluid.layers.reduce_mean(x) # [0.4375]
fluid.layers.reduce_mean(x, dim=0) # [0.15, 0.25, 0.55, 0.8]
fluid.layers.reduce_mean(x, dim=-1) # [0.475, 0.4]
fluid.layers.reduce_mean(x, dim=1, keep_dim=True) # [[0.475], [0.4]]
"""
helper = LayerHelper('reduce_mean', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype())
helper.append_op(
type='reduce_mean',
inputs={'X': input},
outputs={'Out': out},
attrs={
'dim': dim if dim != None else 0,
'keep_dim': keep_dim,
'reduce_all': True if dim == None else False
})
return out
...@@ -207,7 +207,7 @@ class Optimizer(object): ...@@ -207,7 +207,7 @@ class Optimizer(object):
optimize_ops = self.create_optimization_pass(params_grads, loss, optimize_ops = self.create_optimization_pass(params_grads, loss,
startup_program) startup_program)
return optimize_ops return optimize_ops, params_grads
class SGDOptimizer(Optimizer): class SGDOptimizer(Optimizer):
......
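Since minimize() now also returns the (param, grad) pairs, callers unpack two values and can pass them straight to the DistributeTranspiler. A minimal sketch with a toy regression network (mirroring the pattern used in the new distributed test script below; the network itself is illustrative):

    import paddle.v2.fluid as fluid

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1)
    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
    avg_cost = fluid.layers.mean(x=cost)

    optimizer = fluid.optimizer.Adam(learning_rate=0.01)
    # minimize() now returns both the optimize op list and the (param, grad) pairs.
    optimize_ops, params_grads = optimizer.minimize(avg_cost)

    t = fluid.DistributeTranspiler()
    t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1)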
...@@ -58,7 +58,9 @@ class ParamAttr(object): ...@@ -58,7 +58,9 @@ class ParamAttr(object):
def to_kwargs(self, with_initializer=False): def to_kwargs(self, with_initializer=False):
kwargs = { kwargs = {
'name': self.name, 'name': self.name,
'learning_rate': self.learning_rate, 'optimize_attr': {
'learning_rate': self.learning_rate
},
'regularizer': self.regularizer, 'regularizer': self.regularizer,
'trainable': self.trainable, 'trainable': self.trainable,
'clip_attr': self.clip 'clip_attr': self.clip
......
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from contextlib import contextmanager from contextlib import contextmanager
import os
__all__ = ['CudaProfiler'] __all__ = ['CudaProfiler']
...@@ -30,17 +31,21 @@ def cuda_profiler(output_file, output_mode=None, config=None): ...@@ -30,17 +31,21 @@ def cuda_profiler(output_file, output_mode=None, config=None):
written into this file. written into this file.
output_mode (string) : The output mode has Key-Value pair format and output_mode (string) : The output mode has Key-Value pair format and
Comma separated values format. It should be 'kvp' or 'csv'. Comma separated values format. It should be 'kvp' or 'csv'.
config (string) : The profiler options and counters can refer to config (list of string) : The profiler options and counters can refer
"Compute Command Line Profiler User Guide". to "Compute Command Line Profiler User Guide".
""" """
if output_mode is None: if output_mode is None:
output_mode = 'csv' output_mode = 'csv'
if output_mode not in ['kvp', 'csv']: if output_mode not in ['kvp', 'csv']:
raise ValueError("The output mode must be 'kvp' or 'csv'.") raise ValueError("The output mode must be 'kvp' or 'csv'.")
config = NVPROF_CONFIG if config is None else config config = NVPROF_CONFIG if config is None else config
core.nvprof_init(output_file, output_mode, config) config_file = 'nvprof_config_file'
with open(config_file, 'wb') as fp:
fp.writelines(["%s\n" % item for item in config])
core.nvprof_init(output_file, output_mode, config_file)
# Enables profiler collection by the active CUDA profiling tool. # Enables profiler collection by the active CUDA profiling tool.
core.nvprof_start() core.nvprof_start()
yield yield
# Disables profiler collection. # Disables profiler collection.
core.nvprof_stop() core.nvprof_stop()
os.remove(config_file)
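With the updated interface, config is a list of profiler option strings; the context manager writes them to a temporary nvprof_config_file, passes that file to core.nvprof_init, and removes it on exit. A minimal usage sketch (the counter names are assumptions drawn from the Compute Command Line Profiler guide, not from this patch):

    import paddle.v2.fluid.profiler as profiler

    with profiler.cuda_profiler('cuda_profiler.txt', 'csv',
                                config=['gpustarttimestamp', 'gpuendtimestamp']):
        pass  # run the CUDA work to be profiled here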
...@@ -8,7 +8,7 @@ import proto.framework_pb2 as framework_pb2 ...@@ -8,7 +8,7 @@ import proto.framework_pb2 as framework_pb2
from framework import OpProtoHolder, Variable, Program, Operator from framework import OpProtoHolder, Variable, Program, Operator
from paddle.v2.fluid.layer_helper import LayerHelper, unique_name from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
__all__ = ['deprecated', 'register_layer'] __all__ = ['deprecated', 'register_layer', 'autodoc']
def _convert_(name): def _convert_(name):
...@@ -175,12 +175,18 @@ def deprecated(func_or_class): ...@@ -175,12 +175,18 @@ def deprecated(func_or_class):
""" """
Wrap func with deprecated warning Wrap func with deprecated warning
""" """
warnings.simplefilter('always', DeprecationWarning) #turn off filter warnings.simplefilter('always', DeprecationWarning) # turn off filter
warnings.warn( warnings.warn(
"Call to deprecated function {}.".format(func.__name__), "Call to deprecated function {}.".format(func.__name__),
category=DeprecationWarning, category=DeprecationWarning,
stacklevel=2) stacklevel=2)
warnings.simplefilter('default', DeprecationWarning) #reset filter warnings.simplefilter('default', DeprecationWarning) # reset filter
return func(*args, **kwargs) return func(*args, **kwargs)
return func_wrapper return func_wrapper
def autodoc(func):
func.__doc__ = _generate_doc_string_(OpProtoHolder.instance().get_op_proto(
func.__name__))
return func
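A sketch of how the new autodoc decorator could be used: applied to a Python wrapper whose name matches a registered operator, it fills the wrapper's __doc__ from the op's proto. The elementwise_add wrapper below is only an illustration, not part of the patch:

    @autodoc
    def elementwise_add(x, y, **kwargs):
        # __doc__ is generated from the registered elementwise_add OpProto.
        pass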
from __future__ import print_function
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import os
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=images,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu")
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu")
predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
optimizer = fluid.optimizer.Adam(learning_rate=0.01)
optimize_ops, params_grads = optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
BATCH_SIZE = 50
PASS_NUM = 3
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=500),
batch_size=BATCH_SIZE)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
t = fluid.DistributeTranspiler()
pserver_endpoints = os.getenv("PSERVERS")
training_role = os.getenv("TRAINING_ROLE",
"TRAINER") # get the training role: trainer/pserver
t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=1)
if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(pserver_endpoints, optimize_ops)
exe.run(fluid.default_startup_program())
exe.run(pserver_prog)
elif training_role == "TRAINER":
feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
exe.run(fluid.default_startup_program())
for pass_id in range(PASS_NUM):
accuracy.reset(exe)
for data in train_reader():
loss, acc = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost] + accuracy.metrics)
pass_acc = accuracy.eval(exe)
# print loss, acc
if loss < 10.0 and pass_acc > 0.9:
                # if the average cost is less than 10.0 and the accuracy is larger than 0.9, we consider the code correct.
exit(0)
pass_acc = accuracy.eval(exe)
print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
else:
print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
exit(1)
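The script above selects its behavior from environment variables; a minimal sketch of setting them before launching a trainer process (the endpoint value is an assumption):

    import os
    os.environ["PSERVERS"] = "127.0.0.1:6174"  # assumed parameter-server endpoint
    os.environ["TRAINING_ROLE"] = "TRAINER"    # or "PSERVER"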
...@@ -33,7 +33,7 @@ def encoder_decoder(): ...@@ -33,7 +33,7 @@ def encoder_decoder():
fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
encoder_out = layers.sequence_pool(input=lstm_hidden0, pool_type="last") encoder_out = layers.sequence_last_step(input=lstm_hidden0)
# decoder # decoder
trg_language_word = layers.data( trg_language_word = layers.data(
......
...@@ -125,10 +125,11 @@ def model(): ...@@ -125,10 +125,11 @@ def model():
# need cos sim # need cos sim
inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features) inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
scale_infer = layers.scale(x=inference, scale=5.0)
label = layers.data(name='score', shape=[1], dtype='float32') label = layers.data(name='score', shape=[1], dtype='float32')
square_cost = layers.square_error_cost(input=inference, label=label) square_cost = layers.square_error_cost(input=scale_infer, label=label)
avg_cost = layers.mean(x=square_cost) avg_cost = layers.mean(x=square_cost)
...@@ -141,7 +142,7 @@ def main(): ...@@ -141,7 +142,7 @@ def main():
opts = sgd_optimizer.minimize(cost) opts = sgd_optimizer.minimize(cost)
if USE_GPU: if USE_GPU:
place = core.GPUPlace(0) place = core.CUDAPlace(0)
else: else:
place = core.CPUPlace() place = core.CPUPlace()
......
...@@ -90,12 +90,10 @@ def get_numeric_gradient(scope, ...@@ -90,12 +90,10 @@ def get_numeric_gradient(scope,
def product(dim): def product(dim):
return reduce(lambda a, b: a * b, dim, 1) return reduce(lambda a, b: a * b, dim, 1)
ctx = core.DeviceContext.create(core.CPUPlace())
def get_output(): def get_output():
sum = [] sum = []
for output_name in output_names: for output_name in output_names:
op.run(scope, ctx) op.run(scope, core.CPUPlace())
sum.append( sum.append(
np.array(scope.find_var(output_name).get_tensor()).mean()) np.array(scope.find_var(output_name).get_tensor()).mean())
return np.array(sum).mean() return np.array(sum).mean()
...@@ -318,7 +316,7 @@ class OpTest(unittest.TestCase): ...@@ -318,7 +316,7 @@ class OpTest(unittest.TestCase):
def check_output(self, atol=1e-5): def check_output(self, atol=1e-5):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu() and core.op_support_gpu(self.op_type): if core.is_compile_gpu() and core.op_support_gpu(self.op_type):
places.append(core.GPUPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
self.check_output_with_place(place, atol) self.check_output_with_place(place, atol)
...@@ -381,7 +379,7 @@ class OpTest(unittest.TestCase): ...@@ -381,7 +379,7 @@ class OpTest(unittest.TestCase):
"Gradient Check On %s" % str(cpu_place)) "Gradient Check On %s" % str(cpu_place))
if core.is_compile_gpu() and self.op.support_gpu(): if core.is_compile_gpu() and self.op.support_gpu():
gpu_place = core.GPUPlace(0) gpu_place = core.CUDAPlace(0)
gpu_analytic_grads = self._get_gradient(inputs_to_check, gpu_place, gpu_analytic_grads = self._get_gradient(inputs_to_check, gpu_place,
output_names, no_grad_set) output_names, no_grad_set)
......
...@@ -113,8 +113,7 @@ class TestSparseAdagradOp(unittest.TestCase): ...@@ -113,8 +113,7 @@ class TestSparseAdagradOp(unittest.TestCase):
LearningRate='LearningRate', LearningRate='LearningRate',
epsilon=2.0) epsilon=2.0)
ctx = core.DeviceContext.create(place) adagrad_op.run(scope, place)
adagrad_op.run(scope, ctx)
# get and compare moment result # get and compare moment result
moment_result_array = np.array(moment) moment_result_array = np.array(moment)
...@@ -168,7 +167,7 @@ class TestSparseAdagradOp(unittest.TestCase): ...@@ -168,7 +167,7 @@ class TestSparseAdagradOp(unittest.TestCase):
def test_sparse_adagrad(self): def test_sparse_adagrad(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu(): if core.is_compile_gpu():
places.append(core.GPUPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
self.check_with_place(place) self.check_with_place(place)
......
...@@ -208,7 +208,7 @@ class TestBatchNormOp(OpTest): ...@@ -208,7 +208,7 @@ class TestBatchNormOp(OpTest):
print 'python: NHWC, NCHW, backward checking passed' print 'python: NHWC, NCHW, backward checking passed'
def test_forward_backward(self): def test_forward_backward(self):
def test_with_place(place, tensor_format, shape): def test_with_place(place, data_layout, shape):
# attr # attr
epsilon = 0.00001 epsilon = 0.00001
momentum = 0.9 momentum = 0.9
...@@ -292,12 +292,11 @@ class TestBatchNormOp(OpTest): ...@@ -292,12 +292,11 @@ class TestBatchNormOp(OpTest):
SavedVariance="saved_variance", SavedVariance="saved_variance",
# attrs # attrs
is_test=False, is_test=False,
tensor_format=tensor_format, data_layout=data_layout,
momentum=momentum, momentum=momentum,
epsilon=epsilon) epsilon=epsilon)
ctx = core.DeviceContext.create(place) batch_norm_op.run(scope, place)
batch_norm_op.run(scope, ctx)
# check forward result # check forward result
self.__assert_close(y_tensor, y_out, "y_out") self.__assert_close(y_tensor, y_out, "y_out")
...@@ -305,13 +304,13 @@ class TestBatchNormOp(OpTest): ...@@ -305,13 +304,13 @@ class TestBatchNormOp(OpTest):
self.__assert_close(saved_variance_tensor, saved_variance, self.__assert_close(saved_variance_tensor, saved_variance,
"saved_variance") "saved_variance")
self.__assert_close(mean_out_tensor, mean_out, "mean_out") self.__assert_close(mean_out_tensor, mean_out, "mean_out")
if isinstance(place, core.GPUPlace): if isinstance(place, core.CUDAPlace):
atol = 5e-2 atol = 5e-2
else: else:
atol = 1e-4 atol = 1e-4
self.__assert_close(variance_out_tensor, variance_out, self.__assert_close(variance_out_tensor, variance_out,
"variance_out", atol) "variance_out", atol)
print "op test forward passed: ", str(place), tensor_format print "op test forward passed: ", str(place), data_layout
# run backward # run backward
batch_norm_op_grad = get_backward_op(scope, batch_norm_op, set()) batch_norm_op_grad = get_backward_op(scope, batch_norm_op, set())
...@@ -320,7 +319,7 @@ class TestBatchNormOp(OpTest): ...@@ -320,7 +319,7 @@ class TestBatchNormOp(OpTest):
["y_out", "mean", "variance", "saved_mean", "saved_variance"], ["y_out", "mean", "variance", "saved_mean", "saved_variance"],
place, place,
feed_dict={"y_out": y_grad}) feed_dict={"y_out": y_grad})
batch_norm_op_grad.run(scope, ctx) batch_norm_op_grad.run(scope, place)
x_grad_tensor = create_or_get_tensor(scope, x_grad_tensor = create_or_get_tensor(scope,
grad_var_name("x_val"), None, grad_var_name("x_val"), None,
...@@ -336,11 +335,15 @@ class TestBatchNormOp(OpTest): ...@@ -336,11 +335,15 @@ class TestBatchNormOp(OpTest):
self.__assert_close(x_grad_tensor, x_grad_ref, "x_grad") self.__assert_close(x_grad_tensor, x_grad_ref, "x_grad")
self.__assert_close(scale_grad_tensor, scale_grad_ref, "scale_grad") self.__assert_close(scale_grad_tensor, scale_grad_ref, "scale_grad")
self.__assert_close(bias_grad_tensor, bias_grad_ref, "bias_grad") self.__assert_close(bias_grad_tensor, bias_grad_ref, "bias_grad")
print "op test backward passed: ", str(place), tensor_format print "op test backward passed: ", str(place), data_layout
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu() and core.op_support_gpu("batch_norm"): if core.is_compile_gpu() and core.op_support_gpu("batch_norm"):
places.append(core.GPUPlace(0)) places.append(core.CUDAPlace(0))
core.init_devices(["CPU", "GPU:0"])
else:
core.init_devices(["CPU"])
for place in places: for place in places:
for data_format in ["NCHW", "NHWC"]: for data_format in ["NCHW", "NHWC"]:
test_with_place(place, data_format, [2, 3, 4, 5]) test_with_place(place, data_format, [2, 3, 4, 5])
......
...@@ -57,8 +57,7 @@ class TestBeamSearchDecodeOp(unittest.TestCase): ...@@ -57,8 +57,7 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
SentenceIds="sentence_ids", SentenceIds="sentence_ids",
SentenceScores="sentence_scores") SentenceScores="sentence_scores")
ctx = core.DeviceContext.create(self.cpu_place) beam_search_decode_op.run(self.scope, self.cpu_place)
beam_search_decode_op.run(self.scope, ctx)
expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]] expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]]
self.assertEqual(sentence_ids.lod(), expected_lod) self.assertEqual(sentence_ids.lod(), expected_lod)
......
...@@ -14,7 +14,6 @@ def create_tensor(scope, name, np_data): ...@@ -14,7 +14,6 @@ def create_tensor(scope, name, np_data):
class BeamSearchOpTester(unittest.TestCase): class BeamSearchOpTester(unittest.TestCase):
def setUp(self): def setUp(self):
self.scope = core.Scope() self.scope = core.Scope()
self.ctx = core.DeviceContext.create(core.CPUPlace())
self._create_ids() self._create_ids()
self._create_scores() self._create_scores()
self._create_pre_ids() self._create_pre_ids()
...@@ -32,7 +31,7 @@ class BeamSearchOpTester(unittest.TestCase): ...@@ -32,7 +31,7 @@ class BeamSearchOpTester(unittest.TestCase):
level=0, level=0,
beam_size=2, beam_size=2,
end_id=0, ) end_id=0, )
op.run(self.scope, self.ctx) op.run(self.scope, core.CPUPlace())
selected_ids = self.scope.find_var("selected_ids").get_tensor() selected_ids = self.scope.find_var("selected_ids").get_tensor()
print 'selected_ids', np.array(selected_ids) print 'selected_ids', np.array(selected_ids)
print 'lod', selected_ids.lod() print 'lod', selected_ids.lod()
......
...@@ -65,8 +65,7 @@ class TestCondOp(unittest.TestCase): ...@@ -65,8 +65,7 @@ class TestCondOp(unittest.TestCase):
self.create_global_variables() self.create_global_variables()
self.create_cond_op() self.create_cond_op()
self.create_sub_net() self.create_sub_net()
ctx = core.DeviceContext.create(core.CPUPlace()) self.condop.run(self.scope, core.CPUPlace())
self.condop.run(self.scope, ctx)
return np.array(self.scope.find_var("Out").get_tensor()) return np.array(self.scope.find_var("Out").get_tensor())
def create_global_variables(self): def create_global_variables(self):
......
...@@ -63,8 +63,7 @@ class TestDynRNN(unittest.TestCase): ...@@ -63,8 +63,7 @@ class TestDynRNN(unittest.TestCase):
all_timesteps = fluid.layers.array_to_lod_tensor( all_timesteps = fluid.layers.array_to_lod_tensor(
x=out, table=rank_table) x=out, table=rank_table)
last = fluid.layers.sequence_pool( last = fluid.layers.sequence_last_step(input=all_timesteps)
input=all_timesteps, pool_type='last')
logits = fluid.layers.fc(input=last, size=1, act=None) logits = fluid.layers.fc(input=last, size=1, act=None)
loss = fluid.layers.sigmoid_cross_entropy_with_logits( loss = fluid.layers.sigmoid_cross_entropy_with_logits(
x=logits, label=label) x=logits, label=label)
...@@ -101,7 +100,7 @@ class TestDynRNN(unittest.TestCase): ...@@ -101,7 +100,7 @@ class TestDynRNN(unittest.TestCase):
rnn.update_memory(mem, out_) rnn.update_memory(mem, out_)
rnn.output(out_) rnn.output(out_)
last = fluid.layers.sequence_pool(input=rnn(), pool_type='last') last = fluid.layers.sequence_last_step(input=rnn())
logits = fluid.layers.fc(input=last, size=1, act=None) logits = fluid.layers.fc(input=last, size=1, act=None)
label = fluid.layers.data(name='label', shape=[1], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='float32')
loss = fluid.layers.sigmoid_cross_entropy_with_logits( loss = fluid.layers.sigmoid_cross_entropy_with_logits(
......
import unittest import unittest
import numpy
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.op import Operator from paddle.v2.fluid.op import Operator
import numpy from paddle.v2.fluid.executor import Executor
class TestGaussianRandomOp(unittest.TestCase): class TestGaussianRandomOp(unittest.TestCase):
def setUp(self):
self.op_type = "gaussian_random"
self.inputs = {}
self.attrs = {"shape": [1000, 784], "mean": .0, "std": 1., "seed": 10}
self.outputs = ["Out"]
def test_cpu(self): def test_cpu(self):
self.gaussian_random_test(place=core.CPUPlace()) self.gaussian_random_test(place=fluid.CPUPlace())
def test_gpu(self): def test_gpu(self):
if core.is_compile_gpu(): if core.is_compile_gpu():
self.gaussian_random_test(place=core.GPUPlace(0)) self.gaussian_random_test(place=fluid.CUDAPlace(0))
def gaussian_random_test(self, place): def gaussian_random_test(self, place):
scope = core.Scope()
scope.var('Out').get_tensor() program = fluid.Program()
block = program.global_block()
op = Operator( vout = block.create_var(name="Out")
"gaussian_random", op = block.append_op(
Out='Out', type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
shape=[1000, 784],
mean=.0, op.desc.infer_var_type(block.desc)
std=1., op.desc.infer_shape(block.desc)
seed=10)
fetch_list = []
context = core.DeviceContext.create(place) for var_name in self.outputs:
op.run(scope, context) fetch_list.append(block.var(var_name))
tensor = numpy.array(scope.find_var('Out').get_tensor())
exe = Executor(place)
outs = exe.run(program, fetch_list=fetch_list)
tensor = outs[0]
self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1) self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1)
self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1) self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1)
......
...@@ -33,8 +33,7 @@ class TestIsEmptyOp(unittest.TestCase): ...@@ -33,8 +33,7 @@ class TestIsEmptyOp(unittest.TestCase):
def one_case(self, input, target): def one_case(self, input, target):
op = Operator(type="is_empty", X=input, Out="out") op = Operator(type="is_empty", X=input, Out="out")
ctx = core.DeviceContext.create(core.CPUPlace()) op.run(self.scope, core.CPUPlace())
op.run(self.scope, ctx)
out = self.scope.var("out").get_tensor() out = self.scope.var("out").get_tensor()
self.assertEqual(np.array(out)[0], target) self.assertEqual(np.array(out)[0], target)
......
...@@ -27,7 +27,7 @@ class TestOptimizer(unittest.TestCase): ...@@ -27,7 +27,7 @@ class TestOptimizer(unittest.TestCase):
block.append_op( block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
opts = sgd_optimizer.minimize(mean_out, init_program) opts, _ = sgd_optimizer.minimize(mean_out, init_program)
self.assertEqual(len(opts), 1) self.assertEqual(len(opts), 1)
sgd_op = opts[0] sgd_op = opts[0]
self.assertEqual(sgd_op.type, "sgd") self.assertEqual(sgd_op.type, "sgd")
...@@ -57,7 +57,7 @@ class TestOptimizer(unittest.TestCase): ...@@ -57,7 +57,7 @@ class TestOptimizer(unittest.TestCase):
learning_rate = 0.01 learning_rate = 0.01
sgd_optimizer = optimizer.SGDOptimizer( sgd_optimizer = optimizer.SGDOptimizer(
learning_rate=learning_rate, global_step=global_step) learning_rate=learning_rate, global_step=global_step)
opts = sgd_optimizer.minimize(mean_out, init_program) opts, _ = sgd_optimizer.minimize(mean_out, init_program)
self.assertEqual(len(opts), 2) self.assertEqual(len(opts), 2)
sgd_op = opts[0] sgd_op = opts[0]
self.assertEqual(sgd_op.type, "sgd") self.assertEqual(sgd_op.type, "sgd")
......
...@@ -3,6 +3,7 @@ import numpy as np ...@@ -3,6 +3,7 @@ import numpy as np
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import paddle.v2.fluid.profiler as profiler import paddle.v2.fluid.profiler as profiler
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
import os
class TestProfiler(unittest.TestCase): class TestProfiler(unittest.TestCase):
...@@ -14,14 +15,16 @@ class TestProfiler(unittest.TestCase): ...@@ -14,14 +15,16 @@ class TestProfiler(unittest.TestCase):
data = layers.data(name='data', shape=[3, 28, 28], dtype='float32') data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
place = fluid.GPUPlace(0) place = fluid.CUDAPlace(0)
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: output_file = 'cuda_profiler.txt'
with profiler.cuda_profiler(output_file, 'csv') as nvprof:
for i in range(epoc): for i in range(epoc):
input = np.random.random(dshape).astype('float32') input = np.random.random(dshape).astype('float32')
exe.run(fluid.default_main_program(), feed={'data': input}) exe.run(fluid.default_main_program(), feed={'data': input})
os.remove(output_file)
if __name__ == '__main__': if __name__ == '__main__':
......
import unittest
import paddle.v2.fluid as fluid
import numpy
class TestReorderLoDTensor(unittest.TestCase):
def test_reorder(self):
dat = fluid.layers.data(name='input', shape=[1], lod_level=2)
dat.stop_gradient = False
rank_dat = fluid.layers.data(name='ref', shape=[1], lod_level=1)
table = fluid.layers.lod_rank_table(rank_dat)
new_dat = fluid.layers.reorder_lod_tensor_by_rank(
x=dat, rank_table=table)
loss = fluid.layers.mean(x=new_dat)
fluid.backward.append_backward_ops(loss=loss)
cpu = fluid.CPUPlace()
exe = fluid.Executor(cpu)
exe.run(fluid.default_startup_program())
ref = fluid.Tensor()
ref_lod = [0, 3, 4, 7, 8, 14]
ref.set_lod([ref_lod])
ref.set(numpy.random.random(size=[14, 1]).astype('float32'), cpu)
input = fluid.Tensor()
lod_level_0 = numpy.random.randint(low=1, high=5, size=5)
lod_level_0 = [0] + numpy.cumsum(lod_level_0).tolist()
lod_level_1 = numpy.random.randint(low=1, high=5, size=lod_level_0[-1])
lod_level_1 = [0] + numpy.cumsum(lod_level_1).tolist()
input.set_lod([lod_level_0, lod_level_1])
input.set(
numpy.random.random(size=[lod_level_1[-1], 1]).astype('float32'),
cpu)
ig = exe.run(fluid.default_main_program(),
feed={'input': input,
'ref': ref},
fetch_list=['input@GRAD'],
return_numpy=False)[0]
self.assertAlmostEqual(numpy.array(ig).sum(), 1.0, delta=0.001)
self.assertEqual(input.lod(), ig.lod())
if __name__ == '__main__':
unittest.main()
...@@ -55,8 +55,7 @@ class TestSparseSGDOp(unittest.TestCase): ...@@ -55,8 +55,7 @@ class TestSparseSGDOp(unittest.TestCase):
Grad='Grad', Grad='Grad',
ParamOut='Param', ParamOut='Param',
LearningRate='LearningRate') LearningRate='LearningRate')
ctx = core.DeviceContext.create(place) sgd_op.run(scope, place)
sgd_op.run(scope, ctx)
# get and compare result # get and compare result
result_array = np.array(param) result_array = np.array(param)
...@@ -79,7 +78,7 @@ class TestSparseSGDOp(unittest.TestCase): ...@@ -79,7 +78,7 @@ class TestSparseSGDOp(unittest.TestCase):
def test_sparse_sgd(self): def test_sparse_sgd(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu(): if core.is_compile_gpu():
places.append(core.GPUPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
self.check_with_place(place) self.check_with_place(place)
......
import unittest import unittest
import numpy
from paddle.v2.fluid.op import Operator from paddle.v2.fluid.op import Operator
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
import numpy import paddle.v2.fluid as fluid
class TestUniformRandomOp(unittest.TestCase): class TestUniformRandomOp(unittest.TestCase):
def test_uniform_random_cpu(self): def setUp(self):
self.op_type = "uniform_random"
self.inputs = {}
self.attrs = {
"shape": [1000, 784],
"min": -5.0,
"max": 10.0,
"seed": 10
}
self.outputs = ["Out"]
def test_cpu(self):
self.uniform_random_test(place=core.CPUPlace()) self.uniform_random_test(place=core.CPUPlace())
def test_uniform_random_gpu(self): def test_gpu(self):
if core.is_compile_gpu(): if core.is_compile_gpu():
self.uniform_random_test(place=core.GPUPlace(0)) self.uniform_random_test(place=core.CUDAPlace(0))
def uniform_random_test(self, place): def uniform_random_test(self, place):
scope = core.Scope() program = fluid.Program()
scope.var('X').get_tensor() block = program.global_block()
vout = block.create_var(name="Out")
op = Operator( op = block.append_op(
"uniform_random", type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
Out='X',
shape=[1000, 784], op.desc.infer_var_type(block.desc)
min=-5.0, op.desc.infer_shape(block.desc)
max=10.0,
seed=10) fetch_list = []
for var_name in self.outputs:
ctx = core.DeviceContext.create(place) fetch_list.append(block.var(var_name))
op.run(scope, ctx)
tensor = numpy.array(scope.find_var('X').get_tensor()) exe = fluid.Executor(place)
outs = exe.run(program, fetch_list=fetch_list)
tensor = outs[0]
self.assertAlmostEqual(tensor.mean(), 2.5, delta=0.1) self.assertAlmostEqual(tensor.mean(), 2.5, delta=0.1)
......
...@@ -79,8 +79,7 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: ...@@ -79,8 +79,7 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
# the prefix is sys.prefix which should always be usr # the prefix is sys.prefix which should always be usr
paddle_bin_dir = 'opt/paddle/bin' paddle_bin_dir = 'opt/paddle/bin'
paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle_usage', paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
'${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
'${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model', '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
'${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main', '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main',
'${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
......