提交 cdd55dbc 编写于 作者: Q qiaolongfei

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add-merge-splited-ids

...@@ -180,7 +180,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, ...@@ -180,7 +180,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
print_train_time(start_time, time.time(), num_samples) print_train_time(start_time, time.time(), num_samples)
print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))), print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
# evaluation # evaluation
if not args.no_test and batch_acc: if not args.no_test and batch_acc and not args.use_reader_op:
pass_test_acc = test(exe, infer_prog, test_reader, feeder, pass_test_acc = test(exe, infer_prog, test_reader, feeder,
batch_acc) batch_acc)
print(", Test Accuracy: %f" % pass_test_acc) print(", Test Accuracy: %f" % pass_test_acc)
...@@ -277,11 +277,12 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, ...@@ -277,11 +277,12 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
batch_id += 1 batch_id += 1
print_train_time(start_time, time.time(), num_samples) print_train_time(start_time, time.time(), num_samples)
if not args.no_test and batch_acc: if not args.no_test and batch_acc and not args.use_reader_op:
# we have not implemented record io for test yet,
# so skip the test when args.use_reader_op is set
test_acc = test(startup_exe, infer_prog, test_reader, feeder, test_acc = test(startup_exe, infer_prog, test_reader, feeder,
batch_acc) batch_acc)
print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc)) print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
exit(0)
def print_arguments(args): def print_arguments(args):
......
...@@ -199,7 +199,10 @@ def get_model(args): ...@@ -199,7 +199,10 @@ def get_model(args):
batched_train_reader = paddle.batch( batched_train_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
train_reader, buf_size=5120), train_reader, buf_size=5120),
batch_size=args.batch_size * args.gpus) batch_size=args.batch_size * args.gpus,
batched_test_reader = paddle.batch(train_reader, batch_size=args.batch_size) drop_last=True)
batched_test_reader = paddle.batch(
train_reader, batch_size=args.batch_size, drop_last=True)
return avg_cost, inference_program, optimizer, batched_train_reader, batched_test_reader, batch_acc return avg_cost, inference_program, optimizer, batched_train_reader,\
batched_test_reader, batch_acc
#!/bin/bash #!/bin/bash
python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst python gen_doc.py layers --submodules control_flow device io nn ops tensor detection > layers.rst
for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
do do
......
...@@ -59,21 +59,3 @@ get_inference_program ...@@ -59,21 +59,3 @@ get_inference_program
.. autofunction:: paddle.fluid.io.get_inference_program .. autofunction:: paddle.fluid.io.get_inference_program
:noindex: :noindex:
save_checkpoint
---------------
.. autofunction:: paddle.fluid.io.save_checkpoint
:noindex:
load_checkpoint
---------------
.. autofunction:: paddle.fluid.io.load_checkpoint
:noindex:
clean_checkpoint
----------------
.. autofunction:: paddle.fluid.io.clean_checkpoint
:noindex:
...@@ -181,12 +181,6 @@ Print ...@@ -181,12 +181,6 @@ Print
.. autofunction:: paddle.fluid.layers.Print .. autofunction:: paddle.fluid.layers.Print
:noindex: :noindex:
is_empty
--------
.. autofunction:: paddle.fluid.layers.is_empty
:noindex:
device device
====== ======
...@@ -261,19 +255,6 @@ double_buffer ...@@ -261,19 +255,6 @@ double_buffer
.. autofunction:: paddle.fluid.layers.double_buffer .. autofunction:: paddle.fluid.layers.double_buffer
:noindex: :noindex:
random_data_generator
---------------------
.. autofunction:: paddle.fluid.layers.random_data_generator
:noindex:
Preprocessor
------------
.. autoclass:: paddle.fluid.layers.Preprocessor
:members:
:noindex:
nn nn
== ==
...@@ -613,30 +594,6 @@ roi_pool ...@@ -613,30 +594,6 @@ roi_pool
.. autofunction:: paddle.fluid.layers.roi_pool .. autofunction:: paddle.fluid.layers.roi_pool
:noindex: :noindex:
dice_loss
---------
.. autofunction:: paddle.fluid.layers.dice_loss
:noindex:
resize_bilinear
---------------
.. autofunction:: paddle.fluid.layers.resize_bilinear
:noindex:
gather
------
.. autofunction:: paddle.fluid.layers.gather
:noindex:
random_crop
-----------
.. autofunction:: paddle.fluid.layers.random_crop
:noindex:
ops ops
=== ===
...@@ -784,12 +741,6 @@ sum ...@@ -784,12 +741,6 @@ sum
.. autofunction:: paddle.fluid.layers.sum .. autofunction:: paddle.fluid.layers.sum
:noindex: :noindex:
shape
-----
.. autofunction:: paddle.fluid.layers.shape
:noindex:
sigmoid sigmoid
------- -------
...@@ -1039,3 +990,54 @@ zeros ...@@ -1039,3 +990,54 @@ zeros
.. autofunction:: paddle.fluid.layers.zeros .. autofunction:: paddle.fluid.layers.zeros
:noindex: :noindex:
detection
=========
multi_box_head
--------------
.. autofunction:: paddle.fluid.layers.multi_box_head
:noindex:
bipartite_match
---------------
.. autofunction:: paddle.fluid.layers.bipartite_match
:noindex:
target_assign
-------------
.. autofunction:: paddle.fluid.layers.target_assign
:noindex:
detection_output
----------------
.. autofunction:: paddle.fluid.layers.detection_output
:noindex:
ssd_loss
--------
.. autofunction:: paddle.fluid.layers.ssd_loss
:noindex:
detection_map
-------------
.. autofunction:: paddle.fluid.layers.detection_map
:noindex:
iou_similarity
--------------
.. autofunction:: paddle.fluid.layers.iou_similarity
:noindex:
box_coder
---------
.. autofunction:: paddle.fluid.layers.box_coder
:noindex:
...@@ -89,13 +89,6 @@ DecayedAdagradOptimizer ...@@ -89,13 +89,6 @@ DecayedAdagradOptimizer
:members: :members:
:noindex: :noindex:
RMSPropOptimizer
----------------
.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
:members:
:noindex:
Adadelta Adadelta
-------- --------
......
...@@ -23,15 +23,3 @@ profiler ...@@ -23,15 +23,3 @@ profiler
.. autofunction:: paddle.fluid.profiler.profiler .. autofunction:: paddle.fluid.profiler.profiler
:noindex: :noindex:
start_profiler
--------------
.. autofunction:: paddle.fluid.profiler.start_profiler
:noindex:
stop_profiler
-------------
.. autofunction:: paddle.fluid.profiler.stop_profiler
:noindex:
...@@ -84,7 +84,7 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) ...@@ -84,7 +84,7 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else() else()
......
...@@ -35,14 +35,15 @@ class ReaderBase { ...@@ -35,14 +35,15 @@ class ReaderBase {
class DecoratedReader : public ReaderBase { class DecoratedReader : public ReaderBase {
public: public:
explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) { explicit DecoratedReader(const std::shared_ptr<ReaderBase>& reader)
: ReaderBase(), reader_(reader) {
PADDLE_ENFORCE_NOT_NULL(reader_); PADDLE_ENFORCE_NOT_NULL(reader_);
} }
void ReInit() override { reader_->ReInit(); } void ReInit() override { reader_->ReInit(); }
protected: protected:
ReaderBase* reader_; std::shared_ptr<ReaderBase> reader_;
}; };
class FileReader : public ReaderBase { class FileReader : public ReaderBase {
...@@ -64,7 +65,7 @@ class ReaderHolder { ...@@ -64,7 +65,7 @@ class ReaderHolder {
public: public:
void Reset(ReaderBase* reader) { reader_.reset(reader); } void Reset(ReaderBase* reader) { reader_.reset(reader); }
ReaderBase* Get() const { return reader_.get(); } std::shared_ptr<ReaderBase> Get() const { return reader_; }
void ReadNext(std::vector<LoDTensor>* out) { void ReadNext(std::vector<LoDTensor>* out) {
PADDLE_ENFORCE_NOT_NULL(reader_); PADDLE_ENFORCE_NOT_NULL(reader_);
...@@ -76,7 +77,7 @@ class ReaderHolder { ...@@ -76,7 +77,7 @@ class ReaderHolder {
} }
private: private:
std::unique_ptr<ReaderBase> reader_; std::shared_ptr<ReaderBase> reader_;
}; };
} // namespace framework } // namespace framework
......
...@@ -19,10 +19,17 @@ limitations under the License. */ ...@@ -19,10 +19,17 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using batch_norm_bwd = mkldnn::batch_normalization_backward;
using batch_norm_fwd = mkldnn::batch_normalization_forward;
using framework::DataLayout;
using framework::Tensor;
using mkldnn::memory;
using mkldnn::primitive;
using mkldnn::reorder;
using mkldnn::stream;
using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::MKLDNNMemDesc; using paddle::platform::MKLDNNMemDesc;
using mkldnn::memory; using platform::to_void_cast;
template <typename T> template <typename T>
using EigenArrayMap = using EigenArrayMap =
...@@ -64,21 +71,12 @@ void run_batch_norm_op(Args &&... args) { ...@@ -64,21 +71,12 @@ void run_batch_norm_op(Args &&... args) {
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
} }
template <typename T>
inline void *cast_const_to_void(const T *t) {
return static_cast<void *>(const_cast<T *>(t));
}
} // namespace } // namespace
template <typename T> template <typename T>
class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> { class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
auto data_layout_str = ctx.Attr<std::string>("data_layout");
auto data_layout = framework::StringToDataLayout(data_layout_str);
PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
"MKLDNN batch normalization handles only NCHW data layout");
const float epsilon = ctx.Attr<float>("epsilon"); const float epsilon = ctx.Attr<float>("epsilon");
const float momentum = ctx.Attr<float>("momentum"); const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test"); const bool is_test = ctx.Attr<bool>("is_test");
...@@ -99,41 +97,53 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -99,41 +97,53 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const auto *scale = ctx.Input<Tensor>("Scale"); const auto *scale = ctx.Input<Tensor>("Scale");
const auto *shift = ctx.Input<Tensor>("Bias"); const auto *shift = ctx.Input<Tensor>("Bias");
y->mutable_data<T>(ctx.GetPlace()); PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
mean_out->mutable_data<T>(ctx.GetPlace()); x->format() != memory::format::format_undef,
variance_out->mutable_data<T>(ctx.GetPlace()); "Wrong layout/format set for Input x tensor");
const T *x_data = x->data<T>();
const T *mean_data = mean->data<T>();
const T *variance_data = variance->data<T>();
T *y_data = y->mutable_data<T>(ctx.GetPlace());
T *mean_out_data = mean_out->mutable_data<T>(ctx.GetPlace());
T *variance_out_data = variance_out->mutable_data<T>(ctx.GetPlace());
T *batch_mean_data = nullptr;
T *batch_variance_data = nullptr;
if (!is_test) { if (!is_test) {
batch_mean->mutable_data<T>(ctx.GetPlace()); batch_mean_data = batch_mean->mutable_data<T>(ctx.GetPlace());
batch_variance->mutable_data<T>(ctx.GetPlace()); batch_variance_data = batch_variance->mutable_data<T>(ctx.GetPlace());
} }
auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring
: mkldnn::prop_kind::forward_training; : mkldnn::prop_kind::forward_training;
auto dims = paddle::framework::vectorize2int(x->dims()); auto src_tz = paddle::framework::vectorize2int(x->dims());
auto scale_tz = paddle::framework::vectorize2int(scale->dims());
auto src_md = PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1");
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); const unsigned int ic = scale_tz[0];
auto dst_md =
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
auto src_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
auto dst_pd = mkldnn::memory::primitive_desc{dst_md, mkldnn_engine};
auto src = mkldnn::memory{src_pd, cast_const_to_void(x->data<T>())};
auto dst = mkldnn::memory{dst_pd, y->data<T>()};
unsigned flags = mkldnn::use_scale_shift; unsigned flags = mkldnn::use_scale_shift;
if (is_test) flags |= mkldnn::use_global_stats; if (is_test) flags |= mkldnn::use_global_stats;
// create mkldnn memory from input x tensor
auto src_memory =
memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
to_void_cast(x_data));
// create primitive descriptor for batch norm forward
using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>; using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
auto batch_norm_fwd_desc = auto batch_norm_fwd_desc = bn_fwd_types::op_desc{
bn_fwd_types::op_desc{propagation, src_md, epsilon, flags}; propagation, src_memory.get_primitive_desc().desc(), epsilon, flags};
auto batch_norm_fwd_pd = std::shared_ptr<batch_norm_fwd::primitive_desc> batch_norm_fwd_pd =
bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine}; std::shared_ptr<batch_norm_fwd::primitive_desc>(
new batch_norm_fwd::primitive_desc(batch_norm_fwd_desc,
mkldnn_engine));
const unsigned int ic = dims[1]; // Save the pd to be used in backward pass
const std::string key = ctx.op().Output("SavedMean");
const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd);
// MKLDNN requires a single piece of memory for scale and shift/bias data // MKLDNN requires a single piece of memory for scale and shift/bias data
const size_t scaleshift_size = 2 * ic; const size_t scaleshift_size = 2 * ic;
...@@ -143,73 +153,58 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -143,73 +153,58 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(), copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
shift->data<T>() + ic, &scaleshift_data); shift->data<T>() + ic, &scaleshift_data);
auto scaleshift_memory = mkldnn::memory{ // create mkldnn memory for weights(scale/shift)
batch_norm_fwd_pd.weights_primitive_desc(), scaleshift_data.data()}; auto scaleshift_memory = memory(batch_norm_fwd_pd->weights_primitive_desc(),
scaleshift_data.data());
if (is_test) { // create mkldnn memory for output y tensor
auto mean_memory = mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(), auto dst_memory = memory(batch_norm_fwd_pd->dst_primitive_desc(), y_data);
cast_const_to_void(mean->data<T>())};
if (is_test) {
// create mkldnn memory for stats (as input)
auto mean_memory = memory(batch_norm_fwd_pd->mean_primitive_desc(),
to_void_cast(mean_data));
auto variance_memory = auto variance_memory =
mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(), memory(batch_norm_fwd_pd->variance_primitive_desc(),
cast_const_to_void(variance->data<T>())}; to_void_cast(variance_data));
run_batch_norm_op<typename bn_fwd_types::op_type>( run_batch_norm_op<typename bn_fwd_types::op_type>(
batch_norm_fwd_pd, src, (const mkldnn::primitive::at &)mean_memory, *batch_norm_fwd_pd, src_memory,
(const mkldnn::primitive::at &)mean_memory,
(const mkldnn::primitive::at &)variance_memory, scaleshift_memory, (const mkldnn::primitive::at &)variance_memory, scaleshift_memory,
dst); dst_memory);
} else { } else {
// create mkldnn memory for stats (as output)
auto mean_memory = auto mean_memory =
mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(), memory(batch_norm_fwd_pd->mean_primitive_desc(), batch_mean_data);
cast_const_to_void(batch_mean->data<T>())}; auto variance_memory = memory(
batch_norm_fwd_pd->variance_primitive_desc(), batch_variance_data);
auto variance_memory =
mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(),
cast_const_to_void(batch_variance->data<T>())};
run_batch_norm_op<bn_fwd_types::op_type>(batch_norm_fwd_pd, src, run_batch_norm_op<bn_fwd_types::op_type>(*batch_norm_fwd_pd, src_memory,
scaleshift_memory, dst, scaleshift_memory, dst_memory,
mean_memory, variance_memory); mean_memory, variance_memory);
} }
if (!is_test) { if (!is_test) {
const unsigned int in = dims[0]; // mkldnn only computes stats for the current batch
const unsigned int sample_size = x->numel() / in / ic; // so we need to compute momentum stats via Eigen lib
EigenVectorArrayMap<T> batch_mean_e(batch_mean_data, ic);
// saved_xx is use just in this batch of data EigenVectorArrayMap<T> batch_variance_e(batch_variance_data, ic);
EigenVectorArrayMap<T> saved_mean_e( ConstEigenVectorArrayMap<T> mean_e(mean_data, ic);
batch_mean->mutable_data<T>(ctx.GetPlace()), ic); ConstEigenVectorArrayMap<T> variance_e{variance_data, ic};
EigenVectorArrayMap<T> saved_variance_e(
batch_variance->mutable_data<T>(ctx.GetPlace()), ic); EigenVectorArrayMap<T> running_mean_e(mean_out_data, ic);
saved_mean_e.setZero(); EigenVectorArrayMap<T> running_variance_e(variance_out_data, ic);
saved_variance_e.setZero();
const unsigned int x_arr_size = in * ic;
ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, x_arr_size);
for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
saved_mean_e(nc % ic) += x_arr.col(nc).sum();
}
saved_mean_e /= in * sample_size;
for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
saved_variance_e(nc % ic) +=
(x_arr.col(nc) - saved_mean_e(nc % ic)).matrix().squaredNorm();
}
saved_variance_e /= in * sample_size;
ConstEigenVectorArrayMap<T> mean_arr{mean->data<T>(), ic};
ConstEigenVectorArrayMap<T> variance_arr{variance->data<T>(), ic};
EigenVectorArrayMap<T> running_mean_arr(
mean_out->mutable_data<T>(ctx.GetPlace()), ic);
EigenVectorArrayMap<T> running_var_arr(
variance_out->mutable_data<T>(ctx.GetPlace()), ic);
auto one_minus_momentum = 1. - momentum; auto one_minus_momentum = 1. - momentum;
running_mean_arr = running_mean_e = mean_e * momentum + batch_mean_e * one_minus_momentum;
mean_arr * momentum + saved_mean_e * one_minus_momentum; running_variance_e =
running_var_arr = variance_e * momentum + batch_variance_e * one_minus_momentum;
variance_arr * momentum + saved_variance_e * one_minus_momentum;
} }
y->set_layout(DataLayout::kMKLDNN);
y->set_format(
(memory::format)dst_memory.get_primitive_desc().desc().data.format);
} }
}; };
...@@ -217,11 +212,6 @@ template <typename T> ...@@ -217,11 +212,6 @@ template <typename T>
class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
public: public:
void Compute(const paddle::framework::ExecutionContext &ctx) const override { void Compute(const paddle::framework::ExecutionContext &ctx) const override {
auto data_layout_str = ctx.Attr<std::string>("data_layout");
auto data_layout = framework::StringToDataLayout(data_layout_str);
PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
"MKLDNN batch normalization handles only NCHW data layout");
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>(); auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
auto mkldnn_engine = dev_ctx.GetEngine(); auto mkldnn_engine = dev_ctx.GetEngine();
...@@ -238,88 +228,132 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -238,88 +228,132 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
auto *diff_scale = ctx.Output<Tensor>(framework::GradVarName("Scale")); auto *diff_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *diff_shift = ctx.Output<Tensor>(framework::GradVarName("Bias")); auto *diff_shift = ctx.Output<Tensor>(framework::GradVarName("Bias"));
diff_x->mutable_data<T>(ctx.GetPlace()); PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN &&
diff_scale->mutable_data<T>(ctx.GetPlace()); diff_y->format() != memory::format::format_undef,
diff_shift->mutable_data<T>(ctx.GetPlace()); "Wrong layout/format set for Input diff_y tensor");
const T *x_data = x->data<T>();
const T *diff_y_data = diff_y->data<T>();
const T *batch_mean_data = batch_mean->data<T>();
const T *batch_variance_data = batch_variance->data<T>();
const T *scale_data = scale->data<T>();
const T *shift_data = shift->data<T>();
T *diff_x_data = diff_x->mutable_data<T>(ctx.GetPlace());
T *diff_scale_data = diff_scale->mutable_data<T>(ctx.GetPlace());
T *diff_shift_data = diff_shift->mutable_data<T>(ctx.GetPlace());
auto src_tz = paddle::framework::vectorize2int(x->dims());
auto diff_src_tz = src_tz;
auto dst_tz = src_tz;
auto diff_dst_tz = dst_tz;
auto scale_tz = paddle::framework::vectorize2int(scale->dims());
PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1");
const unsigned int ic = scale_tz[0];
// Retrieve bn_fwd_pd from device context
const std::string key = ctx.op().Input("SavedMean");
const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
auto batch_norm_fwd_pd =
std::static_pointer_cast<batch_norm_fwd::primitive_desc>(
dev_ctx.GetBlob(key_batch_norm_fwd_pd));
PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr,
"Fail to find batch_norm_fwd_pd in device context");
auto dims = paddle::framework::vectorize2int(x->dims()); using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
unsigned flags = mkldnn::use_scale_shift | !mkldnn::use_global_stats;
auto src_md = // create mkldnn memory from input diff_y tensor
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); auto user_diff_dst_memory =
auto dst_md = memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()},
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); mkldnn_engine},
auto diff_src_md = to_void_cast(diff_y_data));
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
auto diff_dst_md =
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>; // create mkldnn memory from input x tensor
using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>; auto src_memory =
memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
to_void_cast(x_data));
auto batch_norm_fwd_desc = bn_fwd_types::op_desc{ // for diff_dst, try to use same format as dst in forward pass
mkldnn::prop_kind::forward_training, src_md, epsilon, flags}; auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc();
auto batch_norm_fwd_pd = auto diff_dst_md = diff_dst_pd.desc();
bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine};
// create primitive descriptor for batch norm backward
unsigned flags = mkldnn::use_scale_shift;
auto batch_norm_bwd_desc = bn_bwd_types::op_desc{ auto batch_norm_bwd_desc = bn_bwd_types::op_desc{
mkldnn::prop_kind::backward, diff_dst_md, dst_md, epsilon, flags}; mkldnn::prop_kind::backward, diff_dst_md,
src_memory.get_primitive_desc().desc(), epsilon, flags};
auto batch_norm_bwd_pd = bn_bwd_types::op_prim{ auto batch_norm_bwd_pd = bn_bwd_types::op_prim{
batch_norm_bwd_desc, mkldnn_engine, batch_norm_fwd_pd}; batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd};
auto src = mkldnn::memory{{src_md, mkldnn_engine}, // reorder user_diff_dst if it's not in preferred format
cast_const_to_void(x->data<T>())}; auto diff_dst_memory = user_diff_dst_memory;
primitive reorder_diff_dst;
auto mean = mkldnn::memory{batch_norm_bwd_pd.mean_primitive_desc(), bool is_diff_dst_reordered = false;
cast_const_to_void(batch_mean->data<T>())}; if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) {
diff_dst_memory = memory(diff_dst_pd);
auto variance = reorder_diff_dst = reorder(user_diff_dst_memory, diff_dst_memory);
mkldnn::memory{batch_norm_bwd_pd.variance_primitive_desc(), is_diff_dst_reordered = true;
cast_const_to_void(batch_variance->data<T>())}; }
auto diff_dst = mkldnn::memory{{diff_dst_md, mkldnn_engine},
cast_const_to_void(diff_y->data<T>())};
const unsigned int ic = dims[1]; // create mkldnn memory for input tensors (src/mean/variance)
auto mean_memory = memory(batch_norm_bwd_pd.mean_primitive_desc(),
to_void_cast(batch_mean_data));
auto variance_memory = memory(batch_norm_bwd_pd.variance_primitive_desc(),
to_void_cast(batch_variance_data));
// MKLDNN requires a single piece of memory for scale and shift/bias data
const size_t scaleshift_size = 2 * ic; const size_t scaleshift_size = 2 * ic;
std::vector<T> scaleshift_data; std::vector<T> scaleshift_data;
scaleshift_data.reserve(scaleshift_size); scaleshift_data.reserve(scaleshift_size);
copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(), copy_to_weights(scale_data, scale_data + ic, shift_data, shift_data + ic,
shift->data<T>() + ic, &scaleshift_data); &scaleshift_data);
auto scaleshift_memory = mkldnn::memory{ // create mkldnn memory for input tensors (scale/shift)
batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()}; auto scaleshift_memory = memory(batch_norm_bwd_pd.weights_primitive_desc(),
scaleshift_data.data());
// create mkldnn memory for output diff weights (combined scale/shift)
std::vector<T> diff_scaleshift_data; std::vector<T> diff_scaleshift_data;
diff_scaleshift_data.reserve(scaleshift_size); diff_scaleshift_data.reserve(scaleshift_size);
copy_to_weights(diff_scale->data<T>(), diff_scale->data<T>() + ic,
diff_shift->data<T>(), diff_shift->data<T>() + ic,
&diff_scaleshift_data);
auto diff_scaleshift_memory = auto diff_scaleshift_memory =
mkldnn::memory{batch_norm_bwd_pd.diff_weights_primitive_desc(), memory(batch_norm_bwd_pd.diff_weights_primitive_desc(),
diff_scaleshift_data.data()}; diff_scaleshift_data.data());
auto diff_src = mkldnn::memory{{diff_src_md, mkldnn_engine}, // here assume diff_src is in the same format of src
static_cast<void *>(diff_x->data<T>())}; auto diff_src_memory = memory(src_memory.get_primitive_desc(), diff_x_data);
run_batch_norm_op<bn_bwd_types::op_type>( // finally create batch_norm backward primitive
batch_norm_bwd_pd, src, mean, variance, diff_dst, scaleshift_memory, auto batch_norm_bwd_prim =
diff_src, diff_scaleshift_memory); batch_norm_bwd(batch_norm_bwd_pd, src_memory, mean_memory,
variance_memory, diff_dst_memory, scaleshift_memory,
diff_src_memory, diff_scaleshift_memory);
// execute optional reorder and batch_norm backward primitive
std::vector<primitive> pipeline;
if (is_diff_dst_reordered) pipeline.push_back(reorder_diff_dst);
pipeline.push_back(batch_norm_bwd_prim);
stream(stream::kind::eager).submit(pipeline).wait();
// copy back diff scale/shift to output tensors (diff scale/shift)
diff_scaleshift_data.resize(scaleshift_size);
auto it = std::begin(diff_scaleshift_data); auto it = std::begin(diff_scaleshift_data);
std::copy(it, std::next(it, ic), diff_scale->data<T>()); std::copy(it, std::next(it, ic), diff_scale_data);
std::copy(std::next(it, ic), std::end(diff_scaleshift_data), std::copy(std::next(it, ic), std::end(diff_scaleshift_data),
diff_shift->data<T>()); diff_shift_data);
// set layout/format of output tensors
diff_x->set_layout(DataLayout::kMKLDNN);
diff_x->set_format((memory::format)diff_src_memory.get_primitive_desc()
.desc()
.data.format);
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_KERNEL(batch_norm, MKLDNN, paddle::platform::CPUPlace, REGISTER_OP_KERNEL(batch_norm, MKLDNN, ::paddle::platform::CPUPlace,
ops::BatchNormMKLDNNOpKernel<float>); ops::BatchNormMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, paddle::platform::CPUPlace, REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, ::paddle::platform::CPUPlace,
ops::BatchNormMKLDNNGradOpKernel<float>); ops::BatchNormMKLDNNGradOpKernel<float>);
...@@ -110,19 +110,19 @@ class BatchNormOp : public framework::OperatorWithKernel { ...@@ -110,19 +110,19 @@ class BatchNormOp : public framework::OperatorWithKernel {
ctx.Input<Tensor>("Variance")->type()), ctx.Input<Tensor>("Variance")->type()),
"Variance input should be of float type"); "Variance input should be of float type");
framework::LibraryType library_{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout; framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain && if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN; library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN; layout = framework::DataLayout::kMKLDNN;
} }
#endif #endif
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library_); library);
} }
}; };
...@@ -370,19 +370,21 @@ class BatchNormGradOp : public framework::OperatorWithKernel { ...@@ -370,19 +370,21 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
PADDLE_THROW("can't find Y@GRAD"); PADDLE_THROW("can't find Y@GRAD");
} }
framework::LibraryType library_{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain && if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN; library = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN; layout = framework::DataLayout::kMKLDNN;
} }
#endif #endif
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(), framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
layout_, library_); layout, library);
} }
}; };
......
...@@ -20,7 +20,7 @@ namespace reader { ...@@ -20,7 +20,7 @@ namespace reader {
class BatchReader : public framework::DecoratedReader { class BatchReader : public framework::DecoratedReader {
public: public:
BatchReader(ReaderBase* reader, int batch_size) BatchReader(const std::shared_ptr<ReaderBase>& reader, int batch_size)
: DecoratedReader(reader), batch_size_(batch_size) { : DecoratedReader(reader), batch_size_(batch_size) {
buffer_.reserve(batch_size_); buffer_.reserve(batch_size_);
} }
......
...@@ -22,7 +22,8 @@ namespace reader { ...@@ -22,7 +22,8 @@ namespace reader {
class CustomReader : public framework::DecoratedReader { class CustomReader : public framework::DecoratedReader {
public: public:
CustomReader(ReaderBase* reader, const framework::BlockDesc& sub_block, CustomReader(const std::shared_ptr<ReaderBase>& reader,
const framework::BlockDesc& sub_block,
const std::vector<std::string>& source_var_names, const std::vector<std::string>& source_var_names,
const std::vector<std::string>& sink_var_names) const std::vector<std::string>& sink_var_names)
: DecoratedReader(reader), : DecoratedReader(reader),
......
...@@ -34,7 +34,8 @@ static constexpr size_t kChannelSize = 1; // kCacheSize - 2 ...@@ -34,7 +34,8 @@ static constexpr size_t kChannelSize = 1; // kCacheSize - 2
class DoubleBufferReader : public framework::DecoratedReader { class DoubleBufferReader : public framework::DecoratedReader {
public: public:
explicit DoubleBufferReader( explicit DoubleBufferReader(
ReaderBase* reader, platform::Place target_place = platform::CPUPlace()) const std::shared_ptr<ReaderBase>& reader,
platform::Place target_place = platform::CPUPlace())
: DecoratedReader(reader), place_(target_place) { : DecoratedReader(reader), place_(target_place) {
cpu_tensor_cache_.resize(kCacheSize); cpu_tensor_cache_.resize(kCacheSize);
gpu_tensor_cache_.resize(kCacheSize); gpu_tensor_cache_.resize(kCacheSize);
......
...@@ -21,7 +21,7 @@ namespace reader { ...@@ -21,7 +21,7 @@ namespace reader {
class MultiPassReader : public framework::DecoratedReader { class MultiPassReader : public framework::DecoratedReader {
public: public:
MultiPassReader(ReaderBase* reader, int pass_num) MultiPassReader(const std::shared_ptr<ReaderBase>& reader, int pass_num)
: DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {} : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {}
void ReadNext(std::vector<framework::LoDTensor>* out) override { void ReadNext(std::vector<framework::LoDTensor>* out) override {
......
...@@ -23,7 +23,8 @@ namespace reader { ...@@ -23,7 +23,8 @@ namespace reader {
class ShuffleReader : public framework::DecoratedReader { class ShuffleReader : public framework::DecoratedReader {
public: public:
ShuffleReader(ReaderBase* reader, size_t buffer_size, size_t seed = 0) ShuffleReader(const std::shared_ptr<ReaderBase>& reader, size_t buffer_size,
size_t seed = 0)
: DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) { : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) {
VLOG(10) << "Create shuffle reader of " << reader_; VLOG(10) << "Create shuffle reader of " << reader_;
if (seed_ == 0) { if (seed_ == 0) {
......
...@@ -21,7 +21,8 @@ namespace reader { ...@@ -21,7 +21,8 @@ namespace reader {
class ThreadedReader : public framework::DecoratedReader { class ThreadedReader : public framework::DecoratedReader {
public: public:
explicit ThreadedReader(ReaderBase* reader) : DecoratedReader(reader) {} explicit ThreadedReader(const std::shared_ptr<ReaderBase>& reader)
: DecoratedReader(reader) {}
void ReadNext(std::vector<framework::LoDTensor>* out) override { void ReadNext(std::vector<framework::LoDTensor>* out) override {
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
......
...@@ -21,12 +21,17 @@ limitations under the License. */ ...@@ -21,12 +21,17 @@ limitations under the License. */
#include <unistd.h> #include <unistd.h>
#endif #endif
#include <algorithm>
#include "gflags/gflags.h" #include "gflags/gflags.h"
DEFINE_double(fraction_of_cpu_memory_to_use, 1, DEFINE_double(fraction_of_cpu_memory_to_use, 1,
"Default use 100% of CPU memory for PaddlePaddle," "Default use 100% of CPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc"); "reserve the rest for page tables, etc");
DEFINE_uint64(
initial_cpu_memory_in_mb, 500,
"Default initial 500MB of CPU memory for PaddlePaddle, in MD unit.");
DEFINE_double( DEFINE_double(
fraction_of_cuda_pinned_memory_to_use, 0.5, fraction_of_cuda_pinned_memory_to_use, 0.5,
"Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
...@@ -54,7 +59,10 @@ inline size_t CpuTotalPhysicalMemory() { ...@@ -54,7 +59,10 @@ inline size_t CpuTotalPhysicalMemory() {
size_t CpuMaxAllocSize() { size_t CpuMaxAllocSize() {
// For distributed systems, it requires configuring and limiting // For distributed systems, it requires configuring and limiting
// the fraction of memory to use. // the fraction of memory to use.
return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); return std::min(
static_cast<size_t>(FLAGS_fraction_of_cpu_memory_to_use *
CpuTotalPhysicalMemory()),
static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
} }
size_t CpuMinChunkSize() { size_t CpuMinChunkSize() {
......
...@@ -382,7 +382,7 @@ class Operator(object): ...@@ -382,7 +382,7 @@ class Operator(object):
'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
'ncclInit', 'channel_create', 'channel_close', 'channel_send', 'ncclInit', 'channel_create', 'channel_close', 'channel_send',
'channel_recv', 'select' 'channel_recv', 'select', 'gen_nccl_id'
} }
def __init__(self, def __init__(self,
......
...@@ -261,9 +261,10 @@ def embedding(input, ...@@ -261,9 +261,10 @@ def embedding(input,
return tmp return tmp
# TODO(qijun): expose H0 and C0
def dynamic_lstm(input, def dynamic_lstm(input,
size, size,
h_0=None,
c_0=None,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
use_peepholes=True, use_peepholes=True,
...@@ -324,6 +325,13 @@ def dynamic_lstm(input, ...@@ -324,6 +325,13 @@ def dynamic_lstm(input,
(T X 4D), where T is the total time steps in this (T X 4D), where T is the total time steps in this
mini-batch, D is the hidden size. mini-batch, D is the hidden size.
size(int): 4 * hidden size. size(int): 4 * hidden size.
h_0(Variable): The initial hidden state, an optional input that defaults to zero.
This is a tensor with shape (N x D), where N is the
batch size and D is the hidden size.
c_0(Variable): The initial cell state, an optional input that defaults to zero.
This is a tensor with shape (N x D), where N is the
batch size and D is the hidden size. `h_0` and `c_0` may be None, but only both at the same time.
param_attr(ParamAttr|None): The parameter attribute for the learnable param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weights. hidden-hidden weights.
...@@ -387,12 +395,20 @@ def dynamic_lstm(input, ...@@ -387,12 +395,20 @@ def dynamic_lstm(input,
cell = helper.create_tmp_variable(dtype) cell = helper.create_tmp_variable(dtype)
batch_gate = helper.create_tmp_variable(dtype) batch_gate = helper.create_tmp_variable(dtype)
batch_cell_pre_act = helper.create_tmp_variable(dtype) batch_cell_pre_act = helper.create_tmp_variable(dtype)
inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
batch_size = input.shape[0]
if h_0:
assert h_0.shape == (batch_size, size), \
'The shape of h0 should be (batch_size, %d)' % size
inputs['H0'] = h_0
if c_0:
assert c_0.shape == (batch_size, size), \
'The shape of c0 should be (batch_size, %d)' % size
inputs['C0'] = c_0
helper.append_op( helper.append_op(
type='lstm', type='lstm',
inputs={'Input': input, inputs=inputs,
'Weight': weight,
'Bias': bias},
outputs={ outputs={
'Hidden': hidden, 'Hidden': hidden,
'Cell': cell, 'Cell': cell,
...@@ -677,11 +693,13 @@ def dynamic_gru(input, ...@@ -677,11 +693,13 @@ def dynamic_gru(input,
attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
bias = helper.create_parameter( bias = helper.create_parameter(
attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True) attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
batch_size = input.shape[0]
inputs = {'Input': input, 'Weight': weight, 'Bias': bias} inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
if h_0 != None: if h_0 != None:
assert h_0.shape == ( assert h_0.shape == (
size, size), 'The shape of h0 should be(%d, %d)' % (size, size) batch_size, size
inputs['h0'] = h_0 ), 'The shape of h0 should be(batch_size, %d)' % size
inputs['H0'] = h_0
hidden = helper.create_tmp_variable(dtype) hidden = helper.create_tmp_variable(dtype)
batch_gate = helper.create_tmp_variable(dtype) batch_gate = helper.create_tmp_variable(dtype)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册