diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst
index dc0b52b14fd242dfaded1cb9a8e0ab9eb66b0607..96682c8f9fbb8683ad690a7c5809865e37a1920e 100644
--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
@@ -32,6 +32,15 @@ Normal
     :members:
     :noindex:
 
+.. _api_fluid_initializer_TruncatedNormal:
+
+TruncatedNormal
+---------------
+
+.. autoclass:: paddle.fluid.initializer.TruncatedNormal
+    :members:
+    :noindex:
+
 .. _api_fluid_initializer_Xavier:
 
 Xavier
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index bbd9429e30f4ae0dd56c5a89f71ad6e5391a7edf..534acffa7ffdc07bfcbfb328b540221c2b988de5 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -79,6 +79,7 @@ paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program
 paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False))
 paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0))
 paddle.fluid.initializer.NormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0))
+paddle.fluid.initializer.TruncatedNormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0))
 paddle.fluid.initializer.XavierInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'fan_out', 'seed'], varargs=None, keywords=None, defaults=(True, None, None, 0))
 paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0))
@@ -124,7 +125,7 @@ paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name
 paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None))
 paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None))
-paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'name'], varargs=None, keywords=None, defaults=(False, False, None))
+paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None))
 paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times'], varargs=None, keywords=None, defaults=(0, False))
 paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
@@ -168,6 +169,7 @@ paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=Non
 paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None))
 paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None))
 paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None))
+paddle.fluid.layers.expand ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_concat ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
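For context, the two user-visible changes recorded in API.spec above (the `TruncatedNormal` initializer and the new `alpha` argument of `layers.matmul`) can be exercised like this. This is a minimal usage sketch, not code from this patch; the shapes are invented and the `create_parameter` call assumes the existing fluid API:

```python
import paddle.fluid as fluid

# TruncatedNormal draws from N(loc, scale) and keeps every sample within
# two standard deviations (see the new truncated_gaussian_random op below).
w = fluid.layers.create_parameter(
    shape=[128, 64],
    dtype='float32',
    default_initializer=fluid.initializer.TruncatedNormal(loc=0.0, scale=0.02))

x = fluid.layers.data(name='x', shape=[128], dtype='float32')
# New `alpha` argument: out = alpha * (x . w), here folding in a 1/sqrt(d) scale.
out = fluid.layers.matmul(x, w, alpha=128 ** -0.5)
```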
diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc
index 6bcfc6cd55f02f0d4f0f6e3170e7cc19ce666a28..fee6ba40047053ed5662fe044eceb0c687bd4db9 100644
--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
@@ -25,6 +25,10 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
       in.place().which(), dst_place.which(),
       "Currently, model parallelism is only supported between CPU and CUDA");
 
+  // NOTE(yy): TransDataDevice should wait for computation of input.
+  platform::DeviceContextPool::Instance().Get(in.place())->Wait();
+  platform::DeviceContextPool::Instance().Get(dst_place)->Wait();
+
   // FIXME(zcd): TransDataDevice is used to transform data from GPU to CPU and
   // the enforced checkings have been done in GetDeviceContext, so the
   // `dev_ctx->Wait()` is necessary. But `dev_ctx->Wait()` will make the program
diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h
index b4d3fa25c35fbf25b3d2fdd9fa1045dda0f773ec..9bccb1a32bf63b30351ef4428594691b0eef0b6a 100644
--- a/paddle/fluid/framework/grad_op_desc_maker.h
+++ b/paddle/fluid/framework/grad_op_desc_maker.h
@@ -129,6 +129,9 @@ class GradOpDescMakerBase {
 
   std::string ForwardOpType() const { return this->fwd_op_.Type(); }
 
+ protected:
+  const OpDesc& ForwardOp() const { return fwd_op_; }
+
  private:
   const OpDesc& fwd_op_;
   const std::unordered_set<std::string>& no_grad_set_;
diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc
index 57c1b822d8d4f095f33cba2bfd5210f7ee19dd9f..0afcd85fe7c2e6806eb67797300e2731977573fb 100644
--- a/paddle/fluid/framework/prune.cc
+++ b/paddle/fluid/framework/prune.cc
@@ -183,28 +183,5 @@ void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) {
   output->clear_blocks();
   prune_impl(input, output, 0, -1, &dependent_vars);
 }
-
-void inference_optimize_impl(proto::ProgramDesc* input, int block_id) {
-  auto* op_field = input->mutable_blocks(block_id)->mutable_ops();
-  for (auto& op_desc : *op_field) {
-    for (auto& attr : *op_desc.mutable_attrs()) {
-      if (attr.name() == "is_test") {
-        attr.set_b(true);
-        break;
-      }
-    }
-  }
-}
-
-void InferenceOptimize(const proto::ProgramDesc& input,
-                       proto::ProgramDesc* output) {
-  *output = input;
-  int num_blocks = output->blocks_size();
-  PADDLE_ENFORCE_GT(num_blocks, 0, "ProgramDesc must have at least one block");
-  for (int i = 0; i < num_blocks; ++i) {
-    inference_optimize_impl(output, i);
-  }
-}
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/prune.h b/paddle/fluid/framework/prune.h
index 4c5a1dedd9950553e93dc681a40af179bdb8a30d..1be7cd25d099a18e6b7e2afaf34b1632f881b823 100644
--- a/paddle/fluid/framework/prune.h
+++ b/paddle/fluid/framework/prune.h
@@ -22,8 +22,5 @@ namespace framework {
 
 void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output);
 
-void InferenceOptimize(const proto::ProgramDesc& input,
-                       proto::ProgramDesc* output);
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc
index bc725e53574e813db94ad690c3bdc90454f68e1e..b8b8b2290a0f002fd379032e28590b84a1da38e9 100644
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
@@ -25,7 +25,7 @@ namespace operators {
 
 using LoD = framework::LoD;
 
-class ArrayToLoDFunctor;
+struct ArrayToLoDFunctor;
 template <typename DeviceContext>
 struct ArrayToLoDFunctorImpl {
   const ArrayToLoDFunctor *prev_functor_;
diff --git a/paddle/fluid/operators/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise_mul_op.cc
index 7cd67e74de6b9c4fbc718f60b4f671ccab2f9956..86a8459a79135d1fbcba6886172acc5a2abdb88b 100644
--- a/paddle/fluid/operators/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise_mul_op.cc
@@ -13,9 +13,45 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise_mul_op.h"
+#include <string>
 #include "paddle/fluid/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ElementwiseMulOpGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("elementwise_mul_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput("Y", Input("Y"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetAttrMap(Attrs());
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
+    return op;
+  }
+};
+
+class ElementwiseMulOpMaker : public ElementwiseOpMaker {
+ protected:
+  virtual std::string GetName() const { return "Mul"; }
+  virtual std::string GetEquation() const { return "Out = X \\\\odot Y"; }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
-REGISTER_ELEMWISE_OP(elementwise_mul, "Mul", "Out = X \\\\odot Y");
+REGISTER_OPERATOR(elementwise_mul, ops::ElementwiseOp,
+                  ops::ElementwiseMulOpMaker, ops::ElementwiseOpInferVarType,
+                  ops::ElementwiseMulOpGradDescMaker);
+REGISTER_OPERATOR(elementwise_mul_grad, ops::ElementwiseOpGrad);
+
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul,
     ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_mul_op.h b/paddle/fluid/operators/elementwise_mul_op.h
index 4437da4d95f97b5cbbca1650badf9710c26b4380..b870d08a1a28fd3e678aeb7211f7e3ec8b2c4c65 100644
--- a/paddle/fluid/operators/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise_mul_op.h
@@ -93,8 +93,8 @@ class ElementwiseMulGradKernel : public ElemwiseGradKernel<T> {
     auto* x = ctx.Input<Tensor>("X");
     auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* out = dout;  // out is not necessary
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     int axis = ctx.Attr<int>("axis");
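The kernel change above works because multiplication's backward never reads the forward output: for Out = X ⊙ Y, dX = dOut ⊙ Y and dY = dOut ⊙ X. A NumPy sketch of what `elementwise_mul_grad` computes (same-shape case only, ignoring the `axis` broadcast):

```python
import numpy as np

def elementwise_mul_grad(x, y, dout):
    """Backward of out = x * y; note that `out` itself never appears."""
    dx = dout * y  # d(out)/d(x) = y
    dy = dout * x  # d(out)/d(y) = x
    return dx, dy

x = np.random.rand(3, 4).astype('float32')
y = np.random.rand(3, 4).astype('float32')
dout = np.ones_like(x)  # upstream gradient
dx, dy = elementwise_mul_grad(x, y, dout)
assert np.allclose(dx, y) and np.allclose(dy, x)
```

Because the custom GradDescMaker no longer lists `Out` as an input of the grad op, the runtime is free to release the forward output's buffer before the backward pass runs.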
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index 7182149164854038bb67a9f06cdbec8a4a0f1fb2..242a1b9ae92ade0caf1b0f1fcb5458b8b7070d84 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -59,7 +59,8 @@ class MatMulKernel : public framework::OpKernel<T> {
         RowMatrixFromVector(x.dims()), 0, context.Attr<bool>("transpose_X"));
     auto mat_dim_b = math::CreateMatrixDescriptor(
         ColumnMatrixFromVector(y.dims()), 0, context.Attr<bool>("transpose_Y"));
-    blas.MatMul(x, mat_dim_a, y, mat_dim_b, T(1), out, T(0));
+    auto scale = static_cast<T>(context.Attr<float>("alpha"));
+    blas.MatMul(x, mat_dim_a, y, mat_dim_b, scale, out, T(0));
   }
 };
 
@@ -185,7 +186,8 @@ class MatMulGradKernel : public framework::OpKernel<T> {
     auto blas = math::GetBlas<DeviceContext, T>(context);
     auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a);
     auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b);
-    blas.MatMul(a, mat_dim_a, b, mat_dim_b, T(1), out, T(0));
+    blas.MatMul(a, mat_dim_a, b, mat_dim_b,
+                static_cast<T>(context.Attr<float>("alpha")), out, T(0));
   }
 
   void CalcInputGrad(const framework::ExecutionContext &context,
@@ -334,6 +336,7 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
              R"DOC(If true, use the transpose of `Y`.
         )DOC")
         .SetDefault(false);
+    AddAttr<float>("alpha", "The scale of Out").SetDefault(1.0f);
     AddComment(R"DOC(
 MatMul Operator.
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index 2a8e4af516ce9341772d4668dc993215b4aae24d..363abfb0e0c96e8a4d82124dff168f28e339a9ae 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -156,12 +156,29 @@ class MulGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class MulOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> retv(new framework::OpDesc());
+    retv->SetType("mul_grad");
+    retv->SetInput("X", Input("X"));
+    retv->SetInput("Y", Input("Y"));
+    retv->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    retv->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    retv->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
+    retv->SetAttrMap(Attrs());
+    return retv;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpGradMaker);
 REGISTER_OPERATOR(mul_grad, ops::MulGradOp);
 REGISTER_OP_CPU_KERNEL(
     mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>,
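`MulOpGradMaker` mirrors the elementwise change: `mul_grad` consumes only X, Y, and the output gradient, so registering it instead of `DefaultGradOpDescMaker` (which forwards every forward input *and* output to the grad op) lets `Out` be freed early. For `matmul`, the new `alpha` scales the product, and since d(αXY) = α·d(XY), the grad kernel reads the same attribute. A NumPy sketch, illustrative and 2-D only:

```python
import numpy as np

def matmul_with_alpha(x, y, alpha=1.0):
    return alpha * (x @ y)            # what MatMulKernel now computes

def matmul_grad_with_alpha(x, y, dout, alpha=1.0):
    dx = alpha * (dout @ y.T)         # alpha folds into both gradients,
    dy = alpha * (x.T @ dout)         # mirroring MatMulGradKernel
    return dx, dy

x = np.random.rand(4, 8).astype('float32')
y = np.random.rand(8, 5).astype('float32')
out = matmul_with_alpha(x, y, alpha=0.5)
dx, dy = matmul_grad_with_alpha(x, y, np.ones_like(out), alpha=0.5)
```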
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index c614de2eac143b3a545c60226aefa93dd72dea4f..13be6c65be58314a75124106eb09b1300305baf0 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -52,6 +52,12 @@ $$Out = scale*X$$
 )DOC");
     AddAttr<float>("scale", "The scaling factor of the scale operator.")
         .SetDefault(1.0);
+    AddAttr<float>("bias", "The bias of the scale operator.").SetDefault(0.0);
+    AddAttr<bool>(
+        "bias_after_scale",
+        "Apply bias addition after or before scaling. It is useful for "
+        "numeric stability in some circumstances.")
+        .SetDefault(true);
   }
 };
 
@@ -80,6 +86,8 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker {
     grad_op->SetInput("X", OutputGrad("Out"));
     grad_op->SetOutput("Out", InputGrad("X"));
     grad_op->SetAttr("scale", GetAttr("scale"));
+    grad_op->SetAttr("bias", 0.0f);
+    grad_op->SetAttr("bias_after_scale", true);
     return std::unique_ptr<framework::OpDesc>(grad_op);
   }
 };
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
index fe035aba81dd74d21539974beed255275be3013b..d8a199bc2b860515645b4954b49d8eb59fbd02dc 100644
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -34,6 +34,8 @@ class ScaleKernel : public framework::OpKernel<T> {
                    "in and out should have the same dim");
 
     auto scale = static_cast<T>(ctx.Attr<float>("scale"));
+    auto bias = static_cast<T>(ctx.Attr<float>("bias"));
+    auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
 
     if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
       auto& in_slr = in_var->Get<framework::SelectedRows>();
@@ -45,7 +47,11 @@
     auto eigen_out = framework::EigenVector<T>::Flatten(*out);
     auto eigen_in = framework::EigenVector<T>::Flatten(*in);
     auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
-    eigen_out.device(dev) = scale * eigen_in;
+    if (bias_after_scale) {
+      eigen_out.device(dev) = scale * eigen_in + bias;
+    } else {
+      eigen_out.device(dev) = scale * (eigen_in + bias);
+    }
   }
 };
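The two kernel branches differ only in where `bias` enters; a small numeric illustration with made-up values:

```python
import numpy as np

x, scale, bias = np.array([1.0, 2.0]), 3.0, 0.5

out_after = scale * x + bias     # bias_after_scale=True  -> [3.5, 6.5]
out_before = scale * (x + bias)  # bias_after_scale=False -> [4.5, 7.5]

# The grad maker pins bias=0 and bias_after_scale=True for the backward op,
# consistent with d(scale * x + bias) / dx = scale regardless of bias.
dout = np.ones_like(x)
dx = scale * dout
```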
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d854e2803975543b51c50ea2bc173322d3c3ca5e
--- /dev/null
+++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc
@@ -0,0 +1,255 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cmath>
+#include <limits>
+#include <random>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// reference: https://gist.github.com/lakshayg/d80172fe5ae3c5d2c2aedb53c250320e
+template <typename T>
+T Erfinv(T x) {
+  if (x < -1 || x > 1) {
+    return std::numeric_limits<T>::quiet_NaN();
+  } else if (x == 1.0) {
+    return std::numeric_limits<T>::infinity();
+  } else if (x == -1.0) {
+    return -std::numeric_limits<T>::infinity();
+  }
+
+  const T LN2 = 6.931471805599453094172321214581e-1;
+
+  const T A0 = 1.1975323115670912564578e0;
+  const T A1 = 4.7072688112383978012285e1;
+  const T A2 = 6.9706266534389598238465e2;
+  const T A3 = 4.8548868893843886794648e3;
+  const T A4 = 1.6235862515167575384252e4;
+  const T A5 = 2.3782041382114385731252e4;
+  const T A6 = 1.1819493347062294404278e4;
+  const T A7 = 8.8709406962545514830200e2;
+
+  const T B0 = 1.0000000000000000000e0;
+  const T B1 = 4.2313330701600911252e1;
+  const T B2 = 6.8718700749205790830e2;
+  const T B3 = 5.3941960214247511077e3;
+  const T B4 = 2.1213794301586595867e4;
+  const T B5 = 3.9307895800092710610e4;
+  const T B6 = 2.8729085735721942674e4;
+  const T B7 = 5.2264952788528545610e3;
+
+  const T C0 = 1.42343711074968357734e0;
+  const T C1 = 4.63033784615654529590e0;
+  const T C2 = 5.76949722146069140550e0;
+  const T C3 = 3.64784832476320460504e0;
+  const T C4 = 1.27045825245236838258e0;
+  const T C5 = 2.41780725177450611770e-1;
+  const T C6 = 2.27238449892691845833e-2;
+  const T C7 = 7.74545014278341407640e-4;
+
+  const T D0 = 1.4142135623730950488016887e0;
+  const T D1 = 2.9036514445419946173133295e0;
+  const T D2 = 2.3707661626024532365971225e0;
+  const T D3 = 9.7547832001787427186894837e-1;
+  const T D4 = 2.0945065210512749128288442e-1;
+  const T D5 = 2.1494160384252876777097297e-2;
+  const T D6 = 7.7441459065157709165577218e-4;
+  const T D7 = 1.4859850019840355905497876e-9;
+
+  const T E0 = 6.65790464350110377720e0;
+  const T E1 = 5.46378491116411436990e0;
+  const T E2 = 1.78482653991729133580e0;
+  const T E3 = 2.96560571828504891230e-1;
+  const T E4 = 2.65321895265761230930e-2;
+  const T E5 = 1.24266094738807843860e-3;
+  const T E6 = 2.71155556874348757815e-5;
+  const T E7 = 2.01033439929228813265e-7;
+
+  const T F0 = 1.414213562373095048801689e0;
+  const T F1 = 8.482908416595164588112026e-1;
+  const T F2 = 1.936480946950659106176712e-1;
+  const T F3 = 2.103693768272068968719679e-2;
+  const T F4 = 1.112800997078859844711555e-3;
+  const T F5 = 2.611088405080593625138020e-5;
+  const T F6 = 2.010321207683943062279931e-7;
+  const T F7 = 2.891024605872965461538222e-15;
+
+  T abs_x = std::abs(x);
+
+  if (abs_x <= 0.85) {
+    T r = 0.180625 - 0.25 * x * x;
+    T num = (((((((A7 * r + A6) * r + A5) * r + A4) * r + A3) * r + A2) * r +
+              A1) * r + A0);
+    T den = (((((((B7 * r + B6) * r + B5) * r + B4) * r + B3) * r + B2) * r +
+              B1) * r + B0);
+    return x * num / den;
+  }
+
+  T r = std::sqrt(LN2 - std::log(1.0 - abs_x));
+
+  T num, den;
+  if (r <= 5.0) {
+    r = r - 1.6;
+    num = (((((((C7 * r + C6) * r + C5) * r + C4) * r + C3) * r + C2) * r +
+            C1) * r + C0);
+    den = (((((((D7 * r + D6) * r + D5) * r + D4) * r + D3) * r + D2) * r +
+            D1) * r + D0);
+  } else {
+    r = r - 5.0;
+    num = (((((((E7 * r + E6) * r + E5) * r + E4) * r + E3) * r + E2) * r +
+            E1) * r + E0);
+    den = (((((((F7 * r + F6) * r + F5) * r + F4) * r + F3) * r + F2) * r +
+            F1) * r + F0);
+  }
+
+  if (x < 0) {
+    return -num / den;
+  } else {
+    return num / den;
+  }
+}
+
+template <typename T>
+struct TruncatedNormal {
+  T mean, std;
+  T a_normal_cdf;
+  T b_normal_cdf;
+  TruncatedNormal(T mean, T std) : mean(mean), std(std) {
+    auto normal_cdf = [](T x) {
+      return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
+    };
+    a_normal_cdf = normal_cdf(-2.0);
+    b_normal_cdf = normal_cdf(2.0);
+  }
+
+  T operator()(T value) const {
+    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
+    // x = mean + std * Phi^{-1}(p), with Phi^{-1}(p) = sqrt(2) * erfinv(2p - 1)
+    return std::sqrt(2.0) * std * Erfinv(2 * p - 1) + mean;
+  }
+};
+
+template <typename T>
+class CPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    float mean = context.Attr<float>("mean");
+    float std = context.Attr<float>("std");
+    auto* tensor = context.Output<framework::Tensor>("Out");
+    T* data = tensor->mutable_data<T>(context.GetPlace());
+
+    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    std::minstd_rand engine;
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    engine.seed(seed);
+    std::uniform_real_distribution<T> dist(std::numeric_limits<T>::min(),
+                                           1.0);
+    TruncatedNormal<T> truncated_normal(mean, std);
+    int64_t size = tensor->numel();
+    for (int64_t i = 0; i < size; ++i) {
+      data[i] = truncated_normal(dist(engine));
+    }
+  }
+};
+
+class TruncatedGaussianRandomOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(Out) of TruncatedGaussianRandomOp should not be null.");
+    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    std::vector<int64_t> out_dim;
+    out_dim.reserve(shape.size());
+    for (auto dim : shape) {
+      out_dim.push_back(static_cast<int64_t>(dim));
+    }
+    PADDLE_ENFORCE(shape.size() > 0UL,
+                   "shape can be one int or array. shape must be set.");
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dim));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library{framework::LibraryType::kPlain};
+    framework::DataLayout layout{framework::DataLayout::kAnyLayout};
+    return framework::OpKernelType(
+        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
+        ctx.device_context(), layout, library);
+  }
+};
+
+class TruncatedGaussianRandomOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddOutput("Out", "Output tensor of truncated gaussian random op.");
+
+    AddAttr<std::vector<int>>("shape",
+                              "(vector<int>) "
+                              "The dimension of random tensor.");
+    AddAttr<float>("mean",
+                   "(float, default 0.0) "
+                   "mean of random tensor.")
+        .SetDefault(.0f);
+    AddAttr<float>("std",
+                   "(float, default 1.0) "
+                   "std of random tensor.")
+        .SetDefault(1.0f);
+    AddAttr<int>("seed",
+                 "(int, default 0) "
+                 "Random seed of generator. "
+                 "0 means to use the system-wide seed. "
+                 "Note that if seed is not 0, this operator will always "
+                 "generate the same random numbers every time.")
+        .SetDefault(0);
+    AddAttr<int>("dtype",
+                 "(int, default 5(FP32)) "
+                 "Output data type.")
+        .SetDefault(framework::proto::VarType::FP32);
+    AddComment(R"DOC(
+TruncatedGaussianRandom Operator.
+
+Used to initialize tensors with a truncated gaussian random generator.
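How the sampler works: draw u ~ U(0, 1), squeeze it into [Φ(−2), Φ(2)] (the functor's `a_normal_cdf`/`b_normal_cdf`) so that inverting the normal CDF via Φ⁻¹(p) = √2·erfinv(2p − 1) can only land within two standard deviations, then shift and scale by `mean` and `std`. A SciPy sketch of the same transform, illustrative only (`scipy` is not a Paddle dependency):

```python
import numpy as np
from math import erf, sqrt
from scipy.special import erfinv

def truncated_normal(mean, std, size):
    # Standard-normal CDF at the truncation points -2 and +2.
    a_cdf = (1 + erf(-2 / sqrt(2))) / 2
    b_cdf = (1 + erf(+2 / sqrt(2))) / 2
    # Lower bound mirrors std::numeric_limits<T>::min() in the CPU kernel.
    u = np.random.uniform(low=np.finfo(np.float32).tiny, high=1.0, size=size)
    p = a_cdf + (b_cdf - a_cdf) * u          # uniform over [Phi(-2), Phi(2)]
    return mean + std * sqrt(2) * erfinv(2 * p - 1)  # invert the CDF

samples = truncated_normal(0.0, 1.0, 10000)
assert samples.min() >= -2.0 and samples.max() <= 2.0  # within two sigmas
```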
+ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(truncated_gaussian_random, + ops::TruncatedGaussianRandomOp, + ops::TruncatedGaussianRandomOpMaker); +REGISTER_OP_CPU_KERNEL(truncated_gaussian_random, + ops::CPUTruncatedGaussianRandomKernel); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cu b/paddle/fluid/operators/truncated_gaussian_random_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ad2a9021bfe344d838dff2040b3fb9371274e218 --- /dev/null +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cu @@ -0,0 +1,76 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +template +struct TruncatedNormal { + T mean, std; + T a_normal_cdf; + T b_normal_cdf; + unsigned int seed; + T numeric_min; + + __host__ __device__ TruncatedNormal(T mean, T std, T numeric_min, int seed) + : mean(mean), std(std), seed(seed), numeric_min(numeric_min) { + a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; + b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; + } + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed); + thrust::uniform_real_distribution dist(numeric_min, 1); + rng.discard(n); + T value = dist(rng); + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return (std::sqrt(2.0) * erfinvf(2 * p - 1) + mean) * std; + } +}; + +template +class GPUTruncatedGaussianRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = static_cast(context.Attr("seed")); + if (seed == 0) { + std::random_device rd; + seed = rd(); + } + T mean = static_cast(context.Attr("mean")); + T std = static_cast(context.Attr("std")); + thrust::counting_iterator index_sequence_begin(0); + int64_t size = tensor->numel(); + thrust::transform( + index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + TruncatedNormal(mean, std, std::numeric_limits::min(), seed)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL( + truncated_gaussian_random, + paddle::operators::GPUTruncatedGaussianRandomKernel); diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc index 791138a8c0eb3c477942a8b723206a8f8a3eac77..16eac1ec2406c147fa765bc014038ae03a1416b2 100644 --- a/paddle/fluid/operators/while_op.cc +++ b/paddle/fluid/operators/while_op.cc @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
@@ -138,6 +138,10 @@ class WhileGradOp : public framework::OperatorBase {
         auto inside_og_name = inside_og_names[i];
         VLOG(8) << "Linking outside " << outside_og_name << " --> inside "
                 << inside_og_name;
+        if (scope.FindVar(outside_og_name) == nullptr) {
+          continue;
+        }
+
         auto &og_outside =
             detail::Ref(scope.FindVar(outside_og_name),
                         "Cannot find Outside Gradient %s", outside_og_name);
@@ -167,20 +171,46 @@ class WhileGradOp : public framework::OperatorBase {
             PADDLE_ENFORCE_EQ(inside_array[j].numel(), 0);
           }
         }
+      } else {
+        PADDLE_THROW("Currently only support LoDTensor and LoDTensorArray.");
       }
     }
     executor.RunPreparedContext(ctx.get(), *cur_scope_iter, false, true,
                                 true);
 
-    auto &pg_names = Outputs(kXGRAD);
+    // Outputs(kXGRAD) contains the names of the gradients of both
+    // parameters and inputs.
+    auto &pg_ig_names = Outputs(kXGRAD);
     auto &p_names = Inputs(kX);
-    PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
-    for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
-      if (pg_names[param_id] == framework::kEmptyVarName) {
+    PADDLE_ENFORCE_EQ(pg_ig_names.size(), p_names.size());
+    for (size_t param_id = 0; param_id < pg_ig_names.size(); ++param_id) {
+      if (pg_ig_names[param_id] == framework::kEmptyVarName) {
         continue;  // parameter doesn't have gradient
       }
      auto inside_grad_name = framework::GradVarName(p_names[param_id]);
 
+      // Some grad ops have inputs without gradients; for example, in
+      // lookup_table_grad_op the input (Idx) has no gradient.
+      auto pg_ig_var = cur_scope.FindVar(inside_grad_name);
+      PADDLE_ENFORCE(pg_ig_var != nullptr);
+      if (pg_ig_var->IsType<framework::LoDTensorArray>()) {
+        auto pg_ig_lod_t_arr =
+            pg_ig_var->GetMutable<framework::LoDTensorArray>();
+        bool empty = true;
+        for (auto &each : *pg_ig_lod_t_arr) {
+          if (each.numel() != 0) {
+            empty = false;
+            break;
+          }
+        }
+        if (empty) {
+          LOG(WARNING) << pg_ig_names[param_id]
+                       << " is not found in cur_scope.";
+          continue;
+        }
+      }
+
       // // TODO(tonyyang-svail): Not sure we need the following
       // // If does not compute gradient of that variable inside rnn,
       // just
@@ -194,6 +224,11 @@
       if (cur_scope_iter == step_scopes->rbegin()) {
         auto *var = (*cur_scope_iter)->FindVar(inside_grad_name);
         PADDLE_ENFORCE_NOT_NULL(var, "Can not find var %s", inside_grad_name);
+        PADDLE_ENFORCE(var->IsType<framework::LoDTensorArray>() ||
+                           var->IsType<LoDTensor>(),
+                       "Currently the type of var only can be LoDTensorArray "
+                       "or LoDTensor.");
+
         if (var->IsType<LoDTensor>()) {
           auto &inside_tensor = var->Get<framework::LoDTensor>();
           framework::AttributeMap attrs;
@@ -201,7 +236,7 @@
           attrs["shape"] = framework::vectorize2int(inside_tensor.dims());
           attrs["value"] = 0.0f;
 
-          auto var_name = pg_names[param_id];
+          auto var_name = pg_ig_names[param_id];
           auto zero_op = framework::OpRegistry::CreateOp(
               "fill_constant", framework::VariableNameMap{},
               {{"Out", {var_name}}}, attrs);
@@ -213,8 +248,8 @@
       }
       auto new_inside_name = cur_scope.Rename(inside_grad_name);
       auto sum_op = framework::OpRegistry::CreateOp(
-          "sum", {{"X", {pg_names[param_id], new_inside_name}}},
-          {{"Out", {pg_names[param_id]}}},
+          "sum", {{"X", {pg_ig_names[param_id], new_inside_name}}},
+          {{"Out", {pg_ig_names[param_id]}}},
          framework::AttributeMap{{"use_mkldnn", {false}}});
       sum_op->Run(cur_scope, dev_place);
       cur_scope.Rename(new_inside_name, inside_grad_name);
@@ -281,6 +316,7 @@
            parent_block->FindVarRecursive(input_name) != nullptr)) {
         continue;
       }
+      output_grads.insert(input_name);
     }
 
     for (auto &output_name : op->OutputArgumentNames()) {
@@ -309,13 +345,13 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference {
   void operator()(const framework::OpDesc &op_desc,
                   framework::BlockDesc *block) const override {
     auto p_names = op_desc.Input(kX);
-    auto pg_names = op_desc.Output(framework::GradVarName(kX));
+    auto pg_ig_names = op_desc.Output(framework::GradVarName(kX));
 
     for (size_t i = 0; i < p_names.size(); ++i) {
       auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
-      auto *g_var = block->FindVarRecursive(pg_names[i]);
+      auto *g_var = block->FindVarRecursive(pg_ig_names[i]);
       if (g_var != nullptr) {  // Gradient could be @EMPTY@
-        VLOG(5) << "Setting " << pg_names[i] << " following " << p_names[i]
+        VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i]
                 << " type: " << p_var.GetType();
         g_var->SetType(p_var.GetType());
         g_var->SetDataType(p_var.GetDataType());
@@ -333,21 +369,21 @@
     ctx->HasInputs(framework::GradVarName(kOutputs));
 
     auto p_names = ctx->Inputs(kX);
-    auto pg_names = ctx->Outputs(kXGRAD);
+    auto pg_ig_names = ctx->Outputs(kXGRAD);
     auto var_types = ctx->GetInputsVarType(kX);
     std::vector<std::string> names_to_set;
     std::vector<framework::DDim> dims_to_set;
     for (size_t i = 0; i < p_names.size(); ++i) {
-      if (pg_names[i] == framework::kEmptyVarName) {
+      if (pg_ig_names[i] == framework::kEmptyVarName) {
         continue;
       }
       auto dims = ctx->GetInputsElementDim(kX, i);
       if (var_types[i] == framework::proto::VarType::LOD_TENSOR) {
-        names_to_set.push_back(pg_names[i]);
+        names_to_set.push_back(pg_ig_names[i]);
         dims_to_set.push_back(dims);
       } else if (var_types[i] == framework::proto::VarType::LOD_TENSOR_ARRAY) {
         // not sure how to set the dim of LOD_TENSOR_ARRAY
-        names_to_set.push_back(pg_names[i]);
+        names_to_set.push_back(pg_ig_names[i]);
         dims_to_set.push_back(dims);
       }
     }
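In the while_op hunks above, a parameter's gradient can be legitimately missing for a step: the outside gradient variable may not exist at all (the new `FindVar == nullptr` check), or it may exist only as an all-empty `LoDTensorArray` (the new `empty` scan). Both cases are now skipped instead of failing, and the existing `fill_constant`/`sum` pair accumulates whatever per-step gradients remain. A rough NumPy model of that accumulation (the scope-renaming machinery is elided):

```python
import numpy as np

def accumulate_while_grad(step_grads, shape):
    """Mimics WhileGradOp's per-parameter accumulation across time steps.

    step_grads: one entry per executed step; None stands for a step whose
    sub-block produced no gradient for this parameter (and is skipped).
    """
    total = None
    for g in step_grads:
        if g is None:                   # like the empty-LoDTensorArray check
            continue
        if total is None:               # first step: fill_constant(0.0)
            total = np.zeros(shape, dtype='float32')
        total = total + g               # the inserted sum_op
    return total

grads = [np.ones((2, 2), 'float32'), None, 2 * np.ones((2, 2), 'float32')]
print(accumulate_while_grad(grads, (2, 2)))  # [[3. 3.] [3. 3.]]
```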
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 8bc30fc123163983f4bddc19af489920db93e0c0..1d081f89c491ae33e34ce282d7966ad6fe1cd51f 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -396,11 +396,6 @@ All parameter, weight, gradient are variables in Paddle.
         Prune(*prog_with_targets.Proto(), &pruned_desc);
         return new ProgramDesc(pruned_desc);
       });
-  m.def("inference_optimize", [](ProgramDesc &origin) {
-    proto::ProgramDesc pruned_desc;
-    InferenceOptimize(*(origin.Proto()), &pruned_desc);
-    return new ProgramDesc(pruned_desc);
-  });
   m.def("empty_var_name",
         []() { return std::string(framework::kEmptyVarName); });
   m.def("grad_var_suffix",
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 77b9b36e68c88eab35bcc1a88ce08a7b5940d55f..f50a68c54114d5cce15418ad22f38c83163ba866 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -108,7 +108,15 @@ function cmake_gen() {
             fi
         fi
     fi
-
+
+    if [ "$SYSTEM" == "Darwin" ]; then
+        WITH_DISTRIBUTE=${WITH_DISTRIBUTE:-ON}
+        WITH_AVX=${WITH_AVX:-ON}
+        INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-~/.cache/inference_demo}
+    else
+        INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo}
+    fi
+
     cat <<EOF
diff --git a/python/requirements.txt b/python/requirements.txt
--- a/python/requirements.txt
+++ b/python/requirements.txt
 numpy>=1.12,<=1.14  # TODO: change to ">=1.12" when numpy fixes the bug in 1.15 and higher versions
 protobuf==3.1
 recordio>=0.1.0
-matplotlib
+matplotlib==2.2.3  # TODO: let the python3 paddlepaddle package use the latest matplotlib
 rarfile
 scipy>=0.19.0
 Pillow