diff --git a/cmake/version.cmake b/cmake/version.cmake
index cde650128a068faf32f4abfff5cdfdeb656d8577..79b8e8ac496250d85427b77fbd6a9924a962a15b 100644
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -1,16 +1,21 @@
 # Get the latest git tag.
 set(PADDLE_VERSION $ENV{PADDLE_VERSION})
 set(tmp_version "HEAD")
+set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
+set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+")
 while ("${PADDLE_VERSION}" STREQUAL "")
   execute_process(
-    COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version}
+    COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version}
     WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
     OUTPUT_VARIABLE GIT_TAG_NAME
     RESULT_VARIABLE GIT_RESULT
     ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
   if (NOT ${GIT_RESULT})
     # Check the tag is a correct version
-    if (${GIT_TAG_NAME} MATCHES "v[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
+    if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
+      # if no tag was found, set PADDLE_VERSION to latest
+      set(PADDLE_VERSION "latest")
+    elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
       string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
     else()  # otherwise, get the previous git tag name.
       set(tmp_version "${GIT_TAG_NAME}~1")
diff --git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc
index f1261dee0319440995951d1bee145404186a8ad4..af75baa5c4b98f7d092834c05eb57e9c7e131b29 100644
--- a/paddle/fluid/framework/op_info.cc
+++ b/paddle/fluid/framework/op_info.cc
@@ -21,8 +21,8 @@ namespace framework {
 // a static local variable is already being initialized.
 // https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
 OpInfoMap& OpInfoMap::Instance() {
-  static OpInfoMap* g_op_info_map = new OpInfoMap();
-  return *g_op_info_map;
+  static OpInfoMap g_op_info_map;
+  return g_op_info_map;
 }
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index 4194ba197948b47003863196efdac1c08a7ae4f6..01a8501dd4abe73cbc71dc4c08734cae66df08ef 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -19,8 +19,9 @@ namespace paddle {
 namespace memory {
 namespace detail {
 
-BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator,
-                               size_t min_chunk_size, size_t max_chunk_size)
+BuddyAllocator::BuddyAllocator(
+    std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
+    size_t max_chunk_size)
     : min_chunk_size_(min_chunk_size),
       max_chunk_size_(max_chunk_size),
       cache_(system_allocator->UseGpu()),
diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h
index 2f39d774d6fb6a2bc37877eb2f8b90bebd3cda28..f0c83efc23ce39c4fc89296d672e1e55751851bf 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <mutex>  // NOLINT
 #include <set>
 #include <tuple>
@@ -32,8 +33,8 @@ namespace detail {
 
 class BuddyAllocator {
  public:
-  BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size,
-                 size_t max_chunk_size);
+  BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator,
+                 size_t min_chunk_size, size_t max_chunk_size);
 
   ~BuddyAllocator();
 
@@ -103,7 +104,7 @@ class BuddyAllocator {
 
  private:
   /*! Allocate CPU/GPU memory from system */
-  SystemAllocator* system_allocator_;
+  std::unique_ptr<SystemAllocator> system_allocator_;
   std::mutex mutex_;
 };
 
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
index bd98ed81899440a46415d30b6d74fec2dac4c155..7c800b3c164049244770ceb2070b177d8307e85e 100644
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <vector>
+
 #include "paddle/fluid/memory/malloc.h"
 
 #include "glog/logging.h"
@@ -34,12 +36,15 @@ namespace memory {
 using BuddyAllocator = detail::BuddyAllocator;
 
 BuddyAllocator* GetCPUBuddyAllocator() {
+  static std::once_flag init_flag;
   static detail::BuddyAllocator* a = nullptr;
-  if (a == nullptr) {
-    a = new detail::BuddyAllocator(new detail::CPUAllocator,
-                                   platform::CpuMinChunkSize(),
-                                   platform::CpuMaxChunkSize());
-  }
+
+  std::call_once(init_flag, []() {
+    a = new detail::BuddyAllocator(
+        std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
+        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
+  });
+
   return a;
 }
 
@@ -68,27 +73,33 @@ size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
 #ifdef PADDLE_WITH_CUDA
 
 BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
-  static BuddyAllocator** as = NULL;
-  if (as == NULL) {
+  static std::once_flag init_flag;
+  static detail::BuddyAllocator** a_arr = nullptr;
+
+  std::call_once(init_flag, [gpu_id]() {
     int gpu_num = platform::GetCUDADeviceCount();
-    as = new BuddyAllocator*[gpu_num];
-    for (int gpu = 0; gpu < gpu_num; gpu++) {
-      as[gpu] = nullptr;
+    PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id,
+                   gpu_num);
+
+    a_arr = new BuddyAllocator*[gpu_num];
+    for (int i = 0; i < gpu_num; i++) {
+      a_arr[i] = nullptr;
+      platform::SetDeviceId(i);
+      a_arr[i] = new BuddyAllocator(
+          std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)),
+          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+      VLOG(10) << "\n\nNOTE: each GPU device use "
+               << FLAGS_fraction_of_gpu_memory_to_use * 100
+               << "% of GPU memory.\n"
+               << "You can set GFlags environment variable '"
+               << "FLAGS_fraction_of_gpu_memory_to_use"
+               << "' to change the fraction of GPU usage.\n\n";
     }
-  }
+  });
+
   platform::SetDeviceId(gpu_id);
-  if (!as[gpu_id]) {
-    as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator(gpu_id),
-                                    platform::GpuMinChunkSize(),
-                                    platform::GpuMaxChunkSize());
-    VLOG(10) << "\n\nNOTE: each GPU device use "
-             << FLAGS_fraction_of_gpu_memory_to_use * 100
-             << "% of GPU memory.\n"
-             << "You can set GFlags environment variable '"
-             << "FLAGS_fraction_of_gpu_memory_to_use"
-             << "' to change the fraction of GPU usage.\n\n";
-  }
-  return as[gpu_id];
+  return a_arr[gpu_id];
 }
 
 template <>
@@ -125,12 +136,16 @@ void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
 }
 
 BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
-  static BuddyAllocator* ba = NULL;
-  if (ba == NULL) {
-    ba = new BuddyAllocator(new detail::CUDAPinnedAllocator,
+  static std::once_flag init_flag;
+  static BuddyAllocator* ba = nullptr;
+
+  std::call_once(init_flag, []() {
+    ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
+                                new detail::CUDAPinnedAllocator),
                             platform::CUDAPinnedMinChunkSize(),
                             platform::CUDAPinnedMaxChunkSize());
-  }
+  });
+
   return ba;
 }
 
diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc
index 8cc1d94260baccfe28d213b7e021956819e2e79e..580fde753816c30b188b8a99cc63fcbafde64e25 100644
--- a/paddle/fluid/operators/conditional_block_op.cc
+++ b/paddle/fluid/operators/conditional_block_op.cc
@@ -205,9 +205,10 @@ class ConditionalBlockGradInferShape : public framework::InferShapeBase {
       context->SetOutputsDim(framework::GradVarName("Params"),
                              context->GetInputsDim("Params"));
     }
-    PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("X")));
-    context->SetOutputsDim(framework::GradVarName("X"),
-                           context->GetInputsDim("X"));
+    if (context->HasOutputs(framework::GradVarName("X"))) {
+      context->SetOutputsDim(framework::GradVarName("X"),
+                             context->GetInputsDim("X"));
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
index a16861b3b77fc980ab932b9d88859b38ec36108b..2dc1467b0d4816d5cc0535eb62e936cf342a241c 100644
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
@@ -44,8 +44,10 @@ class MergeLoDTensorOp : public framework::OperatorBase {
         scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
     auto level = static_cast<size_t>(Attr<int>("level"));
 
-    auto &mask_dim = mask.dims();
+    PADDLE_ENFORCE(in_true.numel() || in_false.numel(),
+                   "Input(InTrue) or Input(InFalse) should be initialized.");
 
+    auto &mask_dim = mask.dims();
     std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
     if (platform::is_cpu_place(mask.place())) {
       cpu_mask->ShareDataWith(mask);
@@ -59,19 +61,27 @@ class MergeLoDTensorOp : public framework::OperatorBase {
     }
     auto *mask_data = cpu_mask->data<bool>();
 
-    int rank = in_true.dims().size();
-    platform::Place place = in_true.place();
-    std::type_index data_type = in_true.type();
-    framework::DDim in_true_dims =
-        framework::slice_ddim(in_true.dims(), 1, rank);
-
+    platform::Place place = dev_place;
     int64_t batch_size = in_true.dims()[0] + in_false.dims()[0];
 
-    auto in_true_dim_vec = framework::vectorize(in_true_dims);
-    in_true_dim_vec.insert(in_true_dim_vec.begin(), batch_size);
+    std::type_index data_type =
+        in_true.IsInitialized() ? in_true.type() : in_false.type();
+    int rank;
+    framework::DDim in_dims;
+    if (in_true.IsInitialized()) {
+      rank = in_true.dims().size();
+      in_dims = framework::slice_ddim(in_true.dims(), 1, rank);
+    } else {
+      rank = in_false.dims().size();
+      in_dims = framework::slice_ddim(in_false.dims(), 1, rank);
+    }
+
+    auto in_dim_vec = framework::vectorize(in_dims);
+    in_dim_vec.insert(in_dim_vec.begin(), batch_size);
 
-    framework::DDim out_dims = framework::make_ddim(in_true_dim_vec);
+    framework::DDim out_dims = framework::make_ddim(in_dim_vec);
     out->Resize(out_dims);
+
     out->mutable_data(place, data_type);
 
     auto *out_lod = out->mutable_lod();
diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index 07555e98c3118b47153eb8dc84ebc74d4408c2a6..1b656ea138ea9623e4ee45313342a3d1bd9e2b6f 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -50,14 +50,14 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
 
   static framework::DDim GetOutputShape(const std::vector<int> squeeze_dims,
                                         const framework::DDim &in_dims) {
-    int num_squeeze_dims = squeeze_dims.size();
+    int num_squeeze_dims = static_cast<int>(squeeze_dims.size());
     int cnt_squeezed_dims = 0;
     bool should_squeeze[9] = {false};
 
     // Determines number of dimensions of output tensor after squeeze.
     // Mark and count the dimensions need to be squeezed
     if (num_squeeze_dims == 0) {
-      for (int idx = 0; idx < in_dims.size(); ++idx) {
+      for (int idx = 0; idx < static_cast<int>(in_dims.size()); ++idx) {
         if (in_dims[idx] == 1) {
           should_squeeze[idx] = true;
           ++cnt_squeezed_dims;
@@ -84,7 +84,8 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
 
     // Make output dimensions
     std::vector<int64_t> output_shape(in_dims.size() - cnt_squeezed_dims, 0);
-    for (int in_idx = 0, out_idx = 0; in_idx < in_dims.size(); ++in_idx) {
+    for (int in_idx = 0, out_idx = 0; in_idx < static_cast<int>(in_dims.size());
+         ++in_idx) {
       if (!should_squeeze[in_idx]) {
         output_shape[out_idx++] = in_dims[in_idx];
       }
@@ -151,6 +152,8 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
         Case 2:
           Given
             X.shape = (1, 3, 1, 5)
+          and 
+            axes = []
           we get:
             Out.shape = (3, 5)
     )DOC");
diff --git a/python/paddle/fluid/tests/test_mnist_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py
similarity index 66%
rename from python/paddle/fluid/tests/test_mnist_if_else_op.py
rename to python/paddle/fluid/tests/test_if_else_op.py
index d34f52db5ffc889f17513d034ad2c99f696b0cdf..1b58925599de62510ea9048f5210bb0b7e49f933 100644
--- a/python/paddle/fluid/tests/test_mnist_if_else_op.py
+++ b/python/paddle/fluid/tests/test_if_else_op.py
@@ -14,10 +14,11 @@
 
 import paddle
 import paddle.fluid.layers as layers
-from paddle.fluid.framework import Program, program_guard, default_main_program, default_startup_program
+from paddle.fluid.framework import Program, program_guard
 from paddle.fluid.executor import Executor
 from paddle.fluid.optimizer import MomentumOptimizer
 import paddle.fluid.core as core
+import paddle.fluid as fluid
 import unittest
 import numpy as np
 
@@ -31,14 +32,13 @@ class TestMNISTIfElseOp(unittest.TestCase):
 
             label = layers.data(name='y', shape=[1], dtype='int64')
 
-            limit = layers.fill_constant_batch_size_like(
-                input=label, dtype='int64', shape=[1], value=5.0)
+            limit = layers.fill_constant(shape=[1], dtype='int64', value=5)
             cond = layers.less_than(x=label, y=limit)
             true_image, false_image = layers.split_lod_tensor(
                 input=image, mask=cond)
 
             true_out = layers.create_tensor(dtype='float32')
-            true_cond = layers.ConditionalBlock([true_image])
+            true_cond = layers.ConditionalBlock([cond])
 
             with true_cond.block():
                 hidden = layers.fc(input=true_image, size=100, act='tanh')
@@ -46,7 +46,7 @@ class TestMNISTIfElseOp(unittest.TestCase):
                 layers.assign(input=prob, output=true_out)
 
             false_out = layers.create_tensor(dtype='float32')
-            false_cond = layers.ConditionalBlock([false_image])
+            false_cond = layers.ConditionalBlock([cond])
 
             with false_cond.block():
                 hidden = layers.fc(input=false_image, size=200, act='tanh')
@@ -64,7 +64,7 @@ class TestMNISTIfElseOp(unittest.TestCase):
         train_reader = paddle.batch(
             paddle.reader.shuffle(
                 paddle.dataset.mnist.train(), buf_size=8192),
-            batch_size=200)
+            batch_size=10)
 
         place = core.CPUPlace()
         exe = Executor(place)
@@ -94,8 +94,7 @@ class TestMNISTIfElseOp(unittest.TestCase):
 
             label = layers.data(name='y', shape=[1], dtype='int64')
 
-            limit = layers.fill_constant_batch_size_like(
-                input=label, dtype='int64', shape=[1], value=5.0)
+            limit = layers.fill_constant(shape=[1], dtype='int64', value=5)
             cond = layers.less_than(x=label, y=limit)
             ie = layers.IfElse(cond)
 
@@ -125,7 +124,7 @@ class TestMNISTIfElseOp(unittest.TestCase):
         place = core.CPUPlace()
         exe = Executor(place)
 
-        exe.run(kwargs['startup_program'])
+        exe.run(startup_prog)
         PASS_NUM = 100
         for pass_id in range(PASS_NUM):
             for data in train_reader():
@@ -133,7 +132,7 @@ class TestMNISTIfElseOp(unittest.TestCase):
                 y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                 y_data = y_data.reshape((y_data.shape[0], 1))
 
-                outs = exe.run(kwargs['main_program'],
+                outs = exe.run(prog,
                                feed={'x': x_data,
                                      'y': y_data},
                                fetch_list=[avg_loss])
@@ -143,6 +142,67 @@ class TestMNISTIfElseOp(unittest.TestCase):
         self.assertFalse(True)
 
 
+class TestIfElse(unittest.TestCase):
+    def set_test_case(self):
+        # condiction is: self.data < self.cond_value
+        self.cond_value = 0.5
+        self.data = np.random.rand(25, 1).astype(np.float32)
+
+    def compare_ifelse_op_and_numpy(self, place):
+        self.set_test_case()
+
+        prog = Program()
+        startup_prog = Program()
+        with program_guard(prog, startup_prog):
+            src = layers.data(name='data', shape=[1], dtype='float32')
+            cond = layers.fill_constant(
+                [1], dtype='float32', value=self.cond_value)
+            ifcond = layers.less_than(x=src, y=cond)
+            ie = layers.IfElse(ifcond)
+            with ie.true_block():
+                true_target = ie.input(src)
+                ie.output(true_target)
+
+            with ie.false_block():
+                false_target = ie.input(src)
+                ie.output(false_target)
+            if_out = ie()
+            out = layers.reduce_sum(if_out)
+
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            fetch_list = [out]
+            o1, = exe.run(fluid.default_main_program(),
+                          feed={'data': self.data},
+                          fetch_list=[out])
+            o2 = np.sum(self.data)
+            self.assertTrue(
+                np.allclose(
+                    o1, o2, atol=1e-8),
+                "IfElse result : " + str(o1) + "\n Numpy result :" + str(o2))
+
+    def test_cpu(self):
+        self.compare_ifelse_op_and_numpy(fluid.CPUPlace())
+
+    def test_cuda(self):
+        if not core.is_compiled_with_cuda():
+            return
+        self.compare_ifelse_op_and_numpy(fluid.CUDAPlace(0))
+
+
+class TestIfElseTrueBranch(TestIfElse):
+    def set_test_case(self):
+        # condiction is: self.data < self.cond_value
+        self.cond_value = 10.
+        self.data = np.random.rand(25, 1).astype(np.float32)
+
+
+class TestIfElseFalseBranch(TestIfElse):
+    def set_test_case(self):
+        # condiction is: self.data < self.cond_value
+        self.cond_value = -10.
+        self.data = np.random.rand(25, 1).astype(np.float32)
+
+
 if __name__ == '__main__':
-    # temp disable if else unittest since it could be buggy.
-    exit(0)
+    unittest.main()
diff --git a/python/setup.py.in b/python/setup.py.in
index 5506443733650631fe045be3f701a41519352e8d..a0cb39070bf7a89e3ea4cb1d31f54f919d6ff74e 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -1,16 +1,13 @@
 from setuptools import setup, Distribution, Extension
 import subprocess
-import shutil
 import os
+import re
+import shutil
 class BinaryDistribution(Distribution):
     def has_ext_modules(foo):
         return True
 
-MAJOR   = 0
-MINOR   = 14
-PATCH   = 0
 RC      = 0
-ISTAGED = False
 
 
 
@@ -22,14 +19,47 @@ def git_commit():
         git_commit = 'Unknown'
     return git_commit
 
+def _get_version_detail(idx):
+    assert idx < 3, "vesion info consists of %(major)d.%(minor)d.%(patch)d, \
+        so detail index must less than 3"
+
+    if re.match('@TAG_VERSION_REGEX@', '@PADDLE_VERSION@'):
+        version_details = '@PADDLE_VERSION@'.split('.')
+
+        if len(version_details) == 3:
+            return version_details[idx]
+
+    return 0
+
+def get_major():
+    return int(_get_version_detail(0))
+
+def get_minor():
+    return int(_get_version_detail(1))
+
+def get_patch():
+    return str(_get_version_detail(2))
+
+def is_taged():
+    try:
+        cmd = ['git', 'describe', '--exact-match', '--tags']
+        git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip()
+    except:
+        return False
+
+    if git_tag.replace('v', '') == '@PADDLE_VERSION@':
+        return True
+    else:
+        return False
+
 def write_version_py(filename='paddle/version.py'):
     cnt = '''
 # THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
 #
-full_version    = '%(major)d.%(minor)d.%(patch)d'
+full_version    = '%(major)d.%(minor)d.%(patch)s'
 major           = '%(major)d'
 minor           = '%(minor)d'
-patch           = '%(patch)d'
+patch           = '%(patch)s'
 rc              = '%(rc)d'
 istaged         = %(istaged)s
 commit          = '%(commit)s'
@@ -51,13 +81,13 @@ def mkl():
     commit = git_commit()
     with open(filename, 'w') as f:
         f.write(cnt % {
-            'major': MAJOR,
-            'minor': MINOR,
-            'patch': PATCH,
+            'major': get_major(),
+            'minor': get_minor(),
+            'patch': get_patch(),
             'rc': RC,
             'version': '${PADDLE_VERSION}',
             'commit': commit,
-            'istaged': ISTAGED,
+            'istaged': is_taged(),
             'with_mkl': '@WITH_MKL@'})
 
 write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py')