diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 59e6269ea04cf4dfeb2dddee1f256acf8b5a742a..638bd0db9d7025199c31a9327b96062512aa5adb 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace framework {
@@ -52,7 +53,9 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
 };
 
 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t> functor;
+  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t,
+                    platform::float16>
+      functor;
   size_t size = functor(type);
   PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
   return size;
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index c36bfad4bc155877f734f9faec9f56588206d284..cf6a4b09dbd2d5b7d22081ff4713e3e644f4800e 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -62,6 +62,7 @@ limitations under the License. */
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 
 namespace paddle {
+namespace platform {
 
 // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated
 // and aligned at least on a 2-byte boundary, which leads to efficient
@@ -71,11 +72,21 @@ struct PADDLE_ALIGN(2) float16 {
  public:
   uint16_t x;
 
-  // Constructors
-  HOSTDEVICE inline float16() : x(0) {}
+  // The following defaulted special class member functions
+  // are added to make float16 pass the std::is_trivial test
+  HOSTDEVICE inline float16() = default;
 
-  HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
+  HOSTDEVICE inline float16(const float16&) = default;
 
+  HOSTDEVICE inline float16& operator=(const float16&) = default;
+
+  HOSTDEVICE inline float16(float16&&) = default;
+
+  HOSTDEVICE inline float16& operator=(float16&&) = default;
+
+  HOSTDEVICE inline ~float16() = default;
+
+// Constructors
 #ifdef PADDLE_CUDA_FP16
   HOSTDEVICE inline explicit float16(const half& h) {
 #if CUDA_VERSION >= 9000
@@ -136,11 +147,6 @@ struct PADDLE_ALIGN(2) float16 {
   HOSTDEVICE inline explicit float16(const T& val)
       : x(float16(static_cast<float>(val)).x) {}
 
-  HOSTDEVICE inline float16& operator=(const float16& rhs) {
-    x = rhs.x;
-    return *this;
-  }
-
 // Assignment operators
 #ifdef PADDLE_CUDA_FP16
   HOSTDEVICE inline float16& operator=(const half& rhs) {
@@ -727,4 +733,25 @@ HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
   return float(a) >= float(b);
 }
 #endif
+
+}  // namespace platform
 }  // namespace paddle
+
+namespace std {
+
+// Specialize std::is_pod for float16. Compilers implement std::is_pod against
+// different C++ standards. Under C++11, float16 is plain old data because it
+// is both trivial and standard-layout. The host compiler used by nvcc 8.0
+// follows the stricter C++0x rule, which rejects any class that declares a
+// custom constructor, so the trait is overridden here with the C++11
+// definition to let .cu files compile. Deriving from integral_constant keeps
+// the full trait interface (::type, ::value_type, a defined ::value).
+// NOTE: the standard does not sanction specializing <type_traits> templates.
+template <>
+struct is_pod<paddle::platform::float16>
+    : integral_constant<
+          bool, is_trivial<paddle::platform::float16>::value &&
+                    is_standard_layout<paddle::platform::float16>::value> {
+};
+
+}  // namespace std
diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc
index bed29dbfa7ed57ac98ff9ce37945cc74a8968704..b716ad9df41330bd6e22937381d24e33fa3a7914 100644
--- a/paddle/fluid/platform/float16_test.cc
+++ b/paddle/fluid/platform/float16_test.cc
@@ -10,10 +10,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/framework/init.h"
+#include "paddle/fluid/framework/lod_tensor.h"
 
 #include <gtest/gtest.h>
 
 namespace paddle {
+namespace platform {
 
 TEST(float16, conversion_cpu) {
   // Explicit conversion from Eigen::half
@@ -54,13 +57,9 @@ TEST(float16, conversion_cpu) {
   EXPECT_EQ(float16(true).x, 0x3c00);
   EXPECT_EQ(float16(false).x, 0x0000);
 
-  // Default constructor
-  float16 v_def;
-  EXPECT_EQ(v_def.x, 0x0000);
-
   // Assignment operator
   float16 v_assign;
-  v_assign = v_def;
+  v_assign = float16(0);
   EXPECT_EQ(v_assign.x, 0x0000);
   v_assign = Eigen::half(1.0f);
   EXPECT_EQ(v_assign.x, 0x3c00);
@@ -116,4 +115,27 @@ TEST(float16, comparison_cpu) {
   EXPECT_FALSE(float16(-0.0f) > float16(0.0f));
 }
 
+// Checks that a CPU LoDTensor can allocate and store float16 elements.
+TEST(float16, lod_tensor_cpu) {
+  framework::LoDTensor lod_tensor;
+  std::vector<float16> input_data = {float16(1.0f), float16(0.5f),
+                                     float16(0.33333f), float16(0.0f)};
+  EXPECT_EQ(input_data[0].x, 0x3c00);
+  EXPECT_EQ(input_data[1].x, 0x3800);
+  EXPECT_EQ(input_data[2].x, 0x3555);
+  EXPECT_EQ(input_data[3].x, 0x0000);
+
+  lod_tensor.Resize({4, 1});
+  lod_tensor.set_lod(framework::LoD({{0, 2, 4}}));
+  float16* data_ptr = lod_tensor.mutable_data<float16>(CPUPlace());
+  // Fatal checks: the loop below writes through data_ptr for each input.
+  ASSERT_NE(data_ptr, nullptr);
+  ASSERT_EQ(input_data.size(), static_cast<size_t>(lod_tensor.numel()));
+  for (size_t i = 0; i < input_data.size(); ++i) {
+    data_ptr[i] = input_data[i];
+    EXPECT_EQ(data_ptr[i].x, input_data[i].x);
+  }
+}
+
+}  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
index 7e6c9f58aca3a73fa260be375275c8e4886d2133..567209df4edc483bcb5c6264c62034ddff50c413 100644
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -13,6 +13,8 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/utils/Logging.h"
 
 #define ARITHMETIC_KERNEL(op_type, sign)                                 \
@@ -108,6 +110,7 @@ limitations under the License. */
 
 #ifdef PADDLE_CUDA_FP16
 namespace paddle {
+namespace platform {
 
 #if CUDA_VERSION < 9000
 ARITHMETIC_KERNEL(Add, +)
@@ -209,5 +212,35 @@ TEST(float16, conversion_on_gpu) {
   EXPECT_EQ(v_assign.x, 0x3c00);
 }
 
+// Round-trips float16 data CPU -> GPU -> CPU through LoDTensor copies.
+TEST(float16, lod_tensor_on_gpu) {
+  framework::LoDTensor host_in;
+  framework::LoDTensor on_device;
+  framework::LoDTensor host_out;
+
+  // Fill a 2x2 host tensor with the reference values.
+  float16 expected[4] = {float16(1.0f), float16(0.5f), float16(0.33333f),
+                         float16(0.0f)};
+  float16* host_in_ptr = host_in.mutable_data<float16>(
+      framework::make_ddim({2, 2}), CPUPlace());
+  memcpy(host_in_ptr, expected, sizeof(expected));
+
+  // Host -> device, then device -> host, both issued through one context.
+  CUDAPlace device(0);
+  CUDADeviceContext device_ctx(device);
+  framework::TensorCopy(host_in, device, device_ctx, &on_device);
+  framework::TensorCopy(on_device, CPUPlace(), device_ctx, &host_out);
+
+  // Wait for the copies before reading the destination buffer.
+  device_ctx.Wait();
+  const float16* host_out_ptr = host_out.data<float16>();
+  ASSERT_NE(host_in_ptr, host_out_ptr);
+  const size_t count = sizeof(expected) / sizeof(expected[0]);
+  for (size_t i = 0; i < count; ++i) {
+    EXPECT_EQ(host_in_ptr[i].x, host_out_ptr[i].x);
+  }
+}
+
+}  // namespace platform
 }  // namespace paddle
 #endif  // PADDLE_CUDA_FP16