diff --git a/paddle/fluid/operators/temporal_shift_op.cu b/paddle/fluid/operators/temporal_shift_op.cu
index b61d9aeff7d4c2b92c4861444e8c4d1bb5d9d1cc..4f2d7ce3cff9e18b0d0a247d892f834984d9ad3c 100644
--- a/paddle/fluid/operators/temporal_shift_op.cu
+++ b/paddle/fluid/operators/temporal_shift_op.cu
@@ -33,8 +33,8 @@ __global__ void KeTemporalShiftFw(const T* input, T* output, const int ntchw,
   int ih = (tid % hw) / w;
   int iw = tid % w;
 
-  const int c1 = static_cast<T>(c * shift_ratio);
-  const int c2 = static_cast<T>(c * 2 * shift_ratio);
+  const int c1 = static_cast<int>(c * shift_ratio);
+  const int c2 = static_cast<int>(c * 2 * shift_ratio);
 
   if (ic < c1) {
     src_it = it - 1;
@@ -69,8 +69,8 @@ __global__ void KeTemporalShiftBw(const T* output_grad, T* input_grad,
   int ih = (tid % hw) / w;
   int iw = tid % w;
 
-  const int c1 = static_cast<T>(c * shift_ratio);
-  const int c2 = static_cast<T>(c * 2 * shift_ratio);
+  const int c1 = static_cast<int>(c * shift_ratio);
+  const int c2 = static_cast<int>(c * 2 * shift_ratio);
 
   if (ic < c1) {
     src_it = it - 1;
@@ -163,8 +163,11 @@ class TemporalShiftGradOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(temporal_shift, ops::TemporalShiftOpCUDAKernel<float>,
-                        ops::TemporalShiftOpCUDAKernel<double>);
-REGISTER_OP_CUDA_KERNEL(temporal_shift_grad,
-                        ops::TemporalShiftGradOpCUDAKernel<float>,
-                        ops::TemporalShiftGradOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(
+    temporal_shift, ops::TemporalShiftOpCUDAKernel<float>,
+    ops::TemporalShiftOpCUDAKernel<double>,
+    ops::TemporalShiftOpCUDAKernel<paddle::platform::float16>);
+REGISTER_OP_CUDA_KERNEL(
+    temporal_shift_grad, ops::TemporalShiftGradOpCUDAKernel<float>,
+    ops::TemporalShiftGradOpCUDAKernel<double>,
+    ops::TemporalShiftGradOpCUDAKernel<paddle::platform::float16>);
diff --git a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
index 12eec2073b3d041d58cee19226e7dc05ae03d4d0..050c38e5499be8628ebefbc1d13aba81dde195c6 100644
--- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
+++ b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
@@ -40,7 +40,7 @@ class TestTemporalShift(OpTest):
     def setUp(self):
         self.initTestCase()
         self.op_type = 'temporal_shift'
-        x = np.random.random(self.x_shape).astype('float64')
+        x = np.random.random(self.x_shape).astype(self.dtype)
 
         self.attrs = {
             "seg_num": self.seg_num,
@@ -62,6 +62,7 @@ class TestTemporalShift(OpTest):
         self.x_shape = (6, 4, 4, 4)
         self.seg_num = 3
         self.shift_ratio = 0.25
+        self.dtype = 'float64'
 
 
 class TestTemporalShift2(TestTemporalShift):
@@ -78,6 +79,26 @@ class TestTemporalShift3(TestTemporalShift):
         self.shift_ratio = 0.3
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestTemporalShiftFP16(TestTemporalShift):
+    def initTestCase(self):
+        self.x_shape = (3, 10, 5, 5)
+        self.seg_num = 1
+        self.shift_ratio = 0.3
+        self.dtype = 'float16'
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_output_with_place(place)
+
+    def test_check_grad_ignore_uv(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_grad_with_place(place, ['X'], 'Out')
+
+
 class TestTemporalShiftAPI(unittest.TestCase):
     def test_api(self):
         input = paddle.randn([6, 4, 2, 2])
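
Note (not part of the patch): a minimal usage sketch of the float16 path this change enables, assuming a CUDA-enabled build of Paddle. paddle.nn.functional.temporal_shift is the public API that reaches this CUDA kernel; the channel-split indices c1/c2 are now computed with static_cast<int>, so they stay exact even when T is float16.

import paddle
import paddle.nn.functional as F

paddle.set_device('gpu')  # the float16 kernels are registered for CUDA only
# (N*T, C, H, W) input with N=2 clips of T=seg_num=3 frames each
x = paddle.randn([6, 4, 2, 2]).astype('float16')
out = F.temporal_shift(x, seg_num=3, shift_ratio=0.25)
print(out.dtype)  # paddle.float16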