diff --git a/deploy/pphuman/pipeline.py b/deploy/pphuman/pipeline.py
index 47928b7e859b5bbbbc0abdd4de60a0c48154a8c5..e40b961829ae85b77ed98f7aeb6a57aa6255a3d0 100644
--- a/deploy/pphuman/pipeline.py
+++ b/deploy/pphuman/pipeline.py
@@ -42,7 +42,7 @@
 from python.action_utils import KeyPointBuff, SkeletonActionVisualHelper
 from pipe_utils import argsparser, print_arguments, merge_cfg, PipeTimer
 from pipe_utils import get_test_images, crop_image_with_det, crop_image_with_mot, parse_mot_res, parse_mot_keypoint
-from python.preprocess import decode_image
+from python.preprocess import decode_image, ShortSizeScale
 from python.visualize import visualize_box_mask, visualize_attr, visualize_pose, visualize_action
 
 from pptracking.python.mot_sde_infer import SDE_Detector
@@ -554,6 +554,10 @@ class PipePredictor(object):
 
         video_action_imgs = []
 
+        if self.with_video_action:
+            short_size = self.cfg["VIDEO_ACTION"]["short_size"]
+            scale = ShortSizeScale(short_size)
+
         while (1):
             if frame_id % 10 == 0:
                 print('frame id: ', frame_id)
@@ -705,7 +709,9 @@ class PipePredictor(object):
 
                     # collect frames
                     if frame_id % sample_freq == 0:
-                        video_action_imgs.append(frame)
+                        # scale the short side of the frame before buffering it
+                        scaled_img = scale(frame)
+                        video_action_imgs.append(scaled_img)
 
                     # the number of collected frames is enough to predict video action
                     if len(video_action_imgs) == frame_len:
diff --git a/deploy/python/preprocess.py b/deploy/python/preprocess.py
index 315364775850dd2e19d59f226dc896cc933a328a..4703243b861d616d440e785ae59f2564086bfb13 100644
--- a/deploy/python/preprocess.py
+++ b/deploy/python/preprocess.py
@@ -15,6 +15,7 @@
 import cv2
 import numpy as np
 from keypoint_preprocess import get_affine_transform
+from PIL import Image
 
 
 def decode_image(im_file, im_info):
@@ -106,6 +107,95 @@ class Resize(object):
         return im_scale_y, im_scale_x
 
 
+class ShortSizeScale(object):
+    """
+    Scale an image so that its short side matches `short_size`.
+    Args:
+        short_size(float | int): The short side of the image is scaled to this value.
+        fixed_ratio(bool): Whether to force a fixed 4:3 output ratio. default: True
+        keep_ratio(bool): Whether to keep the original aspect ratio; cannot be
+            enabled together with fixed_ratio. default: None
+        do_round(bool): Whether to round (rather than truncate) when computing
+            the scaled size. default: False
+        backend(str): Use 'pillow' or 'cv2' as the image processing backend. default: 'pillow'
+    """
+
+    def __init__(self,
+                 short_size,
+                 fixed_ratio=True,
+                 keep_ratio=None,
+                 do_round=False,
+                 backend='pillow'):
+        self.short_size = short_size
+        assert (fixed_ratio and not keep_ratio) or (
+            not fixed_ratio
+        ), "fixed_ratio and keep_ratio cannot be true at the same time"
+        self.fixed_ratio = fixed_ratio
+        self.keep_ratio = keep_ratio
+        self.do_round = do_round
+
+        assert backend in [
+            'pillow', 'cv2'
+        ], f"Scale's backend must be pillow or cv2, but got {backend}"
+
+        self.backend = backend
+
+    def __call__(self, img):
+        """
+        Performs the resize operation.
+        Args:
+            img (PIL.Image | np.ndarray): the image to be scaled.
+        Returns:
+            resized_img: a PIL.Image (np.ndarray for the cv2 backend) after scaling.
+        """
+
+        result_img = None
+
+        if isinstance(img, np.ndarray):
+            h, w, _ = img.shape
+        elif isinstance(img, Image.Image):
+            w, h = img.size
+        else:
+            raise NotImplementedError
+
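+        # Three sizing modes (w <= h case shown; the other branch mirrors it):
+        #   fixed_ratio=True:  short side -> short_size, long side -> short_size * 4/3
+        #   keep_ratio truthy: short side -> short_size, aspect ratio preserved
+        #   neither:           both sides -> short_size (square output)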
+ """ + + result_img = None + + if isinstance(img, np.ndarray): + h, w, _ = img.shape + elif isinstance(img, Image.Image): + w, h = img.size + else: + raise NotImplementedError + + if w <= h: + ow = self.short_size + if self.fixed_ratio: # default is True + oh = int(self.short_size * 4.0 / 3.0) + elif not self.keep_ratio: # no + oh = self.short_size + else: + scale_factor = self.short_size / w + oh = int(h * float(scale_factor) + + 0.5) if self.do_round else int(h * self.short_size / w) + ow = int(w * float(scale_factor) + + 0.5) if self.do_round else int(w * self.short_size / h) + else: + oh = self.short_size + if self.fixed_ratio: + ow = int(self.short_size * 4.0 / 3.0) + elif not self.keep_ratio: # no + ow = self.short_size + else: + scale_factor = self.short_size / h + oh = int(h * float(scale_factor) + + 0.5) if self.do_round else int(h * self.short_size / w) + ow = int(w * float(scale_factor) + + 0.5) if self.do_round else int(w * self.short_size / h) + + if type(img) == np.ndarray: + img = Image.fromarray(img, mode='RGB') + + if self.backend == 'pillow': + result_img = img.resize((ow, oh), Image.BILINEAR) + elif self.backend == 'cv2' and (self.keep_ratio is not None): + result_img = cv2.resize( + img, (ow, oh), interpolation=cv2.INTER_LINEAR) + else: + result_img = Image.fromarray( + cv2.resize( + np.asarray(img), (ow, oh), interpolation=cv2.INTER_LINEAR)) + + return result_img + + class NormalizeImage(object): """normalize image Args: diff --git a/deploy/python/video_action_infer.py b/deploy/python/video_action_infer.py index 34f63a94cb23c142c89cbb777d57a0db68c024f6..865f47d41ac708f667edfb5a022a0315d69392bd 100644 --- a/deploy/python/video_action_infer.py +++ b/deploy/python/video_action_infer.py @@ -197,7 +197,7 @@ class VideoActionRecognizer(object): img_mean = [0.485, 0.456, 0.406] img_std = [0.229, 0.224, 0.225] ops = [ - Scale(self.short_size), CenterCrop(self.target_size), Image2Array(), + CenterCrop(self.target_size), Image2Array(), Normalization(img_mean, img_std) ] for op in ops: