From 24c088f6a23c443da198228cfcab10dcf2593b62 Mon Sep 17 00:00:00 2001
From: felixhjh <852142024@qq.com>
Date: Sun, 27 Feb 2022 18:01:59 +0000
Subject: [PATCH] add PaddleVideo demo PPTSN

---
 .../PPTSN_K400/Kinetics-400_label_list.txt    | 400 ++++++++++
 .../PaddleVideo/PPTSN_K400/config.yml         |  55 ++
 .../PPTSN_K400/pipeline_http_client.py        |  23 +
 .../PaddleVideo/PPTSN_K400/web_service.py     | 102 +++
 python/paddle_serving_app/reader/__init__.py  |   1 +
 .../paddle_serving_app/reader/pptsn_reader.py | 693 ++++++++++++++++++
 6 files changed, 1274 insertions(+)
 create mode 100644 examples/Pipeline/PaddleVideo/PPTSN_K400/Kinetics-400_label_list.txt
 create mode 100644 examples/Pipeline/PaddleVideo/PPTSN_K400/config.yml
 create mode 100644 examples/Pipeline/PaddleVideo/PPTSN_K400/pipeline_http_client.py
 create mode 100644 examples/Pipeline/PaddleVideo/PPTSN_K400/web_service.py
 create mode 100644 python/paddle_serving_app/reader/pptsn_reader.py

diff --git a/examples/Pipeline/PaddleVideo/PPTSN_K400/Kinetics-400_label_list.txt b/examples/Pipeline/PaddleVideo/PPTSN_K400/Kinetics-400_label_list.txt
new file mode 100644
index 00000000..8488908b
--- /dev/null
+++ b/examples/Pipeline/PaddleVideo/PPTSN_K400/Kinetics-400_label_list.txt
@@ -0,0 +1,400 @@
+0 abseiling
+1 air_drumming
+2 answering_questions
+3 applauding
+4 applying_cream
+5 archery
+6 arm_wrestling
+7 arranging_flowers
+8 assembling_computer
+9 auctioning
+10 baby_waking_up
+11 baking_cookies
+12 balloon_blowing
+13 bandaging
+14 barbequing
+15 bartending
+16 beatboxing
+17 bee_keeping
+18 belly_dancing
+19 bench_pressing
+20 bending_back
+21 bending_metal
+22 biking_through_snow
+23 blasting_sand
+24 blowing_glass
+25 blowing_leaves
+26 blowing_nose
+27 blowing_out_candles
+28 bobsledding
+29 bookbinding
+30 bouncing_on_trampoline
+31 bowling
+32 braiding_hair
+33 breading_or_breadcrumbing
+34 breakdancing
+35 brush_painting
+36 brushing_hair
+37 brushing_teeth
+38 building_cabinet
+39 building_shed
+40 bungee_jumping
+41 busking
+42 canoeing_or_kayaking
+43 capoeira
+44 carrying_baby
+45 cartwheeling
+46 carving_pumpkin
+47 catching_fish
+48 catching_or_throwing_baseball
+49 catching_or_throwing_frisbee
+50 catching_or_throwing_softball
+51 celebrating
+52 changing_oil
+53 changing_wheel
+54 checking_tires
+55 cheerleading
+56 chopping_wood
+57 clapping
+58 clay_pottery_making
+59 clean_and_jerk
+60 cleaning_floor
+61 cleaning_gutters
+62 cleaning_pool
+63 cleaning_shoes
+64 cleaning_toilet
+65 cleaning_windows
+66 climbing_a_rope
+67 climbing_ladder
+68 climbing_tree
+69 contact_juggling
+70 cooking_chicken
+71 cooking_egg
+72 cooking_on_campfire
+73 cooking_sausages
+74 counting_money
+75 country_line_dancing
+76 cracking_neck
+77 crawling_baby
+78 crossing_river
+79 crying
+80 curling_hair
+81 cutting_nails
+82 cutting_pineapple
+83 cutting_watermelon
+84 dancing_ballet
+85 dancing_charleston
+86 dancing_gangnam_style
+87 dancing_macarena
+88 deadlifting
+89 decorating_the_christmas_tree
+90 digging
+91 dining
+92 disc_golfing
+93 diving_cliff
+94 dodgeball
+95 doing_aerobics
+96 doing_laundry
+97 doing_nails
+98 drawing
+99 dribbling_basketball
+100 drinking
+101 drinking_beer
+102 drinking_shots
+103 driving_car
+104 driving_tractor
+105 drop_kicking
+106 drumming_fingers
+107 dunking_basketball
+108 dying_hair
+109 eating_burger
+110 eating_cake
+111 eating_carrots
+112 eating_chips
+113 eating_doughnuts
+114 eating_hotdog
+115 eating_ice_cream
+116 eating_spaghetti
+117 eating_watermelon
+118 egg_hunting
+119 exercising_arm
+120 exercising_with_an_exercise_ball
+121 extinguishing_fire
+122 faceplanting
+123 feeding_birds
+124 feeding_fish
+125 feeding_goats
+126 filling_eyebrows
+127 finger_snapping
+128 fixing_hair
+129 flipping_pancake
+130 flying_kite
+131 folding_clothes
+132 folding_napkins
+133 folding_paper
+134 front_raises
+135 frying_vegetables
+136 garbage_collecting
+137 gargling
+138 getting_a_haircut
+139 getting_a_tattoo
+140 giving_or_receiving_award
+141 golf_chipping
+142 golf_driving
+143 golf_putting
+144 grinding_meat
+145 grooming_dog
+146 grooming_horse
+147 gymnastics_tumbling
+148 hammer_throw
+149 headbanging
+150 headbutting
+151 high_jump
+152 high_kick
+153 hitting_baseball
+154 hockey_stop
+155 holding_snake
+156 hopscotch
+157 hoverboarding
+158 hugging
+159 hula_hooping
+160 hurdling
+161 hurling_(sport)
+162 ice_climbing
+163 ice_fishing
+164 ice_skating
+165 ironing
+166 javelin_throw
+167 jetskiing
+168 jogging
+169 juggling_balls
+170 juggling_fire
+171 juggling_soccer_ball
+172 jumping_into_pool
+173 jumpstyle_dancing
+174 kicking_field_goal
+175 kicking_soccer_ball
+176 kissing
+177 kitesurfing
+178 knitting
+179 krumping
+180 laughing
+181 laying_bricks
+182 long_jump
+183 lunge
+184 making_a_cake
+185 making_a_sandwich
+186 making_bed
+187 making_jewelry
+188 making_pizza
+189 making_snowman
+190 making_sushi
+191 making_tea
+192 marching
+193 massaging_back
+194 massaging_feet
+195 massaging_legs
+196 massaging_person's_head
+197 milking_cow
+198 mopping_floor
+199 motorcycling
+200 moving_furniture
+201 mowing_lawn
+202 news_anchoring
+203 opening_bottle
+204 opening_present
+205 paragliding
+206 parasailing
+207 parkour
+208 passing_American_football_(in_game)
+209 passing_American_football_(not_in_game)
+210 peeling_apples
+211 peeling_potatoes
+212 petting_animal_(not_cat)
+213 petting_cat
+214 picking_fruit
+215 planting_trees
+216 plastering
+217 playing_accordion
+218 playing_badminton
+219 playing_bagpipes
+220 playing_basketball
+221 playing_bass_guitar
+222 playing_cards
+223 playing_cello
+224 playing_chess
+225 playing_clarinet
+226 playing_controller
+227 playing_cricket
+228 playing_cymbals
+229 playing_didgeridoo
+230 playing_drums
+231 playing_flute
+232 playing_guitar
+233 playing_harmonica
+234 playing_harp
+235 playing_ice_hockey
+236 playing_keyboard
+237 playing_kickball
+238 playing_monopoly
+239 playing_organ
+240 playing_paintball
+241 playing_piano
+242 playing_poker
+243 playing_recorder
+244 playing_saxophone
+245 playing_squash_or_racquetball
+246 playing_tennis
+247 playing_trombone
+248 playing_trumpet
+249 playing_ukulele
+250 playing_violin
+251 playing_volleyball
+252 playing_xylophone
+253 pole_vault
+254 presenting_weather_forecast
+255 pull_ups
+256 pumping_fist
+257 pumping_gas
+258 punching_bag
+259 punching_person_(boxing)
+260 push_up
+261 pushing_car
+262 pushing_cart
+263 pushing_wheelchair
+264 reading_book
+265 reading_newspaper
+266 recording_music
+267 riding_a_bike
+268 riding_camel
+269 riding_elephant
+270 riding_mechanical_bull
+271 riding_mountain_bike
+272 riding_mule
+273 riding_or_walking_with_horse
+274 riding_scooter
+275 riding_unicycle
+276 ripping_paper
+277 robot_dancing
+278 rock_climbing
+279 rock_scissors_paper
+280 roller_skating
+281 running_on_treadmill
+282 sailing
+283 salsa_dancing
+284 sanding_floor
+285 scrambling_eggs
+286 scuba_diving
+287 setting_table
+288 shaking_hands
+289 shaking_head
+290 sharpening_knives
+291 sharpening_pencil
+292 shaving_head
+293 shaving_legs
+294 shearing_sheep
+295 shining_shoes
+296 shooting_basketball
+297 shooting_goal_(soccer)
+298 shot_put
+299 shoveling_snow
+300 shredding_paper
+301 shuffling_cards
+302 side_kick
+303 sign_language_interpreting
+304 singing
+305 situp
+306 skateboarding
+307 ski_jumping
+308 skiing_(not_slalom_or_crosscountry)
+309 skiing_crosscountry
+310 skiing_slalom
+311 skipping_rope
+312 skydiving
+313 slacklining
+314 slapping
+315 sled_dog_racing
+316 smoking
+317 smoking_hookah
+318 snatch_weight_lifting
+319 sneezing
+320 sniffing
+321 snorkeling
+322 snowboarding
+323 snowkiting
+324 snowmobiling
+325 somersaulting
+326 spinning_poi
+327 spray_painting
+328 spraying
+329 springboard_diving
+330 squat
+331 sticking_tongue_out
+332 stomping_grapes
+333 stretching_arm
+334 stretching_leg
+335 strumming_guitar
+336 surfing_crowd
+337 surfing_water
+338 sweeping_floor
+339 swimming_backstroke
+340 swimming_breast_stroke
+341 swimming_butterfly_stroke
+342 swing_dancing
+343 swinging_legs
+344 swinging_on_something
+345 sword_fighting
+346 tai_chi
+347 taking_a_shower
+348 tango_dancing
+349 tap_dancing
+350 tapping_guitar
+351 tapping_pen
+352 tasting_beer
+353 tasting_food
+354 testifying
+355 texting
+356 throwing_axe
+357 throwing_ball
+358 throwing_discus
+359 tickling
+360 tobogganing
+361 tossing_coin
+362 tossing_salad
+363 training_dog
+364 trapezing
+365 trimming_or_shaving_beard
+366 trimming_trees
+367 triple_jump
+368 tying_bow_tie
+369 tying_knot_(not_on_a_tie)
+370 tying_tie
+371 unboxing
+372 unloading_truck
+373 using_computer
+374 using_remote_controller_(not_gaming)
+375 using_segway
+376 vault
+377 waiting_in_line
+378 walking_the_dog
+379 washing_dishes
+380 washing_feet
+381 washing_hair
+382 washing_hands
+383 water_skiing
+384 water_sliding
+385 watering_plants
+386 waxing_back
+387 waxing_chest
+388 waxing_eyebrows
+389 waxing_legs
+390 weaving_basket
+391 welding
+392 whistling
+393 windsurfing
+394 wrapping_present
+395 wrestling
+396 writing
+397 yawning
+398 yoga
+399 zumba
diff --git a/examples/Pipeline/PaddleVideo/PPTSN_K400/config.yml b/examples/Pipeline/PaddleVideo/PPTSN_K400/config.yml
new file mode 100644
index 00000000..44014150
--- /dev/null
+++ b/examples/Pipeline/PaddleVideo/PPTSN_K400/config.yml
@@ -0,0 +1,55 @@
+#rpc port; rpc_port and http_port must not both be empty. When rpc_port is empty and http_port is not, rpc_port is automatically set to http_port + 1
+rpc_port: 18090
+
+#http port; rpc_port and http_port must not both be empty. When rpc_port is set and http_port is empty, no http_port is generated automatically
+http_port: 9999
+
+#worker_num, the maximum concurrency. When build_dag_each_worker=True, the framework creates worker_num processes, each building its own gRPC server and DAG
+##When build_dag_each_worker=False, the framework sets max_workers of the main thread's gRPC thread pool to worker_num
+worker_num: 20
+
+#build_dag_each_worker: False, the framework builds one DAG inside the process; True, the framework builds an independent DAG in each worker process
+build_dag_each_worker: false
+
+dag:
+    #op resource type: True for the thread model, False for the process model
+    is_thread_op: False
+
+    #number of retries
+    retry: 1
+
+    #profiling: True generates Timeline performance data, at some cost to performance; False disables profiling
+    use_profile: false
+    tracer:
+        interval_s: 10
+
+op:
+    ppTSN:
+        #concurrency; thread-level when is_thread_op=True, process-level otherwise
+        concurrency: 1
+
+        #when the op config has no server_endpoints, the local service config is read from local_service_conf
+        local_service_conf:
+            #client type: brpc, grpc, or local_predictor. local_predictor runs inference in-process without starting a Serving service
+            client_type: local_predictor
+
+            #model directory
+            model_config: serving_server
+
+            #fetch list; use the alias_name of fetch_var in client_config
+            fetch_list: ["linear_1.tmp_1"]
+
+            # device_type, 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
+            device_type: 1
+
+            #compute device IDs: "" or unset runs inference on CPU; "0" or "0,1,2" selects the GPU card(s) to use
+            devices: "0"
+
+            #use_mkldnn
+            #use_mkldnn: True
+
+            #thread_num
+            thread_num: 2
+
+            #ir_optim
+            ir_optim: True
diff --git a/examples/Pipeline/PaddleVideo/PPTSN_K400/pipeline_http_client.py b/examples/Pipeline/PaddleVideo/PPTSN_K400/pipeline_http_client.py
new file mode 100644
index 00000000..e1025598
--- /dev/null
+++ b/examples/Pipeline/PaddleVideo/PPTSN_K400/pipeline_http_client.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import requests
+import json
+
+url = "http://127.0.0.1:9999/ppTSN/prediction"
+video_url = "https://paddle-serving.bj.bcebos.com/huangjianhui04/example.avi"
+for i in range(4):
+    data = {"key": ["filename"], "value": [video_url]}
+    r = requests.post(url=url, data=json.dumps(data))
+    print(r.json())
diff --git a/examples/Pipeline/PaddleVideo/PPTSN_K400/web_service.py b/examples/Pipeline/PaddleVideo/PPTSN_K400/web_service.py
new file mode 100644
index 00000000..1c670926
--- /dev/null
+++ b/examples/Pipeline/PaddleVideo/PPTSN_K400/web_service.py
@@ -0,0 +1,102 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle_serving_server.web_service import WebService, Op
+import logging
+import urllib.request
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from paddle_serving_app.reader import VideoDecoder, Sampler, Scale, TenCrop, Image2Array, Normalization
+
+_LOGGER = logging.getLogger()
+
+
+class ppTSNOp(Op):
+    def init_op(self,
+                num_seg=25,
+                seg_len=1,
+                short_size=256,
+                target_size=224,
+                top_k=1):
+        self.top_k = top_k
+        img_mean = [0.485, 0.456, 0.406]
+        img_std = [0.229, 0.224, 0.225]
+        self.ops = [
+            VideoDecoder(),
+            Sampler(num_seg, seg_len, valid_mode=True, select_left=True),
+            Scale(short_size, fixed_ratio=True, do_round=True, backend='cv2'),
+            TenCrop(target_size),
+            Image2Array(),
+            Normalization(img_mean, img_std)
+        ]
+        self.label_dict = {}
+        with open("Kinetics-400_label_list.txt") as fin:
+            for line in fin:
+                index, label = line.strip().split(" ")
+                self.label_dict[int(index)] = label
+
+    def preprocess(self, input_dicts, data_id, log_id):
+        (_, input_dict), = input_dicts.items()
+        self.input_file = []
+        for key in input_dict.keys():
+            try:
+                filename = urllib.request.urlretrieve(input_dict[key], key)
+                self.input_file.append(filename[0])
+                _LOGGER.info("video downloaded successfully")
+            except Exception as e:
+                _LOGGER.error("failed to download video: {}".format(e))
+        batched_inputs = []
+        for filename in self.input_file:
+            results = {'filename': filename}
+            for op in self.ops:
+                results = op(results)
+            res = np.expand_dims(results['imgs'], axis=0).copy()
+            batched_inputs.append(res)
+        batched_inputs = [
+            np.concatenate([item[i] for item in batched_inputs])
+            for i in range(len(batched_inputs[0]))
+        ]
+        return {"data_batch_0": batched_inputs[0]}, False, None, ""
+
+    def postprocess(self, input_dicts, fetch_dict, data_id, log_id):
+        output = fetch_dict["linear_1.tmp_1"]
+        if not isinstance(self.input_file, list):
+            self.input_file = [self.input_file]
+        N = len(self.input_file)
+        if output.shape[0] != N:
+            output = output.reshape([N] + [output.shape[0] // N] +
+                                    list(output.shape[1:]))  # [N, T, C]
+            output = output.mean(axis=1)  # [N, C]
+        output = F.softmax(paddle.to_tensor(output), axis=-1).numpy()
+        results = []
+        for i in range(N):
+            classes = np.argpartition(output[i], -self.top_k)[-self.top_k:]
+            classes = classes[np.argsort(-output[i, classes])]
+            labels = [self.label_dict[label] for label in classes.tolist()]
+            scores = output[i, classes]
+            results.append("class: {} score: {}".format(labels, scores))
+        return {"res": results}, None, ""
+
+
+class ppTSNService(WebService):
+    def get_pipeline_response(self, read_op):
+        ppTSN_op = ppTSNOp(name="ppTSN", input_ops=[read_op])
+        return ppTSN_op
+
+
+pptsn_service = ppTSNService(name="ppTSN")
+pptsn_service.prepare_pipeline_config("config.yml")
+pptsn_service.run_service()
diff --git a/python/paddle_serving_app/reader/__init__.py b/python/paddle_serving_app/reader/__init__.py
index 185b2099..8d782595 100644
--- a/python/paddle_serving_app/reader/__init__.py
+++ b/python/paddle_serving_app/reader/__init__.py
@@ -21,3 +21,4 @@ from .lac_reader import LACReader
 from .senta_reader import SentaReader
 #from .imdb_reader import IMDBDataset
 from .ocr_reader import OCRReader
+from .pptsn_reader import VideoDecoder, Sampler, Scale, TenCrop, Image2Array, Normalization
diff --git a/python/paddle_serving_app/reader/pptsn_reader.py b/python/paddle_serving_app/reader/pptsn_reader.py
new file mode 100644
index 00000000..c74c9db6
--- /dev/null
+++ b/python/paddle_serving_app/reader/pptsn_reader.py
@@ -0,0 +1,693 @@
+import math
+import os
+import random
+from collections.abc import Sequence
+
+import av
+import cv2
+import decord as de
+import numpy as np
+import SimpleITK as sitk
+from PIL import Image
+
+
+def get_start_end_idx(video_size, clip_size, clip_idx, num_clips):
+    """
+    Compute the start and end frame index of a clip. Used by the pyav
+    backend below; follows the clip-sampling helper in PaddleVideo/SlowFast.
+    """
+    delta = max(video_size - clip_size, 0)
+    if clip_idx == -1:
+        # Random temporal sampling for train/valid mode.
+        start_idx = random.uniform(0, delta)
+    else:
+        # Uniformly spaced clip for test mode.
+        start_idx = delta * clip_idx / num_clips
+    end_idx = start_idx + clip_size - 1
+    return start_idx, end_idx
+
+
+class VideoDecoder(object):
+    """
+    Decode an mp4 file into frames.
+    Args:
+        filepath: the file path of the mp4 file
+    """
+    def __init__(self,
+                 backend='cv2',
+                 mode='train',
+                 sampling_rate=32,
+                 num_seg=8,
+                 num_clips=1,
+                 target_fps=30):
+
+        self.backend = backend
+        # params below are only used by TimeSformer
+        self.mode = mode
+        self.sampling_rate = sampling_rate
+        self.num_seg = num_seg
+        self.num_clips = num_clips
+        self.target_fps = target_fps
+
+    def __call__(self, results):
+        """
+        Perform mp4 decode operations.
+        return:
+            List where each item is a numpy array after decoding.
+        """
+        file_path = results['filename']
+        results['format'] = 'video'
+        results['backend'] = self.backend
+
+        if self.backend == 'cv2':
+            cap = cv2.VideoCapture(file_path)
+            videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+            sampledFrames = []
+            for i in range(videolen):
+                ret, frame = cap.read()
+                # the first frame may be empty
+                if not ret:
+                    continue
+                img = frame[:, :, ::-1]  # BGR -> RGB
+                sampledFrames.append(img)
+            results['frames'] = sampledFrames
+            results['frames_len'] = len(sampledFrames)
+
+        elif self.backend == 'decord':
+            container = de.VideoReader(file_path)
+            frames_len = len(container)
+            results['frames'] = container
+            results['frames_len'] = frames_len
+
+        elif self.backend == 'pyav':  # for TimeSformer
+            if self.mode in ["train", "valid"]:
+                clip_idx = -1
+            elif self.mode in ["test"]:
+                clip_idx = 0
+            else:
+                raise NotImplementedError
+
+            container = av.open(file_path)
+
+            num_clips = 1  # always 1
+
+            # decode process
+            fps = float(container.streams.video[0].average_rate)
+            frames_length = container.streams.video[0].frames
+            duration = container.streams.video[0].duration
+
+            if duration is None:
+                # If the decoding information could not be fetched, decode the entire video.
+                decode_all_video = True
+                video_start_pts, video_end_pts = 0, math.inf
+            else:
+                decode_all_video = False
+                start_idx, end_idx = get_start_end_idx(
+                    frames_length,
+                    self.sampling_rate * self.num_seg / self.target_fps * fps,
+                    clip_idx, num_clips)
+                timebase = duration / frames_length
+                video_start_pts = int(start_idx * timebase)
+                video_end_pts = int(end_idx * timebase)
+
+            frames = None
+            # If a video stream was found, fetch the video frames from it.
+            if container.streams.video:
+                margin = 1024
+                seek_offset = max(video_start_pts - margin, 0)
+
+                container.seek(seek_offset,
+                               any_frame=False,
+                               backward=True,
+                               stream=container.streams.video[0])
+                tmp_frames = {}
+                buffer_count = 0
+                max_pts = 0
+                for frame in container.decode(**{"video": 0}):
+                    max_pts = max(max_pts, frame.pts)
+                    if frame.pts < video_start_pts:
+                        continue
+                    if frame.pts <= video_end_pts:
+                        tmp_frames[frame.pts] = frame
+                    else:
+                        # no extra buffering is needed here, so stop after the
+                        # first frame past video_end_pts
+                        buffer_count += 1
+                        tmp_frames[frame.pts] = frame
+                        if buffer_count >= 0:
+                            break
+                video_frames = [tmp_frames[pts] for pts in sorted(tmp_frames)]
+
+                container.close()
+
+                frames = [frame.to_rgb().to_ndarray() for frame in video_frames]
+                clip_sz = self.sampling_rate * self.num_seg / self.target_fps * fps
+
+                start_idx, end_idx = get_start_end_idx(
+                    len(frames),  # frame_len
+                    clip_sz,
+                    clip_idx if decode_all_video else 0,
+                    # If the entire video was decoded: -1 in train/valid, 0 in test;
+                    # otherwise always 0, since clip_size frames were already
+                    # selected during decoding.
+                    1)
+                results['frames'] = frames
+                results['frames_len'] = len(frames)
+                results['start_idx'] = start_idx
+                results['end_idx'] = end_idx
+        else:
+            raise NotImplementedError
+        return results
+
+
+class Sampler(object):
+    """
+    Sample frame indices.
+    NOTE: PIL is used to read images here, which differs slightly from cv2.
+    Args:
+        num_seg(int): number of segments.
+        seg_len(int): number of sampled frames in each segment.
+        valid_mode(bool): True or False.
+        select_left(bool): whether to select the left of the two middle frames when the sampling interval is even in test mode.
+    Returns:
+        frames_idx: the indices of the sampled frames.
+    """
+    def __init__(self,
+                 num_seg,
+                 seg_len,
+                 frame_interval=None,
+                 valid_mode=False,
+                 select_left=False,
+                 dense_sample=False,
+                 linspace_sample=False,
+                 use_pil=True):
+        self.num_seg = num_seg
+        self.seg_len = seg_len
+        self.frame_interval = frame_interval
+        self.valid_mode = valid_mode
+        self.select_left = select_left
+        self.dense_sample = dense_sample
+        self.linspace_sample = linspace_sample
+        self.use_pil = use_pil
+
+    def _get(self, frames_idx, results):
+        data_format = results['format']
+
+        if data_format == "frame":
+            frame_dir = results['frame_dir']
+            imgs = []
+            for idx in frames_idx:
+                img = Image.open(
+                    os.path.join(frame_dir,
+                                 results['suffix'].format(idx))).convert('RGB')
+                imgs.append(img)
+
+        elif data_format == "MRI":
+            frame_dir = results['frame_dir']
+            imgs = []
+            MRI = sitk.GetArrayFromImage(sitk.ReadImage(frame_dir))
+            for idx in frames_idx:
+                item = MRI[idx]
+                item = cv2.resize(item, (224, 224))
+                imgs.append(item)
+
+        elif data_format == "video":
+            if results['backend'] == 'cv2':
+                frames = np.array(results['frames'])
+                imgs = []
+                for idx in frames_idx:
+                    imgbuf = frames[idx]
+                    img = Image.fromarray(imgbuf, mode='RGB')
+                    imgs.append(img)
+            elif results['backend'] == 'decord':
+                container = results['frames']
+                if self.use_pil:
+                    frames_select = container.get_batch(frames_idx)
+                    # convert the decord NDArray batch to PIL images
+                    np_frames = frames_select.asnumpy()
+                    imgs = []
+                    for i in range(np_frames.shape[0]):
+                        imgbuf = np_frames[i]
+                        imgs.append(Image.fromarray(imgbuf, mode='RGB'))
+                else:
+                    if frames_idx.ndim != 1:
+                        frames_idx = np.squeeze(frames_idx)
+                    frame_dict = {
+                        idx: container[idx].asnumpy()
+                        for idx in np.unique(frames_idx)
+                    }
+                    imgs = [frame_dict[idx] for idx in frames_idx]
+            elif results['backend'] == 'pyav':
+                imgs = []
+                frames = np.array(results['frames'])
+                for idx in frames_idx:
+                    imgbuf = frames[idx]
+                    imgs.append(imgbuf)
+                imgs = np.stack(imgs)  # thwc
+            else:
+                raise NotImplementedError
+        else:
+            raise NotImplementedError
+        results['imgs'] = imgs
+        return results
+
+    def _get_train_clips(self, num_frames):
+        ori_seg_len = self.seg_len * self.frame_interval
+        avg_interval = (num_frames - ori_seg_len + 1) // self.num_seg
+
+        if avg_interval > 0:
+            base_offsets = np.arange(self.num_seg) * avg_interval
+            clip_offsets = base_offsets + np.random.randint(avg_interval,
+                                                            size=self.num_seg)
+        elif num_frames > max(self.num_seg, ori_seg_len):
+            clip_offsets = np.sort(
+                np.random.randint(num_frames - ori_seg_len + 1,
+                                  size=self.num_seg))
+        elif avg_interval == 0:
+            ratio = (num_frames - ori_seg_len + 1.0) / self.num_seg
+            clip_offsets = np.around(np.arange(self.num_seg) * ratio)
+        else:
+            clip_offsets = np.zeros((self.num_seg, ), dtype=np.int64)
+        return clip_offsets
+
+    def _get_test_clips(self, num_frames):
+        ori_seg_len = self.seg_len * self.frame_interval
+        avg_interval = (num_frames - ori_seg_len + 1) / float(self.num_seg)
+        if num_frames > ori_seg_len - 1:
+            base_offsets = np.arange(self.num_seg) * avg_interval
+            clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int64)
+        else:
+            clip_offsets = np.zeros((self.num_seg, ), dtype=np.int64)
+        return clip_offsets
+
+    def __call__(self, results):
+        """
+        Args:
+            frames_len: length of frames.
+        return:
+            the sampled frame indices.
+        """
+        frames_len = int(results['frames_len'])
+        frames_idx = []
+        if self.frame_interval is not None:
+            assert isinstance(self.frame_interval, int)
+            if not self.valid_mode:
+                offsets = self._get_train_clips(frames_len)
+            else:
+                offsets = self._get_test_clips(frames_len)
+
+            offsets = offsets[:, None] + np.arange(
+                self.seg_len)[None, :] * self.frame_interval
+            offsets = np.concatenate(offsets)
+
+            offsets = offsets.reshape((-1, self.seg_len))
+            offsets = np.mod(offsets, frames_len)
+            offsets = np.concatenate(offsets)
+
+            if results['format'] == 'video':
+                frames_idx = offsets
+            elif results['format'] == 'frame':
+                frames_idx = list(offsets + 1)
+            else:
+                raise NotImplementedError
+
+            return self._get(frames_idx, results)
+
+        if self.linspace_sample:
+            if 'start_idx' in results and 'end_idx' in results:
+                offsets = np.linspace(results['start_idx'], results['end_idx'],
+                                      self.num_seg)
+            else:
+                offsets = np.linspace(0, frames_len - 1, self.num_seg)
+            offsets = np.clip(offsets, 0, frames_len - 1).astype(np.int64)
+            if results['format'] == 'video':
+                frames_idx = list(offsets)
+                frames_idx = [x % frames_len for x in frames_idx]
+            elif results['format'] == 'frame':
+                frames_idx = list(offsets + 1)
+            elif results['format'] == 'MRI':
+                frames_idx = list(offsets)
+            else:
+                raise NotImplementedError
+            return self._get(frames_idx, results)
+
+        average_dur = int(frames_len / self.num_seg)
+        if not self.select_left:
+            if self.dense_sample:  # For ppTSM
+                if not self.valid_mode:  # train
+                    sample_pos = max(1, 1 + frames_len - 64)
+                    t_stride = 64 // self.num_seg
+                    start_idx = 0 if sample_pos == 1 else np.random.randint(
+                        0, sample_pos - 1)
+                    offsets = [(idx * t_stride + start_idx) % frames_len + 1
+                               for idx in range(self.num_seg)]
+                    frames_idx = offsets
+                else:
+                    sample_pos = max(1, 1 + frames_len - 64)
+                    t_stride = 64 // self.num_seg
+                    start_list = np.linspace(0,
+                                             sample_pos - 1,
+                                             num=10,
+                                             dtype=int)
+                    offsets = []
+                    for start_idx in start_list.tolist():
+                        offsets += [
+                            (idx * t_stride + start_idx) % frames_len + 1
+                            for idx in range(self.num_seg)
+                        ]
+                    frames_idx = offsets
+            else:
+                for i in range(self.num_seg):
+                    idx = 0
+                    if not self.valid_mode:
+                        if average_dur >= self.seg_len:
+                            idx = random.randint(0, average_dur - self.seg_len)
+                            idx += i * average_dur
+                        elif average_dur >= 1:
+                            idx += i * average_dur
+                        else:
+                            idx = i
+                    else:
+                        if average_dur >= self.seg_len:
+                            idx = (average_dur - 1) // 2
+                            idx += i * average_dur
+                        elif average_dur >= 1:
+                            idx += i * average_dur
+                        else:
+                            idx = i
+                    for jj in range(idx, idx + self.seg_len):
+                        if results['format'] == 'video':
+                            frames_idx.append(int(jj % frames_len))
+                        elif results['format'] == 'frame':
+                            frames_idx.append(jj + 1)
+                        elif results['format'] == 'MRI':
+                            frames_idx.append(jj)
+                        else:
+                            raise NotImplementedError
+            return self._get(frames_idx, results)
+
+        else:  # for TSM
+            if not self.valid_mode:
+                if average_dur > 0:
+                    offsets = np.multiply(list(range(self.num_seg)),
+                                          average_dur) + np.random.randint(
+                                              average_dur, size=self.num_seg)
+                elif frames_len > self.num_seg:
+                    offsets = np.sort(
+                        np.random.randint(frames_len, size=self.num_seg))
+                else:
+                    offsets = np.zeros(shape=(self.num_seg, ), dtype=np.int64)
+            else:
+                if frames_len > self.num_seg:
+                    average_dur_float = frames_len / self.num_seg
+                    offsets = np.array([
+                        int(average_dur_float / 2.0 + average_dur_float * x)
+                        for x in range(self.num_seg)
+                    ])
+                else:
+                    offsets = np.zeros(shape=(self.num_seg, ), dtype=np.int64)
+
+            if results['format'] == 'video':
+                frames_idx = list(offsets)
+                frames_idx = [x % frames_len for x in frames_idx]
+            elif results['format'] == 'frame':
+                frames_idx = list(offsets + 1)
+            elif results['format'] == 'MRI':
+                frames_idx = list(offsets)
+            else:
+                raise NotImplementedError
+
+            return self._get(frames_idx, results)
+
+
+class CenterCrop(object):
+    """
+    Center-crop images.
+    Args:
+        target_size(int): center-crop a square of size target_size from the image.
+        do_round(bool): whether to round the coordinates of the top-left corner of the crop area. default: True
+    """
+    def __init__(self, target_size, do_round=True, backend='pillow'):
+        self.target_size = target_size
+        self.do_round = do_round
+        self.backend = backend
+
+    def __call__(self, results):
+        """
+        Perform center-crop operations.
+        Args:
+            imgs: List where each item is a PIL.Image.
+                  For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]
+        return:
+            ccrop_imgs: List where each item is a center-cropped PIL.Image.
+        """
+        import paddle  # imported lazily; only needed for tensor inputs
+        imgs = results['imgs']
+        ccrop_imgs = []
+        th, tw = self.target_size, self.target_size
+        if isinstance(imgs, paddle.Tensor):
+            h, w = imgs.shape[-2:]
+            x1 = int(round((w - tw) / 2.0)) if self.do_round else (w - tw) // 2
+            y1 = int(round((h - th) / 2.0)) if self.do_round else (h - th) // 2
+            ccrop_imgs = imgs[:, :, y1:y1 + th, x1:x1 + tw]
+        else:
+            for img in imgs:
+                if self.backend == 'pillow':
+                    w, h = img.size
+                elif self.backend == 'cv2':
+                    h, w, _ = img.shape
+                else:
+                    raise NotImplementedError
+                assert (w >= self.target_size) and (h >= self.target_size), \
+                    "image width({}) and height({}) should be larger than crop size({})".format(
+                        w, h, self.target_size)
+                x1 = int(round(
+                    (w - tw) / 2.0)) if self.do_round else (w - tw) // 2
+                y1 = int(round(
+                    (h - th) / 2.0)) if self.do_round else (h - th) // 2
+                if self.backend == 'cv2':
+                    ccrop_imgs.append(img[y1:y1 + th, x1:x1 + tw])
+                elif self.backend == 'pillow':
+                    ccrop_imgs.append(img.crop((x1, y1, x1 + tw, y1 + th)))
+        results['imgs'] = ccrop_imgs
+        return results
+
+
+class Scale(object):
+    """
+    Scale images.
+    Args:
+        short_size(float | int): the short side of an image is scaled to short_size.
+        fixed_ratio(bool): whether to zoom according to a fixed ratio. default: True
+        do_round(bool): whether to round when computing the zoom ratio. default: False
+        backend(str): graphics backend, pillow or cv2. default: 'pillow'
+    """
+    def __init__(self,
+                 short_size,
+                 fixed_ratio=True,
+                 keep_ratio=None,
+                 do_round=False,
+                 backend='pillow'):
+        self.short_size = short_size
+        assert (fixed_ratio and not keep_ratio) or (not fixed_ratio), \
+            "fixed_ratio and keep_ratio cannot both be true"
+        self.fixed_ratio = fixed_ratio
+        self.keep_ratio = keep_ratio
+        self.do_round = do_round
+
+        assert backend in [
+            'pillow', 'cv2'
+        ], f"Scale's backend must be pillow or cv2, but got {backend}"
+        self.backend = backend
+
+    def __call__(self, results):
+        """
+        Perform resize operations.
+        Args:
+            imgs (Sequence[PIL.Image]): List where each item is a PIL.Image.
+                For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]
+        return:
+            resized_imgs: List where each item is a PIL.Image after scaling.
+        """
+        imgs = results['imgs']
+        resized_imgs = []
+        for i in range(len(imgs)):
+            img = imgs[i]
+            if isinstance(img, np.ndarray):
+                h, w, _ = img.shape
+            elif isinstance(img, Image.Image):
+                w, h = img.size
+            else:
+                raise NotImplementedError
+
+            if w <= h:
+                ow = self.short_size
+                if self.fixed_ratio:
+                    oh = int(self.short_size * 4.0 / 3.0)
+                elif not self.keep_ratio:  # no
+                    oh = self.short_size
+                else:
+                    # keep the aspect ratio: the short side (w) maps exactly
+                    # to short_size and the long side scales proportionally
+                    scale_factor = self.short_size / w
+                    oh = int(h * float(scale_factor) +
+                             0.5) if self.do_round else int(
+                                 h * self.short_size / w)
+                    ow = int(w * float(scale_factor) +
+                             0.5) if self.do_round else self.short_size
+            else:
+                oh = self.short_size
+                if self.fixed_ratio:
+                    ow = int(self.short_size * 4.0 / 3.0)
+                elif not self.keep_ratio:  # no
+                    ow = self.short_size
+                else:
+                    # keep the aspect ratio: the short side (h) maps exactly
+                    # to short_size and the long side scales proportionally
+                    scale_factor = self.short_size / h
+                    oh = int(h * float(scale_factor) +
+                             0.5) if self.do_round else self.short_size
+                    ow = int(w * float(scale_factor) +
+                             0.5) if self.do_round else int(
+                                 w * self.short_size / h)
+            if self.backend == 'pillow':
+                resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))
+            elif self.backend == 'cv2' and (self.keep_ratio is not None):
+                resized_imgs.append(
+                    cv2.resize(img, (ow, oh), interpolation=cv2.INTER_LINEAR))
+            else:
+                resized_imgs.append(
+                    Image.fromarray(
+                        cv2.resize(np.asarray(img), (ow, oh),
+                                   interpolation=cv2.INTER_LINEAR)))
+        results['imgs'] = resized_imgs
+        return results
+
+
+class Image2Array(object):
+    """
+    Convert a list of PIL.Images to a numpy array and transpose the
+    dimensions from 'thwc' to 'tchw'.
+    Args:
+        transpose: whether to transpose or not; default True, False for SlowFast.
+    """
+    def __init__(self, transpose=True, data_format='tchw'):
+        assert data_format in [
+            'tchw', 'cthw'
+        ], f"Target format must be in ['tchw', 'cthw'], but got {data_format}"
+        self.transpose = transpose
+        self.data_format = data_format
+
+    def __call__(self, results):
+        """
+        Perform Image-to-NumpyArray operations.
+        Args:
+            imgs: List where each item is a PIL.Image.
+                  For example, [PIL.Image0, PIL.Image1, PIL.Image2, ...]
+        return:
+            np_imgs: Numpy array.
+        """
+        imgs = results['imgs']
+        if 'backend' in results and results[
+                'backend'] == 'pyav':  # [T,H,W,C] in [0, 1]
+            if self.transpose:
+                if self.data_format == 'tchw':
+                    t_imgs = imgs.transpose((0, 3, 1, 2))  # tchw
+                else:
+                    t_imgs = imgs.transpose((3, 0, 1, 2))  # cthw
+            results['imgs'] = t_imgs
+        else:
+            t_imgs = np.stack(imgs).astype('float32')
+            if self.transpose:
+                if self.data_format == 'tchw':
+                    t_imgs = t_imgs.transpose(0, 3, 1, 2)  # tchw
+                else:
+                    t_imgs = t_imgs.transpose(3, 0, 1, 2)  # cthw
+            results['imgs'] = t_imgs
+        return results
+
+
+class Normalization(object):
+    """
+    Normalization.
+    Args:
+        mean(Sequence[float]): mean values of the different channels.
+        std(Sequence[float]): std values of the different channels.
+        tensor_shape(list): shape of the mean, default [3, 1, 1]; for SlowFast, [1, 1, 1, 3]
+    """
+    def __init__(self, mean, std, tensor_shape=[3, 1, 1], inplace=False):
+        if not isinstance(mean, Sequence):
+            raise TypeError(
+                f'Mean must be list or tuple, but got {type(mean)}')
+        if not isinstance(std, Sequence):
+            raise TypeError(
+                f'Std must be list or tuple, but got {type(std)}')
+
+        self.inplace = inplace
+        if not inplace:
+            self.mean = np.array(mean).reshape(tensor_shape).astype(np.float32)
+            self.std = np.array(std).reshape(tensor_shape).astype(np.float32)
+        else:
+            self.mean = np.array(mean, dtype=np.float32)
+            self.std = np.array(std, dtype=np.float32)
+
+    def __call__(self, results):
+        """
+        Perform normalization operations.
+        Args:
+            imgs: Numpy array.
+        return:
+            np_imgs: Numpy array after normalization.
+        """
+        import paddle  # imported lazily; only needed for the pyav backend
+        if self.inplace:
+            n = len(results['imgs'])
+            h, w, c = results['imgs'][0].shape
+            norm_imgs = np.empty((n, h, w, c), dtype=np.float32)
+            for i, img in enumerate(results['imgs']):
+                norm_imgs[i] = img
+
+            for img in norm_imgs:  # [n,h,w,c]
+                mean = np.float64(self.mean.reshape(1, -1))  # [1, 3]
+                stdinv = 1 / np.float64(self.std.reshape(1, -1))  # [1, 3]
+                cv2.subtract(img, mean, img)
+                cv2.multiply(img, stdinv, img)
+        else:
+            imgs = results['imgs']
+            norm_imgs = imgs / 255.0
+            norm_imgs -= self.mean
+            norm_imgs /= self.std
+            if 'backend' in results and results['backend'] == 'pyav':
+                norm_imgs = paddle.to_tensor(norm_imgs, dtype=paddle.float32)
+        results['imgs'] = norm_imgs
+        return results
+
+
+class TenCrop:
+    """
+    Crop out 5 regions (the 4 corners plus the center) from the picture, then
+    flip each crop horizontally to get 10 cropped images, which makes the
+    prediction more robust.
+    Args:
+        target_size(int | tuple[int]): (w, h) of the target size for the crop.
+    """
+    def __init__(self, target_size):
+        self.target_size = (target_size, target_size)
+
+    def __call__(self, results):
+        imgs = results['imgs']
+        img_w, img_h = imgs[0].size
+        crop_w, crop_h = self.target_size
+        w_step = (img_w - crop_w) // 4
+        h_step = (img_h - crop_h) // 4
+        offsets = [
+            (0, 0),
+            (4 * w_step, 0),
+            (0, 4 * h_step),
+            (4 * w_step, 4 * h_step),
+            (2 * w_step, 2 * h_step),
+        ]
+        img_crops = list()
+        for x_offset, y_offset in offsets:
+            crop = [
+                img.crop(
+                    (x_offset, y_offset, x_offset + crop_w, y_offset + crop_h))
+                for img in imgs
+            ]
+            crop_fliped = [
+                timg.transpose(Image.FLIP_LEFT_RIGHT) for timg in crop
+            ]
+            img_crops.extend(crop)
+            img_crops.extend(crop_fliped)
+
+        results['imgs'] = img_crops
+        return results
--
GitLab