diff --git a/hub_module/modules/video/classification/videotag_tsn_lstm/README.md b/hub_module/modules/video/classification/videotag_tsn_lstm/README.md
index 2748164a7ccc911da85852ffa46e985701e4c6c7..225b51ac86ed2c0f72bbf27658b2100f8c823436 100644
--- a/hub_module/modules/video/classification/videotag_tsn_lstm/README.md
+++ b/hub_module/modules/video/classification/videotag_tsn_lstm/README.md
@@ -1,9 +1,8 @@
```shell
$ hub install videotag_tsn_lstm==1.0.0
```
-
-
-
+![image](https://paddlehub.bj.bcebos.com/model/video/video_classifcation/VideoTag_TSN_AttentionLSTM.png)
+
具体网络结构可参考论文[TSN](https://arxiv.org/abs/1608.00859)和[AttentionLSTM](https://arxiv.org/abs/1503.08909)。
## 命令行预测示例
@@ -16,10 +15,10 @@ hub run videotag_tsn_lstm --input_path 1.mp4 --use_gpu False
## API
```python
-def classification(paths,
- use_gpu=False,
- threshold=0.5,
- top_k=10)
+def classify(paths,
+ use_gpu=False,
+ threshold=0.5,
+ top_k=10)
```
用于视频分类预测
@@ -46,9 +45,8 @@ import paddlehub as hub
videotag = hub.Module(name="videotag_tsn_lstm")
# execute predict and print the result
-results = videotag.classification(paths=["1.mp4","2.mp4"], use_gpu=True)
-for result in results:
- print(result)
+results = videotag.classify(paths=["1.mp4","2.mp4"], use_gpu=True)
+print(results)
```
## 依赖
diff --git a/hub_module/modules/video/classification/videotag_tsn_lstm/module.py b/hub_module/modules/video/classification/videotag_tsn_lstm/module.py
index dc70f8f200d38e0d2052cb4519d5d884dcacbc72..f0988172e81d1594e10c5373637daa5493f3906b 100644
--- a/hub_module/modules/video/classification/videotag_tsn_lstm/module.py
+++ b/hub_module/modules/video/classification/videotag_tsn_lstm/module.py
@@ -88,12 +88,9 @@ class VideoTag(hub.Module):
extractor_model.load_test_weights(exe, args.extractor_weights,
extractor_main_prog)
- # get reader and metrics
- extractor_reader = get_reader("TSN", 'infer',
- extractor_infer_config)
extractor_feeder = fluid.DataFeeder(
place=place, feed_list=extractor_feeds)
- return extractor_reader, extractor_main_prog, extractor_fetch_list, extractor_feeder, extractor_scope
+ return extractor_main_prog, extractor_fetch_list, extractor_feeder, extractor_scope
def _predictor(self, args, exe, place):
predictor_scope = fluid.Scope()
@@ -129,11 +126,10 @@ class VideoTag(hub.Module):
@runnable
def run_cmd(self, argsv):
args = self.parser.parse_args(argsv)
- results = self.classification(
- paths=[args.input_path], use_gpu=args.use_gpu)
+ results = self.classify(paths=[args.input_path], use_gpu=args.use_gpu)
return results
- def classification(self, paths, use_gpu=False, threshold=0.5, top_k=10):
+ def classify(self, paths, use_gpu=False, threshold=0.5, top_k=10):
"""
API of Classification.
@@ -169,15 +165,20 @@ class VideoTag(hub.Module):
self.place = fluid.CUDAPlace(
0) if args.use_gpu else fluid.CPUPlace()
self.exe = fluid.Executor(self.place)
- self.extractor_reader, self.extractor_main_prog, self.extractor_fetch_list, self.extractor_feeder, self.extractor_scope = self._extractor(
+ self.extractor_main_prog, self.extractor_fetch_list, self.extractor_feeder, self.extractor_scope = self._extractor(
args, self.exe, self.place)
self.predictor_main_prog, self.predictor_fetch_list, self.predictor_feeder, self.predictor_scope = self._predictor(
args, self.exe, self.place)
self._has_load = True
+ extractor_config = parse_config(args.extractor_config)
+ extractor_infer_config = merge_configs(extractor_config, 'infer',
+ vars(args))
+ extractor_reader = get_reader("TSN", 'infer', extractor_infer_config)
feature_list = []
file_list = []
- for idx, data in enumerate(self.extractor_reader()):
+
+ for idx, data in enumerate(extractor_reader()):
file_id = [item[-1] for item in data]
feed_data = [item[:-1] for item in data]
feature_out = self.exe.run(
diff --git a/hub_module/modules/video/classification/videotag_tsn_lstm/resource/reader/kinetics_reader.py b/hub_module/modules/video/classification/videotag_tsn_lstm/resource/reader/kinetics_reader.py
index ccf59ea26a6bf30be063941dd27f360c9f487e77..2c42da5c4d3216bd748cb35e6fae45e52493aa7e 100644
--- a/hub_module/modules/video/classification/videotag_tsn_lstm/resource/reader/kinetics_reader.py
+++ b/hub_module/modules/video/classification/videotag_tsn_lstm/resource/reader/kinetics_reader.py
@@ -87,38 +87,15 @@ class KineticsReader(DataReader):
def _batch_reader():
batch_out = []
for imgs, label in _reader():
- #for imgs in _reader():
if imgs is None:
continue
batch_out.append((imgs, label))
- #batch_out.append((imgs,))
if len(batch_out) == self.batch_size:
yield batch_out
batch_out = []
return _batch_reader
- def _inference_reader_creator(self, video_path, mode, seg_num, seglen,
- short_size, target_size, img_mean, img_std):
- def reader():
- try:
- imgs = mp4_loader(video_path, seg_num, seglen, mode)
- if len(imgs) < 1:
- logger.error('{} frame length {} less than 1.'.format(
- video_path, len(imgs)))
- yield None, None
- except:
- logger.error('Error when loading {}'.format(video_path))
- yield None, None
-
- imgs_ret = imgs_transform(imgs, mode, seg_num, seglen, short_size,
- target_size, img_mean, img_std)
- label_ret = video_path
-
- yield imgs_ret, label_ret
-
- return reader
-
def _reader_creator(self,
pickle_list,
mode,
@@ -149,37 +126,7 @@ class KineticsReader(DataReader):
return imgs_transform(imgs, mode, seg_num, seglen, \
short_size, target_size, img_mean, img_std, name = self.name), mp4_path
- def decode_pickle(sample, mode, seg_num, seglen, short_size,
- target_size, img_mean, img_std):
- pickle_path = sample[0]
- try:
- if python_ver < (3, 0):
- data_loaded = pickle.load(open(pickle_path, 'rb'))
- else:
- data_loaded = pickle.load(
- open(pickle_path, 'rb'), encoding='bytes')
-
- vid, label, frames = data_loaded
- if len(frames) < 1:
- logger.error('{} frame length {} less than 1.'.format(
- pickle_path, len(frames)))
- return None, None
- except:
- logger.info('Error when loading {}'.format(pickle_path))
- return None, None
-
- if mode == 'train' or mode == 'valid' or mode == 'test':
- ret_label = label
- elif mode == 'infer':
- ret_label = vid
-
- imgs = video_loader(frames, seg_num, seglen, mode)
- return imgs_transform(imgs, mode, seg_num, seglen, \
- short_size, target_size, img_mean, img_std, name = self.name), ret_label
-
def reader():
- # with open(pickle_list) as flist:
- # lines = [line.strip() for line in flist]
lines = [line.strip() for line in pickle_list]
if shuffle:
random.shuffle(lines)
@@ -187,15 +134,8 @@ class KineticsReader(DataReader):
pickle_path = line.strip()
yield [pickle_path]
- if format == 'pkl':
- decode_func = decode_pickle
- elif format == 'mp4':
- decode_func = decode_mp4
- else:
- raise "Not implemented format {}".format(format)
-
mapper = functools.partial(
- decode_func,
+ decode_mp4,
mode=mode,
seg_num=seg_num,
seglen=seglen,
@@ -218,142 +158,26 @@ def imgs_transform(imgs,
name=''):
imgs = group_scale(imgs, short_size)
- if mode == 'train':
- if name == "TSM":
- imgs = group_multi_scale_crop(imgs, short_size)
- imgs = group_random_crop(imgs, target_size)
- imgs = group_random_flip(imgs)
- else:
- imgs = group_center_crop(imgs, target_size)
-
- np_imgs = (np.array(imgs[0]).astype('float32').transpose(
- (2, 0, 1))).reshape(1, 3, target_size, target_size) / 255
- for i in range(len(imgs) - 1):
- img = (np.array(imgs[i + 1]).astype('float32').transpose(
- (2, 0, 1))).reshape(1, 3, target_size, target_size) / 255
- np_imgs = np.concatenate((np_imgs, img))
- imgs = np_imgs
- imgs -= img_mean
- imgs /= img_std
- imgs = np.reshape(imgs, (seg_num, seglen * 3, target_size, target_size))
-
- return imgs
-
-def group_multi_scale_crop(img_group, target_size, scales=None, \
- max_distort=1, fix_crop=True, more_fix_crop=True):
- scales = scales if scales is not None else [1, .875, .75, .66]
- input_size = [target_size, target_size]
-
- im_size = img_group[0].size
-
- # get random crop offset
- def _sample_crop_size(im_size):
- image_w, image_h = im_size[0], im_size[1]
-
- base_size = min(image_w, image_h)
- crop_sizes = [int(base_size * x) for x in scales]
- crop_h = [
- input_size[1] if abs(x - input_size[1]) < 3 else x
- for x in crop_sizes
- ]
- crop_w = [
- input_size[0] if abs(x - input_size[0]) < 3 else x
- for x in crop_sizes
- ]
-
- pairs = []
- for i, h in enumerate(crop_h):
- for j, w in enumerate(crop_w):
- if abs(i - j) <= max_distort:
- pairs.append((w, h))
-
- crop_pair = random.choice(pairs)
- if not fix_crop:
- w_offset = random.randint(0, image_w - crop_pair[0])
- h_offset = random.randint(0, image_h - crop_pair[1])
- else:
- w_step = (image_w - crop_pair[0]) / 4
- h_step = (image_h - crop_pair[1]) / 4
-
- ret = list()
- ret.append((0, 0)) # upper left
- if w_step != 0:
- ret.append((4 * w_step, 0)) # upper right
- if h_step != 0:
- ret.append((0, 4 * h_step)) # lower left
- if h_step != 0 and w_step != 0:
- ret.append((4 * w_step, 4 * h_step)) # lower right
- if h_step != 0 or w_step != 0:
- ret.append((2 * w_step, 2 * h_step)) # center
-
- if more_fix_crop:
- ret.append((0, 2 * h_step)) # center left
- ret.append((4 * w_step, 2 * h_step)) # center right
- ret.append((2 * w_step, 4 * h_step)) # lower center
- ret.append((2 * w_step, 0 * h_step)) # upper center
-
- ret.append((1 * w_step, 1 * h_step)) # upper left quarter
- ret.append((3 * w_step, 1 * h_step)) # upper right quarter
- ret.append((1 * w_step, 3 * h_step)) # lower left quarter
- ret.append((3 * w_step, 3 * h_step)) # lower righ quarter
-
- w_offset, h_offset = random.choice(ret)
+ np_imgs = np.array([np.array(img).astype('float32') for img in imgs]) #dhwc
+ np_imgs = group_center_crop(np_imgs, target_size)
+ np_imgs = np_imgs.transpose(0, 3, 1, 2) / 255 #dchw
+ np_imgs -= img_mean
+ np_imgs /= img_std
- return crop_pair[0], crop_pair[1], w_offset, h_offset
+ return np_imgs
- crop_w, crop_h, offset_w, offset_h = _sample_crop_size(im_size)
- crop_img_group = [
- img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h))
- for img in img_group
- ]
- ret_img_group = [
- img.resize((input_size[0], input_size[1]), Image.BILINEAR)
- for img in crop_img_group
- ]
- return ret_img_group
-
-
-def group_random_crop(img_group, target_size):
- w, h = img_group[0].size
+def group_center_crop(np_imgs, target_size):
+ d, h, w, c = np_imgs.shape
th, tw = target_size, target_size
-
assert (w >= target_size) and (h >= target_size), \
- "image width({}) and height({}) should be larger than crop size".format(w, h, target_size)
-
- out_images = []
- x1 = random.randint(0, w - tw)
- y1 = random.randint(0, h - th)
+ "image width({}) and height({}) should be larger than crop size".format(w, h, target_size)
- for img in img_group:
- if w == tw and h == th:
- out_images.append(img)
- else:
- out_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))
-
- return out_images
-
-
-def group_random_flip(img_group):
- v = random.random()
- if v < 0.5:
- ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]
- return ret
- else:
- return img_group
-
-
-def group_center_crop(img_group, target_size):
- img_crop = []
- for img in img_group:
- w, h = img.size
- th, tw = target_size, target_size
- assert (w >= target_size) and (h >= target_size), \
- "image width({}) and height({}) should be larger than crop size".format(w, h, target_size)
- x1 = int(round((w - tw) / 2.))
- y1 = int(round((h - th) / 2.))
- img_crop.append(img.crop((x1, y1, x1 + tw, y1 + th)))
+ h_off = int(round((h - th) / 2.))
+ w_off = int(round((w - tw) / 2.))
+ img_crop = np_imgs[:, h_off:h_off + target_size, w_off:w_off +
+ target_size, :]
return img_crop
@@ -378,47 +202,6 @@ def group_scale(imgs, target_size):
return resized_imgs
-def imageloader(buf):
- if isinstance(buf, str):
- img = Image.open(StringIO(buf))
- else:
- img = Image.open(BytesIO(buf))
-
- return img.convert('RGB')
-
-
-def video_loader(frames, nsample, seglen, mode):
- videolen = len(frames)
- average_dur = int(videolen / nsample)
-
- imgs = []
- for i in range(nsample):
- idx = 0
- if mode == 'train':
- if average_dur >= seglen:
- idx = random.randint(0, average_dur - seglen)
- idx += i * average_dur
- elif average_dur >= 1:
- idx += i * average_dur
- else:
- idx = i
- else:
- if average_dur >= seglen:
- idx = (average_dur - seglen) // 2
- idx += i * average_dur
- elif average_dur >= 1:
- idx += i * average_dur
- else:
- idx = i
-
- for jj in range(idx, idx + seglen):
- imgbuf = frames[int(jj % videolen)]
- img = imageloader(imgbuf)
- imgs.append(img)
-
- return imgs
-
-
def mp4_loader(filepath, nsample, seglen, mode):
cap = cv2.VideoCapture(filepath)
videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
@@ -434,26 +217,16 @@ def mp4_loader(filepath, nsample, seglen, mode):
imgs = []
for i in range(nsample):
idx = 0
- if mode == 'train':
- if average_dur >= seglen:
- idx = random.randint(0, average_dur - seglen)
- idx += i * average_dur
- elif average_dur >= 1:
- idx += i * average_dur
- else:
- idx = i
+ if average_dur >= seglen:
+ idx = (average_dur - 1) // 2
+ idx += i * average_dur
+ elif average_dur >= 1:
+ idx += i * average_dur
else:
- if average_dur >= seglen:
- idx = (average_dur - 1) // 2
- idx += i * average_dur
- elif average_dur >= 1:
- idx += i * average_dur
- else:
- idx = i
+ idx = i
for jj in range(idx, idx + seglen):
imgbuf = sampledFrames[int(jj % len(sampledFrames))]
img = Image.fromarray(imgbuf, mode='RGB')
imgs.append(img)
-
return imgs