From 203b6e1939f85f1428c11856022170dc140105f0 Mon Sep 17 00:00:00 2001 From: huangjun12 <2399845970@qq.com> Date: Mon, 14 Oct 2019 20:10:46 +0800 Subject: [PATCH] add ETS model to release 1.6 (#3565) * add caption model ETS for PaddleVideo * modify dynamic rnn to static rnn, and add infer past * modify reader and configs for evaluation and validation * modify details of the ets net code * modify API of data and dataloader for ets net --- PaddleCV/PaddleVideo/configs/ets.yaml | 46 ++++ PaddleCV/PaddleVideo/eval.py | 14 +- .../metrics/ets_metrics/__init__.py | 0 .../metrics/ets_metrics/ets_metrics.py | 91 +++++++ PaddleCV/PaddleVideo/metrics/metrics_util.py | 40 +++ PaddleCV/PaddleVideo/models/__init__.py | 2 + PaddleCV/PaddleVideo/models/ets/__init__.py | 1 + PaddleCV/PaddleVideo/models/ets/ets.py | 192 ++++++++++++++ PaddleCV/PaddleVideo/models/ets/ets_net.py | 239 ++++++++++++++++++ PaddleCV/PaddleVideo/predict.py | 22 +- PaddleCV/PaddleVideo/reader/__init__.py | 2 + PaddleCV/PaddleVideo/reader/ets_reader.py | 170 +++++++++++++ PaddleCV/PaddleVideo/train.py | 10 +- 13 files changed, 820 insertions(+), 9 deletions(-) create mode 100644 PaddleCV/PaddleVideo/configs/ets.yaml create mode 100644 PaddleCV/PaddleVideo/metrics/ets_metrics/__init__.py create mode 100644 PaddleCV/PaddleVideo/metrics/ets_metrics/ets_metrics.py create mode 100644 PaddleCV/PaddleVideo/models/ets/__init__.py create mode 100644 PaddleCV/PaddleVideo/models/ets/ets.py create mode 100644 PaddleCV/PaddleVideo/models/ets/ets_net.py create mode 100644 PaddleCV/PaddleVideo/reader/ets_reader.py diff --git a/PaddleCV/PaddleVideo/configs/ets.yaml b/PaddleCV/PaddleVideo/configs/ets.yaml new file mode 100644 index 00000000..733f8d60 --- /dev/null +++ b/PaddleCV/PaddleVideo/configs/ets.yaml @@ -0,0 +1,46 @@ +MODEL: + name: "ETS" + feat_size: 2048 + fc_dim: 1024 + gru_hidden_dim: 512 + decoder_size: 512 + word_emb_dim: 512 + max_length: 80 + beam_size: 3 + START: "" + END: "" + UNK: "" + feat_path: './data/dataset/ets/data_dict' + dict_file: './data/dataset/ets/dict.txt' + +TRAIN: + epoch: 40 + batch_size: 256 + l2_weight_decay: 1e-4 + clip_norm: 5.0 + num_threads: 8 + buffer_size: 1024 + filelist: './data/dataset/ets/train.list' + + use_gpu: True + num_gpus: 4 + +VALID: + filelist: './data/dataset/ets/val.list' + batch_size: 256 + num_threads: 8 + buffer_size: 1024 + use_gpu: True + num_gpus: 4 + +TEST: + filelist: './data/dataset/ets/val.list' + batch_size: 1 + num_threads: 1 + buffer_size: 1024 + +INFER: + filelist: './data/dataset/ets/infer.list' + batch_size: 1 + num_threads: 1 + buffer_size: 1024 diff --git a/PaddleCV/PaddleVideo/eval.py b/PaddleCV/PaddleVideo/eval.py index 14ab4bb7..dfa3d2b5 100644 --- a/PaddleCV/PaddleVideo/eval.py +++ b/PaddleCV/PaddleVideo/eval.py @@ -92,6 +92,8 @@ def test(args): place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + if args.weights: assert os.path.exists( args.weights), "Given weight dir {} not exist.".format(args.weights) @@ -111,8 +113,16 @@ def test(args): epoch_period = [] for test_iter, data in enumerate(test_reader()): cur_time = time.time() - test_outs = exe.run(fetch_list=test_fetch_list, - feed=test_feeder.feed(data)) + if args.model_name == 'ETS': + feat_data = [items[:3] for items in data] + vinfo = [items[3:] for items in data] + test_outs = exe.run(fetch_list=test_fetch_list, + feed=test_feeder.feed(feat_data), + return_numpy=False) + test_outs += vinfo + else: + test_outs = exe.run(fetch_list=test_fetch_list, + feed=test_feeder.feed(data)) period = time.time() - cur_time epoch_period.append(period) test_metrics.accumulate(test_outs) diff --git a/PaddleCV/PaddleVideo/metrics/ets_metrics/__init__.py b/PaddleCV/PaddleVideo/metrics/ets_metrics/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/PaddleCV/PaddleVideo/metrics/ets_metrics/ets_metrics.py b/PaddleCV/PaddleVideo/metrics/ets_metrics/ets_metrics.py new file mode 100644 index 00000000..35fbf66d --- /dev/null +++ b/PaddleCV/PaddleVideo/metrics/ets_metrics/ets_metrics.py @@ -0,0 +1,91 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and + +import numpy as np +import datetime +import logging +import json +import os + +from models.ctcn.ctcn_utils import BoxCoder + +logger = logging.getLogger(__name__) + + +class MetricsCalculator(): + def __init__(self, + name='ETS', + mode='train', + dict_file ='' + ): + self.name = name + self.mode = mode # 'train', 'valid', 'test', 'infer' + self.dict_file = dict_file + self.reset() + + def reset(self): + logger.info('Resetting {} metrics...'.format(self.mode)) + self.aggr_batch_size = 0 + if (self.mode == 'train') or (self.mode == 'valid'): + self.aggr_loss = 0.0 + elif (self.mode == 'test') or (self.mode == 'infer'): + self.result_dict = dict() + self.out_file = self.name + '_' + self.mode + '_res_' + '.json' + + def accumulate(self, fetch_list): + if self.mode == 'valid': + loss = fetch_list[0] + self.aggr_loss += np.mean(np.array(loss)) + elif (self.mode == 'test') or (self.mode == 'infer'): + seq_ids = fetch_list[0] + seq_scores = fetch_list[1] + vid = fetch_list[2][0] + stime = fetch_list[2][1] + etime = fetch_list[2][2] + + #get idx_to_word + self.idx_to_word = dict() + with open(self.dict_file, 'r') as f: + for i, line in enumerate(f): + self.idx_to_word[i] = line.strip().split()[0] + + for i in range(len(seq_ids.lod()[0]) - 1): + start = seq_ids.lod()[0][i] + end = seq_ids.lod()[0][i + 1] + for j in range(end - start)[:1]: + sub_start = seq_ids.lod()[1][start + j] + sub_end = seq_ids.lod()[1][start + j + 1] + sent = " ".join([self.idx_to_word[idx] + for idx in np.array(seq_ids)[sub_start:sub_end][1:-1]]) + if vid not in self.result_dict: + self.result_dict[vid] = [{'timestamp': [stime, etime], 'sentence': sent}] + else: + self.result_dict[vid].append({'timestamp': [stime, etime], 'sentence': sent}) + + def accumulate_infer_results(self, fetch_list): + # the same as test + pass + + def finalize_metrics(self, savedir): + self.filepath = os.path.join(savedir, self.out_file) + with open(self.filepath, 'w') as f: + f.write(json.dumps({'version': 'VERSION 1.0', 'results': self.result_dict, 'external_data': {}}, indent=2)) + logger.info('results has been saved into file: {}'.format(self.filepath)) + + + def finalize_infer_metrics(self, savedir): + # the same as test + pass + + def get_computed_metrics(self): + pass diff --git a/PaddleCV/PaddleVideo/metrics/metrics_util.py b/PaddleCV/PaddleVideo/metrics/metrics_util.py index 15311538..9ec25e2f 100644 --- a/PaddleCV/PaddleVideo/metrics/metrics_util.py +++ b/PaddleCV/PaddleVideo/metrics/metrics_util.py @@ -28,6 +28,7 @@ from metrics.detections import detection_metrics as detection_metrics from metrics.bmn_metrics import bmn_proposal_metrics as bmn_proposal_metrics from metrics.bsn_metrics import bsn_tem_metrics as bsn_tem_metrics from metrics.bsn_metrics import bsn_pem_metrics as bsn_pem_metrics +from metrics.ets_metrics import ets_metrics as ets_metrics logger = logging.getLogger(__name__) @@ -421,6 +422,44 @@ class BsnPemMetrics(Metrics): self.calculator.reset() +class ETSMetrics(Metrics): + def __init__(self, name, mode, cfg): + self.name = name + self.mode = mode + args = {} + args['dict_file'] = cfg.MODEL.dict_file + args['mode'] = mode + args['name'] = name + self.calculator = ets_metrics.MetricsCalculator(**args) + + + def calculate_and_log_out(self, fetch_list, info=''): + if (self.mode == 'train') or (self.mode == 'valid'): + loss = np.array(fetch_list[0]) + logger.info( + info + '\tLoss = {}'.format('%.08f' % np.mean(loss))) + elif self.mode == "test": + translation_ids = np.array(fetch_list[0]) + translation_scores = np.array(fetch_list[1]) + logger.info( + info + '\ttranslation_ids = {}, \ttranslation_scores = {}'.format( + '%.01f' % np.mean(translation_ids), '%.04f' % np.mean(translation_scores))) + + def accumulate(self, fetch_list): + self.calculator.accumulate(fetch_list) + + def finalize_and_log_out(self, info='', savedir='./'): + if self.mode == 'valid': + logger.info(info) + else: #test or infer + self.calculator.finalize_metrics(savedir) + if self.mode == 'test': + logger.info(info + 'please refer to metrics/ets_metrics/README.md to get accuracy') + + def reset(self): + self.calculator.reset() + + class MetricsZoo(object): def __init__(self): self.metrics_zoo = {} @@ -461,3 +500,4 @@ regist_metrics("CTCN", DetectionMetrics) regist_metrics("BMN", BmnMetrics) regist_metrics("BSNTEM", BsnTemMetrics) regist_metrics("BSNPEM", BsnPemMetrics) +regist_metrics("ETS", ETSMetrics) diff --git a/PaddleCV/PaddleVideo/models/__init__.py b/PaddleCV/PaddleVideo/models/__init__.py index a6a86606..a5e01826 100644 --- a/PaddleCV/PaddleVideo/models/__init__.py +++ b/PaddleCV/PaddleVideo/models/__init__.py @@ -10,6 +10,7 @@ from .ctcn import CTCN from .bmn import BMN from .bsn import BsnTem from .bsn import BsnPem +from .ets import ETS # regist models, sort by alphabet regist_model("AttentionCluster", AttentionCluster) @@ -23,3 +24,4 @@ regist_model("CTCN", CTCN) regist_model("BMN", BMN) regist_model("BsnTem", BsnTem) regist_model("BsnPem", BsnPem) +regist_model("ETS", ETS) diff --git a/PaddleCV/PaddleVideo/models/ets/__init__.py b/PaddleCV/PaddleVideo/models/ets/__init__.py new file mode 100644 index 00000000..e3b58cd8 --- /dev/null +++ b/PaddleCV/PaddleVideo/models/ets/__init__.py @@ -0,0 +1 @@ +from .ets import * diff --git a/PaddleCV/PaddleVideo/models/ets/ets.py b/PaddleCV/PaddleVideo/models/ets/ets.py new file mode 100644 index 00000000..41a012e2 --- /dev/null +++ b/PaddleCV/PaddleVideo/models/ets/ets.py @@ -0,0 +1,192 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import paddle +import paddle.fluid as fluid +from paddle.fluid import ParamAttr +import numpy as np + +from ..model import ModelBase +from . import ets_net + +import logging +logger = logging.getLogger(__name__) + +__all__ = ["ETS"] + + +class ETS(ModelBase): + """ETS model""" + + def __init__(self, name, cfg, mode='train'): + super(ETS, self).__init__(name, cfg, mode=mode) + self.get_config() + + def get_config(self): + self.feat_size = self.get_config_from_sec('MODEL', 'feat_size') + self.fc_dim = self.get_config_from_sec('MODEL', 'fc_dim') + self.gru_hidden_dim = self.get_config_from_sec('MODEL', + 'gru_hidden_dim') + self.decoder_size = self.get_config_from_sec('MODEL', 'decoder_size') + self.word_emb_dim = self.get_config_from_sec('MODEL', 'word_emb_dim') + self.dict_file = self.get_config_from_sec('MODEL', 'dict_file') + self.max_length = self.get_config_from_sec('MODEL', 'max_length') + self.beam_size = self.get_config_from_sec('MODEL', 'beam_size') + + self.num_epochs = self.get_config_from_sec('train', 'epoch') + self.l2_weight_decay = self.get_config_from_sec('train', + 'l2_weight_decay') + self.clip_norm = self.get_config_from_sec('train', 'clip_norm') + + def build_input(self, use_dataloader=True): + feat_shape = [None, self.feat_size] + word_shape = [None, 1] + word_next_shape = [None, 1] + + # set init data to None + py_reader = None + feat = None + word = None + word_next = None + init_ids = None + init_scores = None + + self.use_dataloader = use_dataloader + feat = fluid.data( + name='feat', shape=feat_shape, dtype='float32', lod_level=1) + + feed_list = [] + feed_list.append(feat) + if (self.mode == 'train') or (self.mode == 'valid'): + word = fluid.data( + name='word', shape=word_shape, dtype='int64', lod_level=1) + word_next = fluid.data( + name='word_next', + shape=word_next_shape, + dtype='int64', + lod_level=1) + feed_list.append(word) + feed_list.append(word_next) + elif (self.mode == 'test') or (self.mode == 'infer'): + init_ids = fluid.data( + name="init_ids", shape=[None, 1], dtype="int64", lod_level=2) + init_scores = fluid.data( + name="init_scores", + shape=[None, 1], + dtype="float32", + lod_level=2) + else: + raise NotImplementedError('mode {} not implemented'.format( + self.mode)) + + if use_dataloader: + assert self.mode != 'infer', \ + 'dataloader is not recommendated when infer, please set use_dataloader to be false.' + self.dataloader = fluid.io.DataLoader.from_generator( + feed_list=feed_list, capacity=16, iterable=True) + + self.feature_input = [feat] + self.word = word + self.word_next = word_next + self.init_ids = init_ids + self.init_scores = init_scores + + def create_model_args(self): + cfg = {} + cfg['feat_size'] = self.feat_size + cfg['fc_dim'] = self.fc_dim + cfg['gru_hidden_dim'] = self.gru_hidden_dim + cfg['decoder_size'] = self.decoder_size + cfg['word_emb_dim'] = self.word_emb_dim + word_dict = dict() + with open(self.dict_file, 'r') as f: + for i, line in enumerate(f): + word_dict[line.strip().split()[0]] = i + dict_size = len(word_dict) + cfg['dict_size'] = dict_size + cfg['max_length'] = self.max_length + cfg['beam_size'] = self.beam_size + + return cfg + + def build_model(self): + cfg = self.create_model_args() + self.videomodel = ets_net.ETSNET( + feat_size=cfg['feat_size'], + fc_dim=cfg['fc_dim'], + gru_hidden_dim=cfg['gru_hidden_dim'], + decoder_size=cfg['decoder_size'], + word_emb_dim=cfg['word_emb_dim'], + dict_size=cfg['dict_size'], + max_length=cfg['max_length'], + beam_size=cfg['beam_size'], + mode=self.mode) + if (self.mode == 'train') or (self.mode == 'valid'): + prob = self.videomodel.net(self.feature_input[0], self.word) + self.network_outputs = [prob] + elif (self.mode == 'test') or (self.mode == 'infer'): + translation_ids, translation_scores = self.videomodel.net( + self.feature_input[0], self.init_ids, self.init_scores) + self.network_outputs = [translation_ids, translation_scores] + + def optimizer(self): + l2_weight_decay = self.l2_weight_decay + clip_norm = self.clip_norm + + fluid.clip.set_gradient_clip( + clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm)) + lr_decay = fluid.layers.learning_rate_scheduler.noam_decay( + self.gru_hidden_dim, 1000) + optimizer = fluid.optimizer.Adam( + learning_rate=lr_decay, + regularization=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=l2_weight_decay)) + + return optimizer + + def loss(self): + assert self.mode != 'infer', "invalid loss calculationg in infer mode" + self.loss_ = self.videomodel.loss(self.network_outputs[0], + self.word_next) + return self.loss_ + + def outputs(self): + return self.network_outputs + + def feeds(self): + if (self.mode == 'train') or (self.mode == 'valid'): + return self.feature_input + [self.word, self.word_next] + elif (self.mode == 'test') or (self.mode == 'infer'): + return self.feature_input + [self.init_ids, self.init_scores] + else: + raise NotImplementedError('mode {} not implemented'.format( + self.mode)) + + def fetches(self): + if (self.mode == 'train') or (self.mode == 'valid'): + losses = self.loss() + fetch_list = [item for item in losses] + elif (self.mode == 'test') or (self.mode == 'infer'): + preds = self.outputs() + fetch_list = [item for item in preds] + else: + raise NotImplementedError('mode {} not implemented'.format( + self.mode)) + return fetch_list + + def pretrain_info(self): + return (None, None) + + def weights_info(self): + pass diff --git a/PaddleCV/PaddleVideo/models/ets/ets_net.py b/PaddleCV/PaddleVideo/models/ets/ets_net.py new file mode 100644 index 00000000..974b140b --- /dev/null +++ b/PaddleCV/PaddleVideo/models/ets/ets_net.py @@ -0,0 +1,239 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import paddle.fluid as fluid +from paddle.fluid import ParamAttr +import numpy as np + +DATATYPE = 'float32' + + +class ETSNET(object): + def __init__(self, + feat_size, + fc_dim, + gru_hidden_dim, + max_length, + beam_size, + decoder_size, + word_emb_dim, + dict_size, + mode='train'): + self.feat_size = feat_size + self.fc_dim = fc_dim + self.gru_hidden_dim = gru_hidden_dim + self.decoder_size = decoder_size + self.word_emb_dim = word_emb_dim + self.dict_size = dict_size + self.max_length = max_length + self.beam_size = beam_size + self.mode = mode + + def encoder(self, feat): + bias_attr = fluid.ParamAttr( + regularizer=fluid.regularizer.L2Decay(0.0), + initializer=fluid.initializer.NormalInitializer(scale=0.0)) + + input_fc = fluid.layers.fc(input=feat, + size=self.fc_dim, + act='tanh', + bias_attr=bias_attr) + gru_forward_fc = fluid.layers.fc(input=input_fc, + size=self.gru_hidden_dim * 3, + bias_attr=False) + gru_forward = fluid.layers.dynamic_gru( + input=gru_forward_fc, size=self.gru_hidden_dim, is_reverse=False) + gru_backward_fc = fluid.layers.fc(input=input_fc, + size=self.gru_hidden_dim * 3, + bias_attr=False) + gru_backward = fluid.layers.dynamic_gru( + input=gru_backward_fc, size=self.gru_hidden_dim, is_reverse=True) + encoded_sequence = fluid.layers.concat( + input=[gru_forward, gru_backward], axis=1) + gru_weights = fluid.layers.fc(input=encoded_sequence, + size=1, + act='sequence_softmax', + bias_attr=False) + gru_scaled = fluid.layers.elementwise_mul( + x=encoded_sequence, y=gru_weights, axis=0) + encoded_vector = fluid.layers.sequence_pool( + input=gru_scaled, pool_type='sum') + encoded_proj = fluid.layers.fc(input=encoded_sequence, + size=self.decoder_size, + bias_attr=False) + return encoded_sequence, encoded_vector, encoded_proj + + def cell(self, x, hidden, encoder_out, encoder_out_proj): + def simple_attention(encoder_vec, encoder_proj, decoder_state): + decoder_state_proj = fluid.layers.fc(input=decoder_state, + size=self.decoder_size, + bias_attr=False) + decoder_state_expand = fluid.layers.sequence_expand( + x=decoder_state_proj, y=encoder_proj) + mixed_state = fluid.layers.elementwise_add(encoder_proj, + decoder_state_expand) + attention_weights = fluid.layers.fc(input=mixed_state, + size=1, + bias_attr=False) + attention_weights = fluid.layers.sequence_softmax( + input=attention_weights) + weigths_reshape = fluid.layers.reshape( + x=attention_weights, shape=[-1]) + scaled = fluid.layers.elementwise_mul( + x=encoder_vec, y=weigths_reshape, axis=0) + context = fluid.layers.sequence_pool(input=scaled, pool_type='sum') + + return context + + context = simple_attention(encoder_out, encoder_out_proj, hidden) + out = fluid.layers.fc(input=[x, context], + size=self.decoder_size * 3, + bias_attr=False) + out = fluid.layers.gru_unit( + input=out, hidden=hidden, size=self.decoder_size * 3)[0] + return out, out + + def train_decoder(self, word, encoded_sequence, encoded_vector, + encoded_proj): + decoder_boot = fluid.layers.fc(input=encoded_vector, + size=self.decoder_size, + act='tanh', + bias_attr=False) + word_embedding = fluid.layers.embedding( + input=word, size=[self.dict_size, self.word_emb_dim]) + + pad_value = fluid.layers.assign(input=np.array([0.], dtype=np.float32)) + word_embedding, length = fluid.layers.sequence_pad(word_embedding, + pad_value) + word_embedding = fluid.layers.transpose(word_embedding, [1, 0, 2]) + + rnn = fluid.layers.StaticRNN() + with rnn.step(): + x = rnn.step_input(word_embedding) + pre_state = rnn.memory(init=decoder_boot) + out, current_state = self.cell(x, pre_state, encoded_sequence, + encoded_proj) + prob = fluid.layers.fc(input=out, + size=self.dict_size, + act='softmax') + + rnn.update_memory(pre_state, current_state) + rnn.step_output(prob) + + rnn_out = rnn() + rnn_out = fluid.layers.transpose(rnn_out, [1, 0, 2]) + + length = fluid.layers.reshape(length, [-1]) + rnn_out = fluid.layers.sequence_unpad(x=rnn_out, length=length) + + return rnn_out + + def infer_decoder(self, init_ids, init_scores, encoded_sequence, + encoded_vector, encoded_proj): + decoder_boot = fluid.layers.fc(input=encoded_vector, + size=self.decoder_size, + act='tanh', + bias_attr=False) + + max_len = fluid.layers.fill_constant( + shape=[1], dtype='int64', value=self.max_length) + counter = fluid.layers.zeros(shape=[1], dtype='int64', force_cpu=True) + + # create and init arrays to save selected ids, scores and states for each step + ids_array = fluid.layers.array_write(init_ids, i=counter) + scores_array = fluid.layers.array_write(init_scores, i=counter) + state_array = fluid.layers.array_write(decoder_boot, i=counter) + + cond = fluid.layers.less_than(x=counter, y=max_len) + while_op = fluid.layers.While(cond=cond) + with while_op.block(): + pre_ids = fluid.layers.array_read(array=ids_array, i=counter) + pre_score = fluid.layers.array_read(array=scores_array, i=counter) + pre_state = fluid.layers.array_read(array=state_array, i=counter) + + pre_ids_emb = fluid.layers.embedding( + input=pre_ids, size=[self.dict_size, self.word_emb_dim]) + + out, current_state = self.cell(pre_ids_emb, pre_state, + encoded_sequence, encoded_proj) + prob = fluid.layers.fc(input=out, + size=self.dict_size, + act='softmax') + + # beam search + topk_scores, topk_indices = fluid.layers.topk( + prob, k=self.beam_size) + accu_scores = fluid.layers.elementwise_add( + x=fluid.layers.log(topk_scores), + y=fluid.layers.reshape( + pre_score, shape=[-1]), + axis=0) + accu_scores = fluid.layers.lod_reset(x=accu_scores, y=pre_ids) + selected_ids, selected_scores = fluid.layers.beam_search( + pre_ids, + pre_score, + topk_indices, + accu_scores, + self.beam_size, + end_id=1) + + fluid.layers.increment(x=counter, value=1, in_place=True) + # save selected ids and corresponding scores of each step + fluid.layers.array_write(selected_ids, array=ids_array, i=counter) + fluid.layers.array_write( + selected_scores, array=scores_array, i=counter) + # update rnn state by sequence_expand acting as gather + current_state = fluid.layers.sequence_expand(current_state, + selected_scores) + fluid.layers.array_write( + current_state, array=state_array, i=counter) + current_enc_seq = fluid.layers.sequence_expand(encoded_sequence, + selected_scores) + fluid.layers.assign(current_enc_seq, encoded_sequence) + current_enc_proj = fluid.layers.sequence_expand(encoded_proj, + selected_scores) + fluid.layers.assign(current_enc_proj, encoded_proj) + + # update conditional variable + length_cond = fluid.layers.less_than(x=counter, y=max_len) + finish_cond = fluid.layers.logical_not( + fluid.layers.is_empty(x=selected_ids)) + fluid.layers.logical_and(x=length_cond, y=finish_cond, out=cond) + + translation_ids, translation_scores = fluid.layers.beam_search_decode( + ids=ids_array, + scores=scores_array, + beam_size=self.beam_size, + end_id=1) + + return translation_ids, translation_scores + + def net(self, feat, *input_decoder): + encoded_sequence, encoded_vector, encoded_proj = self.encoder(feat) + if (self.mode == 'train') or (self.mode == 'valid'): + word, = input_decoder + prob = self.train_decoder(word, encoded_sequence, encoded_vector, + encoded_proj) + return prob + else: + init_ids, init_scores = input_decoder + translation_ids, translation_scores = self.infer_decoder( + init_ids, init_scores, encoded_sequence, encoded_vector, + encoded_proj) + return translation_ids, translation_scores + + def loss(self, prob, word_next): + cost = fluid.layers.cross_entropy(input=prob, label=word_next) + avg_cost = fluid.layers.mean(cost) + return [avg_cost] diff --git a/PaddleCV/PaddleVideo/predict.py b/PaddleCV/PaddleVideo/predict.py index 1e5067eb..65eed7c1 100644 --- a/PaddleCV/PaddleVideo/predict.py +++ b/PaddleCV/PaddleVideo/predict.py @@ -109,6 +109,8 @@ def infer(args): place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + filelist = args.filelist or infer_config.INFER.filelist filepath = args.video_path or infer_config.INFER.get('filepath', '') if filepath != '': @@ -137,16 +139,26 @@ def infer(args): periods = [] cur_time = time.time() for infer_iter, data in enumerate(infer_reader()): - data_feed_in = [items[:-1] for items in data] - video_id = [items[-1] for items in data] - infer_outs = exe.run(fetch_list=fetch_list, - feed=infer_feeder.feed(data_feed_in)) + if args.model_name == 'ETS': + data_feed_in = [items[:3] for items in data] + vinfo = [items[3:] for items in data] + video_id = [items[0] for items in vinfo] + infer_outs = exe.run(fetch_list=fetch_list, + feed=infer_feeder.feed(data_feed_in), + return_numpy=False) + infer_result_list = infer_outs + vinfo + else: + data_feed_in = [items[:-1] for items in data] + video_id = [items[-1] for items in data] + infer_outs = exe.run(fetch_list=fetch_list, + feed=infer_feeder.feed(data_feed_in)) + infer_result_list = [item for item in infer_outs] + [video_id] + prev_time = cur_time cur_time = time.time() period = cur_time - prev_time periods.append(period) - infer_result_list = [item for item in infer_outs] + [video_id] infer_metrics.accumulate(infer_result_list) if args.log_interval > 0 and infer_iter % args.log_interval == 0: diff --git a/PaddleCV/PaddleVideo/reader/__init__.py b/PaddleCV/PaddleVideo/reader/__init__.py index 5a441f4c..a697e4bd 100644 --- a/PaddleCV/PaddleVideo/reader/__init__.py +++ b/PaddleCV/PaddleVideo/reader/__init__.py @@ -6,6 +6,7 @@ from .ctcn_reader import CTCNReader from .bmn_reader import BMNReader from .bsn_reader import BSNVideoReader from .bsn_reader import BSNProposalReader +from .ets_reader import ETSReader # regist reader, sort by alphabet regist_reader("ATTENTIONCLUSTER", FeatureReader) @@ -19,3 +20,4 @@ regist_reader("CTCN", CTCNReader) regist_reader("BMN", BMNReader) regist_reader("BSNTEM", BSNVideoReader) regist_reader("BSNPEM", BSNProposalReader) +regist_reader("ETS", ETSReader) diff --git a/PaddleCV/PaddleVideo/reader/ets_reader.py b/PaddleCV/PaddleVideo/reader/ets_reader.py new file mode 100644 index 00000000..745887ee --- /dev/null +++ b/PaddleCV/PaddleVideo/reader/ets_reader.py @@ -0,0 +1,170 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import os +import random +import sys +import numpy as np +import functools +import paddle + +import logging +logger = logging.getLogger(__name__) + +import pickle + +from .reader_utils import DataReader + +python_ver = sys.version_info + + +class ETSReader(DataReader): + """ + Data reader for ETS model, which was stored as features extracted by prior networks + """ + + def __init__(self, name, mode, cfg): + self.name = name + self.mode = mode + + self.feat_path = cfg.MODEL.feat_path + self.dict_file = cfg.MODEL.dict_file + self.START = cfg.MODEL.START + self.END = cfg.MODEL.END + self.UNK = cfg.MODEL.UNK + + self.filelist = cfg[mode.upper()]['filelist'] + self.batch_size = cfg[mode.upper()]['batch_size'] + self.num_threads = cfg[mode.upper()]['num_threads'] + self.buffer_size = cfg[mode.upper()]['buffer_size'] + if (mode == 'test') or (mode == 'infer'): + self.num_threads = 1 # set num_threads as 1 for test and infer + + def load_file(self): + word_dict = dict() + with open(self.dict_file, 'r') as f: + for i, line in enumerate(f): + word_dict[line.strip().split()[0]] = i + return word_dict + + def create_reader(self): + """reader creator for ets model""" + if self.mode == 'infer': + return self.make_infer_reader() + else: + return self.make_multiprocess_reader() + + def make_infer_reader(self): + """reader for inference""" + + def reader(): + batch_out = [] + + with open(self.filelist) as f: + lines = f.readlines() + reader_list = [ + line.strip() for line in lines if line.strip() != '' + ] + + word_dict = self.load_file() + for line in reader_list: + vid, stime, etime, sentence = line.split('\t') + stime, etime = float(stime), float(etime) + + if python_ver < (3, 0): + datas = pickle.load( + open(os.path.join(self.feat_path, vid), 'rb')) + else: + datas = pickle.load( + open(os.path.join(self.feat_path, vid), 'rb'), + encoding='bytes') + + feat = datas[int(stime * 5):int(etime * 5 + 0.5), :] + init_ids = np.array([[0]], dtype='int64') + init_scores = np.array([[0.]], dtype='float32') + if feat.shape[0] == 0: + continue + + batch_out.append( + (feat, init_ids, init_scores, vid, stime, etime)) + if len(batch_out) == self.batch_size: + yield batch_out + batch_out = [] + + return reader + + def make_multiprocess_reader(self): + """multiprocess reader""" + + def process_data(sample): + vid, feat, stime, etime, sentence = sample + + if self.mode == 'train' or self.mode == 'valid': + word_ids = [ + word_dict.get(w, word_dict[self.UNK]) + for w in sentence.split() + ] + word_ids_next = word_ids + [word_dict[self.END]] + word_ids = [word_dict[self.START]] + word_ids + return feat, word_ids, word_ids_next + elif self.mode == 'test': + init_ids = np.array([[0]], dtype='int64') + init_scores = np.array([[0.]], dtype='float32') + return feat, init_ids, init_scores, vid, stime, etime + else: + raise NotImplementedError('mode {} not implemented'.format( + self.mode)) + + def make_reader(): + def reader(): + lines = open(self.filelist).readlines() + reader_list = [ + line.strip() for line in lines if line.strip() != '' + ] + if self.mode == 'train': + random.shuffle(reader_list) + for line in reader_list: + vid, stime, etime, sentence = line.split('\t') + stime, etime = float(stime), float(etime) + + if python_ver < (3, 0): + datas = pickle.load( + open(os.path.join(self.feat_path, vid), 'rb')) + else: + datas = pickle.load( + open(os.path.join(self.feat_path, vid), 'rb'), + encoding='bytes') + + feat = datas[int(stime * 5):int(etime * 5 + 0.5), :] + if feat.shape[0] == 0: + continue + + yield [vid, feat, stime, etime, sentence] + + mapper = functools.partial(process_data) + + return paddle.reader.xmap_readers(mapper, reader, self.num_threads, + self.buffer_size) + + def batch_reader(): + batch_out = [] + for out in _reader(): + batch_out.append(out) + if len(batch_out) == self.batch_size: + yield batch_out + batch_out = [] + + word_dict = self.load_file() + _reader = make_reader() + return batch_reader diff --git a/PaddleCV/PaddleVideo/train.py b/PaddleCV/PaddleVideo/train.py index f06e1885..d13efac8 100644 --- a/PaddleCV/PaddleVideo/train.py +++ b/PaddleCV/PaddleVideo/train.py @@ -173,12 +173,18 @@ def train(args): if args.model_name in ['CTCN']: build_strategy.enable_sequential_execution = True + exec_strategy = fluid.ExecutionStrategy() + compiled_train_prog = fluid.compiler.CompiledProgram( train_prog).with_data_parallel( - loss_name=train_loss.name, build_strategy=build_strategy) + loss_name=train_loss.name, + build_strategy=build_strategy, + exec_strategy=exec_strategy) compiled_valid_prog = fluid.compiler.CompiledProgram( valid_prog).with_data_parallel( - share_vars_from=compiled_train_prog, build_strategy=build_strategy) + share_vars_from=compiled_train_prog, + build_strategy=build_strategy, + exec_strategy=exec_strategy) # get reader bs_denominator = 1 -- GitLab