format data support multi output

02c7ef31 · Hui Zhang · f89f99fe · 02c7ef31 · 02c7ef31
隐藏空白更改
内联 | 并排

Showing 2 changed files with 14 additions and 4 deletions

examples/ted_en_zh/t0/run.sh examples/ted_en_zh/t0/run.sh +2 -1

utils/format_data.py utils/format_data.py +12 -3

未找到文件。
--- a/examples/ted_en_zh/t0/run.sh
+++ b/examples/ted_en_zh/t0/run.sh
@@ -2,6 +2,7 @@
 set -e
 source path.sh

+gpus=0,1,2,3
 stage=0
 stop_stage=100
 conf_path=conf/transformer_joint_noam.yaml
@@ -21,7 +22,7 @@ fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./local/train.sh ${conf_path}  ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

--- a/utils/format_data.py
+++ b/utils/format_data.py
@@ -87,15 +87,24 @@ def main():
                tokens = text_feature.tokenize(line)
                tokenids = text_feature.featurize(line)
                output_json['output'].append({
-                    'name': 'traget1',
+                    'name': 'target1',
                    'shape': (len(tokenids), vocab_size),
                    'text': line,
                    'token': ' '.join(tokens),
                    'tokenid': ' '.join(map(str, tokenids)),
                })
            else:
-                # isinstance(line, list), multi target 
-                raise NotImplementedError("not support multi output now!")
+                # isinstance(line, list), multi target in one vocab
+                for i, item in enumerate(line, 1):
+                    tokens = text_feature.tokenize(item)
+                    tokenids = text_feature.featurize(item)
+                    output_json['output'].append({
+                        'name': f'target{i}',
+                        'shape': (len(tokenids), vocab_size),
+                        'text': item,
+                        'token': ' '.join(tokens),
+                        'tokenid': ' '.join(map(str, tokenids)),
+                    })

            # input
            line = line_json['feat']