#! /usr/bin/env bash
#
# Prepare WMT translation data: download and unpack the corpora listed in
# TRAIN_DATA / DEV_TEST_DATA, tokenize and clean them with the Moses scripts,
# learn and apply BPE with subword-nmt, and write the BPE vocabulary.
# All output is written below ./gen_data in the current working directory.

# Abort on command failures (-e), on use of unset variables (-u) and when any
# stage of a pipeline fails (pipefail), so that a failing producer in a
# pipeline such as `get_vocab.py | cut` is not hidden by the last command.
set -euo pipefail

OUTPUT_DIR="${PWD}/gen_data"

###############################################################################
# change these variables for other WMT data
###############################################################################
OUTPUT_DIR_DATA="${OUTPUT_DIR}/wmt16_ende_data"
OUTPUT_DIR_BPE_DATA="${OUTPUT_DIR}/wmt16_ende_data_bpe"
LANG1="en"
LANG2="de"
# each of TRAIN_DATA: data_url data_file_lang1 data_file_lang2
# The array is flat and is read three entries at a time; the two file entries
# are used as `find -regex` patterns inside the extracted archive.
TRAIN_DATA=(
  'http://www.statmt.org/europarl/v7/de-en.tgz'
  'europarl-v7.de-en.en' 'europarl-v7.de-en.de'
  'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz'
  'commoncrawl.de-en.en' 'commoncrawl.de-en.de'
  'http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz'
  'news-commentary-v11.de-en.en' 'news-commentary-v11.de-en.de'
)
# each of DEV_TEST_DATA: data_url data_file_lang1 data_file_lang2
# Same flat three-entries-per-record layout as TRAIN_DATA. The file entries are
# quoted so that the bracket expressions reach `find -regex` unexpanded.
DEV_TEST_DATA=(
  'http://data.statmt.org/wmt16/translation-task/dev.tgz'
  'newstest201[45]-deen-ref.en.sgm' 'newstest201[45]-deen-src.de.sgm'
  'http://data.statmt.org/wmt16/translation-task/test.tgz'
  'newstest2016-deen-ref.en.sgm' 'newstest2016-deen-src.de.sgm'
)
###############################################################################

###############################################################################
# change these variables for other WMT data (example: WMT14 en-fr)
###############################################################################
# OUTPUT_DIR_DATA="${OUTPUT_DIR}/wmt14_enfr_data"
# OUTPUT_DIR_BPE_DATA="${OUTPUT_DIR}/wmt14_enfr_data_bpe"
# LANG1="en"
# LANG2="fr"
# # each of TRAIN_DATA: data_url data_file_lang1 data_file_lang2
# TRAIN_DATA=(
# 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz'
# 'commoncrawl.fr-en.en' 'commoncrawl.fr-en.fr'
# 'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz'
# 'training/europarl-v7.fr-en.en' 'training/europarl-v7.fr-en.fr'
# 'http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz'
# 'training/news-commentary-v9.fr-en.en' 'training/news-commentary-v9.fr-en.fr'
# 'http://www.statmt.org/wmt10/training-giga-fren.tar'
# 'giga-fren.release2.fixed.en.*' 'giga-fren.release2.fixed.fr.*'
# 'http://www.statmt.org/wmt13/training-parallel-un.tgz'
# 'un/undoc.2000.fr-en.en' 'un/undoc.2000.fr-en.fr'
# )
# # each of DEV_TEST_DATA: data_url data_file_lang1 data_file_lang2
# DEV_TEST_DATA=(
# 'http://data.statmt.org/wmt16/translation-task/dev.tgz'
# '.*/newstest201[45]-fren-ref.en.sgm' '.*/newstest201[45]-fren-src.fr.sgm'
# 'http://data.statmt.org/wmt16/translation-task/test.tgz'
# '.*/newstest2016-fren-ref.en.sgm' '.*/newstest2016-fren-src.fr.sgm'
# )
###############################################################################

mkdir -p "${OUTPUT_DIR_DATA}" "${OUTPUT_DIR_BPE_DATA}"

# Extract training data
#
# TRAIN_DATA is read three entries at a time: archive URL, file regex for the
# first language, file regex for the second language. Every file matching one
# of the regexes is concatenated into ${OUTPUT_DIR_DATA}/train.<lang>, where
# <lang> is taken from the file's extension.
started_langs=" "  # languages whose train.<lang> was already (re)created this run
for ((i = 0; i < ${#TRAIN_DATA[@]}; i += 3)); do
  data_url=${TRAIN_DATA[i]}
  data_tgz=${data_url##*/}  # training-parallel-commoncrawl.tgz
  data=${data_tgz%.*}       # training-parallel-commoncrawl
  data_lang1=${TRAIN_DATA[i + 1]}
  data_lang2=${TRAIN_DATA[i + 2]}
  archive="${OUTPUT_DIR_DATA}/${data_tgz}"
  extract_dir="${OUTPUT_DIR_DATA}/${data}"

  if [[ ! -e "${archive}" ]]; then
    echo "Download ${data_url}"
    # Download under a temporary name so an interrupted transfer is not
    # mistaken for a complete archive on the next run.
    wget -O "${archive}.part" "${data_url}"
    mv -- "${archive}.part" "${archive}"
  fi

  if [[ ! -d "${extract_dir}" ]]; then
    echo "Extract ${data_tgz}"
    # Unpack into a scratch directory and rename it only after tar succeeded,
    # so a failed extraction is retried instead of being skipped.
    rm -rf -- "${extract_dir}.partial"
    mkdir -p "${extract_dir}.partial"
    if [[ "${data_tgz##*.}" == "tar" ]]; then
      tar -xvf "${archive}" -C "${extract_dir}.partial"
    else
      tar -xvzf "${archive}" -C "${extract_dir}.partial"
    fi
    mv -- "${extract_dir}.partial" "${extract_dir}"
  fi

  # concatenate all training data
  for data_lang in "${data_lang1}" "${data_lang2}"; do
    # NUL-delimited and sorted: robust against unusual file names and visits
    # the matches in a deterministic order.
    while IFS= read -r -d '' f; do
      f_base=${f%.*}
      f_ext=${f##*.}
      if [[ "${f_ext}" == "gz" ]]; then
        # name.<lang>.gz: decompress in place; the language is the extension
        # that precedes ".gz"
        gunzip "${f}"
        l=${f_base##*.}
        f_base=${f_base%.*}
      else
        l=${f_ext}
      fi

      # The first file written for a language in this run recreates
      # train.<lang>; every later file is appended to it.
      if [[ "${started_langs}" != *" ${l} "* ]]; then
        cat "${f_base}.${l}" > "${OUTPUT_DIR_DATA}/train.${l}"
        started_langs+="${l} "
      else
        cat "${f_base}.${l}" >> "${OUTPUT_DIR_DATA}/train.${l}"
      fi
    done < <(find "${extract_dir}" -regex ".*/${data_lang}" -print0 | LC_ALL=C sort -z)
  done
done

# Clone mosesdecoder
# Its scripts are used by the later steps for SGM extraction, tokenization and
# corpus cleaning. Paths are quoted so a working directory containing
# whitespace does not break the clone.
if [[ ! -d "${OUTPUT_DIR}/mosesdecoder" ]]; then
  echo "Cloning moses for data processing"
  git clone https://github.com/moses-smt/mosesdecoder.git "${OUTPUT_DIR}/mosesdecoder"
fi

# Extract develop and test data
#
# DEV_TEST_DATA is read three entries at a time: archive URL, SGM file regex
# for the first language, SGM file regex for the second language. Every
# matching SGM file is converted to plain text as
# ${OUTPUT_DIR_DATA}/<set>.<lang>. dev_test_data collects "\|<set>" fragments
# that later steps splice into a grep alternation.
dev_test_data=""
for ((i = 0; i < ${#DEV_TEST_DATA[@]}; i += 3)); do
  data_url=${DEV_TEST_DATA[i]}
  data_tgz=${data_url##*/}  # dev.tgz
  data=${data_tgz%.*}       # dev
  data_lang1=${DEV_TEST_DATA[i + 1]}
  data_lang2=${DEV_TEST_DATA[i + 2]}
  archive="${OUTPUT_DIR_DATA}/${data_tgz}"
  extract_dir="${OUTPUT_DIR_DATA}/${data}"

  if [[ ! -e "${archive}" ]]; then
    echo "Download ${data_url}"
    # Download under a temporary name so an interrupted transfer is not
    # mistaken for a complete archive on the next run.
    wget -O "${archive}.part" "${data_url}"
    mv -- "${archive}.part" "${archive}"
  fi

  if [[ ! -d "${extract_dir}" ]]; then
    echo "Extract ${data_tgz}"
    # Unpack into a scratch directory and rename it only after tar succeeded,
    # so a failed extraction is retried instead of being skipped.
    rm -rf -- "${extract_dir}.partial"
    mkdir -p "${extract_dir}.partial"
    if [[ "${data_tgz##*.}" == "tar" ]]; then
      tar -xvf "${archive}" -C "${extract_dir}.partial"
    else
      tar -xvzf "${archive}" -C "${extract_dir}.partial"
    fi
    mv -- "${extract_dir}.partial" "${extract_dir}"
  fi

  for data_lang in "${data_lang1}" "${data_lang2}"; do
    while IFS= read -r -d '' f; do
      data_file=${f##*/}
      data_out=${data_file%%-*}  # newstest2016
      l=${data_file#*.}
      l=${l%%.*}  # en
      # to make regexp; each set name is added only once
      if [[ "${dev_test_data}\|" != *"\|${data_out}\|"* ]]; then
        dev_test_data="${dev_test_data}\|${data_out}"
      fi
      sgm_txt="${OUTPUT_DIR_DATA}/${data_out}.${l}"
      if [[ ! -e "${sgm_txt}" ]]; then
        # Convert into a temporary file first so a failed conversion does not
        # leave a truncated file that later runs would treat as finished.
        "${OUTPUT_DIR}/mosesdecoder/scripts/ems/support/input-from-sgm.perl" \
          < "${f}" > "${sgm_txt}.tmp"
        mv -- "${sgm_txt}.tmp" "${sgm_txt}"
      fi
    done < <(find "${extract_dir}" -regex ".*/${data_lang}" -print0 | LC_ALL=C sort -z)
  done
done

# Tokenize data
# Runs the Moses tokenizer on train.<lang> and on every dev/test set recorded
# in dev_test_data, writing <name>.tok.<lang> next to the input.
for l in "${LANG1}" "${LANG2}"; do
  for f in "${OUTPUT_DIR_DATA}"/*."${l}"; do
    [[ -e "${f}" ]] || continue  # the glob matched nothing
    # Keep only train.<lang> and the dev/test sets; this skips derived files
    # such as *.tok.<lang> and *.clean.<lang>.
    grep -q "\(train${dev_test_data}\)\.${l}$" <<< "${f}" || continue
    f_base=${f%.*}  # dir/train dir/newstest2016
    f_out="${f_base}.tok.${l}"
    if [[ ! -e "${f_out}" ]]; then
      echo "Tokenize ${f}"
      # Write to a temporary file and rename on success so an aborted run is
      # redone instead of leaving a truncated .tok file behind.
      "${OUTPUT_DIR}/mosesdecoder/scripts/tokenizer/tokenizer.perl" \
        -q -l "${l}" -threads 8 < "${f}" > "${f_out}.tmp"
      mv -- "${f_out}.tmp" "${f_out}"
    fi
  done
done

# Clean data
# Runs Moses clean-corpus-n.perl on the raw and on the tokenized training
# corpus, producing <base>.clean.<LANG1> and <base>.clean.<LANG2>.
for f in "${OUTPUT_DIR_DATA}/train.${LANG1}" "${OUTPUT_DIR_DATA}/train.tok.${LANG1}"; do
  f_base=${f%.*}  # dir/train dir/train.tok
  f_out="${f_base}.clean"
  # Clean unless BOTH outputs already exist; a lone leftover from an aborted
  # run must not cause the step to be skipped.
  if [[ ! -e "${f_out}.${LANG1}" || ! -e "${f_out}.${LANG2}" ]]; then
    echo "Clean ${f_base}"
    # trailing "1 80": presumably the min/max sentence length kept by the
    # script -- confirm against the Moses documentation
    "${OUTPUT_DIR}/mosesdecoder/scripts/training/clean-corpus-n.perl" \
      "${f_base}" "${LANG1}" "${LANG2}" "${f_out}" 1 80
  fi
done

# Clone subword-nmt and generate BPE data
if [[ ! -d "${OUTPUT_DIR}/subword-nmt" ]]; then
  git clone https://github.com/rsennrich/subword-nmt.git "${OUTPUT_DIR}/subword-nmt"
fi

# Generate BPE data and vocabulary
# Every output is written to a temporary file and renamed on success, so an
# aborted run is redone instead of being mistaken for a finished one.
# NOTE: num_operations stays set after this loop; the "Adapt to the reader"
# step further down reads it.
for num_operations in 32000; do
  bpe_codes="${OUTPUT_DIR_BPE_DATA}/bpe.${num_operations}"
  if [[ ! -e "${bpe_codes}" ]]; then
    echo "Learn BPE with ${num_operations} merge operations"
    cat "${OUTPUT_DIR_DATA}/train.tok.clean.${LANG1}" \
      "${OUTPUT_DIR_DATA}/train.tok.clean.${LANG2}" \
      | "${OUTPUT_DIR}/subword-nmt/learn_bpe.py" -s "${num_operations}" \
        > "${bpe_codes}.tmp"
    mv -- "${bpe_codes}.tmp" "${bpe_codes}"
  fi

  for l in "${LANG1}" "${LANG2}"; do
    for f in "${OUTPUT_DIR_DATA}"/*."${l}"; do
      [[ -e "${f}" ]] || continue  # the glob matched nothing
      # Keep the tokenized (and, if present, cleaned) train/dev/test files.
      grep -q "\(train${dev_test_data}\)\.tok\(\.clean\)\?\.${l}$" <<< "${f}" \
        || continue
      f_base=${f%.*}        # dir/train.tok dir/train.tok.clean dir/newstest2016.tok
      f_base=${f_base##*/}  # train.tok train.tok.clean newstest2016.tok
      f_out="${OUTPUT_DIR_BPE_DATA}/${f_base}.bpe.${num_operations}.${l}"
      if [[ ! -e "${f_out}" ]]; then
        echo "Apply BPE to ${f}"
        "${OUTPUT_DIR}/subword-nmt/apply_bpe.py" -c "${bpe_codes}" \
          < "${f}" > "${f_out}.tmp"
        mv -- "${f_out}.tmp" "${f_out}"
      fi
    done
  done

  vocab="${OUTPUT_DIR_BPE_DATA}/vocab.bpe.${num_operations}"
  if [[ ! -e "${vocab}" ]]; then
    echo "Create vocabulary for BPE data"
    # Two steps instead of `get_vocab.py | cut`: in a single pipeline the exit
    # status of cut would hide a get_vocab.py failure.
    cat "${OUTPUT_DIR_BPE_DATA}/train.tok.clean.bpe.${num_operations}.${LANG1}" \
      "${OUTPUT_DIR_BPE_DATA}/train.tok.clean.bpe.${num_operations}.${LANG2}" \
      | "${OUTPUT_DIR}/subword-nmt/get_vocab.py" > "${vocab}.counts.tmp"
    cut -f1 -d ' ' "${vocab}.counts.tmp" > "${vocab}.tmp"
    rm -f -- "${vocab}.counts.tmp"
    mv -- "${vocab}.tmp" "${vocab}"
  fi
done

# Adapt to the reader
# For every BPE file of the first language, write a tab-separated parallel
# file <name>.<LANG1>-<LANG2> whose lines pair the two languages.
# num_operations still holds the value set by the BPE loop above.
for f in "${OUTPUT_DIR_BPE_DATA}"/*.bpe."${num_operations}"."${LANG1}"; do
  [[ -e "${f}" ]] || continue  # the glob matched nothing
  f_base=${f%.*}  # dir/train.tok.clean.bpe.32000 dir/newstest2016.tok.bpe.32000
  f_out="${f_base}.${LANG1}-${LANG2}"
  if [[ ! -e "${f_out}" ]]; then
    # temporary file + rename: an aborted paste is redone on the next run
    paste -d '\t' "${f_base}.${LANG1}" "${f_base}.${LANG2}" > "${f_out}.tmp"
    mv -- "${f_out}.tmp" "${f_out}"
  fi
done

# Prepend the special tokens <s>, <e> and <unk> to the BPE vocabulary.
# printf + cat instead of GNU-only `sed '1i\...'`: it is portable and still
# emits the special tokens when the vocabulary file is empty.
vocab_all="${OUTPUT_DIR_BPE_DATA}/vocab_all.bpe.${num_operations}"
if [[ ! -e "${vocab_all}" ]]; then
  {
    printf '%s\n' '<s>' '<e>' '<unk>'
    cat "${OUTPUT_DIR_BPE_DATA}/vocab.bpe.${num_operations}"
  } > "${vocab_all}.tmp"
  mv -- "${vocab_all}.tmp" "${vocab_all}"
fi

echo "All done."