#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 1. size of pos : neg = 1:1.
# 2. size of testing set = min(25k, len(all_data) * 0.1), others is traning set.
# 3. distinct train set and test set.

set -e

DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd $DIR

# Download data
echo "Downloading Amazon Electronics reviews data..."
# http://jmcauley.ucsd.edu/data/amazon/
wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
echo "Downloading mosesdecoder..."
# https://github.com/moses-smt/mosesdecoder
wget https://github.com/moses-smt/mosesdecoder/archive/master.zip

unzip master.zip
rm master.zip

##################
# Preprocess data 
echo "Preprocess data..."
export LC_ALL=C
UNAME_STR=`uname`

if [ ${UNAME_STR} == 'Linux' ]; then
  SHUF_PROG='shuf'
else
  SHUF_PROG='gshuf'
fi

mkdir -p tmp
python preprocess.py -i reviews_Electronics_5.json.gz
# uniq and shuffle
cd tmp
echo 'Uniq and shuffle...'
cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed
cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed

min_len=`sed -n '$=' neg.shuffed`
test_num=$((min_len/10))
if [ $test_num -gt 12500 ];then
 test_num=12500
fi
train_num=$((min_len-test_num))

head -n$train_num pos.shuffed >train.pos
head -n$train_num neg.shuffed >train.neg
tail -n$test_num pos.shuffed >test.pos
tail -n$test_num neg.shuffed >test.neg

cat train.pos train.neg | ${SHUF_PROG} >../train.txt
cat test.pos test.neg | ${SHUF_PROG} >../test.txt

cd -
echo 'train.txt' > train.list
echo 'test.txt' > test.list

# use 30k dict
rm -rf tmp
mv dict.txt dict_all.txt
cat dict_all.txt | head -n 30001 > dict.txt
echo 'Done.'