From 0561dd01b874c94525b42bd596bafb8020b547bc Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Thu, 24 Nov 2016 10:13:16 +0800 Subject: [PATCH] Update doc and proc_from_raw_data/get_data.sh --- .../data/proc_from_raw_data/get_data.sh | 20 ++++++++++--------- doc/demo/quick_start/index_en.md | 2 +- doc_cn/demo/quick_start/index.md | 4 ++-- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/demo/quick_start/data/proc_from_raw_data/get_data.sh b/demo/quick_start/data/proc_from_raw_data/get_data.sh index 9c3e9db248..cd85e26842 100755 --- a/demo/quick_start/data/proc_from_raw_data/get_data.sh +++ b/demo/quick_start/data/proc_from_raw_data/get_data.sh @@ -25,14 +25,17 @@ cd $DIR # Download data echo "Downloading Amazon Electronics reviews data..." # http://jmcauley.ucsd.edu/data/amazon/ -#wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz +wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz echo "Downloading mosesdecoder..." -#https://github.com/moses-smt/mosesdecoder -#wget https://github.com/moses-smt/mosesdecoder/archive/master.zip -#unzip master.zip -#rm master.zip -echo "Done." +# https://github.com/moses-smt/mosesdecoder +wget https://github.com/moses-smt/mosesdecoder/archive/master.zip +unzip master.zip +rm master.zip + +################## +# Preprocess data +echo "Preprocess data..." export LC_ALL=C UNAME_STR=`uname` @@ -42,12 +45,11 @@ else SHUF_PROG='gshuf' fi -# Start preprocess mkdir -p tmp python preprocess.py -i reviews_Electronics_5.json.gz # uniq and shuffle cd tmp -echo 'uniq and shuffle...' +echo 'Uniq and shuffle...' cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed @@ -74,4 +76,4 @@ echo 'test.txt' > test.list rm -rf tmp mv dict.txt dict_all.txt cat dict_all.txt | head -n 30001 > dict.txt -echo 'preprocess finished' +echo 'Done.' diff --git a/doc/demo/quick_start/index_en.md b/doc/demo/quick_start/index_en.md index 01f6f8ef54..11d568b8f9 100644 --- a/doc/demo/quick_start/index_en.md +++ b/doc/demo/quick_start/index_en.md @@ -59,7 +59,7 @@ To build your text classification system, your code will need to perform five st ## Preprocess data into standardized format In this example, you are going to use [Amazon electronic product review dataset](http://jmcauley.ucsd.edu/data/amazon/) to build a bunch of deep neural network models for text classification. Each text in this dataset is a product review. This dataset has two categories: “positive” and “negative”. Positive means the reviewer likes the product, while negative means the reviewer does not like the product. -`demo/quick_start` in the [source code](https://github.com/baidu/Paddle) provides script for downloading the preprocessed data as shown below. (If you want to process the raw data, you can use the script `demo/quick_start/data/proc_from_raw_data/get_data.sh`). +`demo/quick_start` in the [source code](https://github.com/PaddlePaddle/Paddle) provides script for downloading the preprocessed data as shown below. (If you want to process the raw data, you can use the script `demo/quick_start/data/proc_from_raw_data/get_data.sh`). ```bash cd demo/quick_start diff --git a/doc_cn/demo/quick_start/index.md b/doc_cn/demo/quick_start/index.md index 514b45a487..4a6e07ee1f 100644 --- a/doc_cn/demo/quick_start/index.md +++ b/doc_cn/demo/quick_start/index.md @@ -32,7 +32,7 @@ ## 数据格式准备(Data Preparation) 在本问题中,我们使用[Amazon电子产品评论数据](http://jmcauley.ucsd.edu/data/amazon/), -将评论分为好评(正样本)和差评(负样本)两类。[源码](https://github.com/baidu/Paddle)的`demo/quick_start`里提供了下载已经预处理数据的脚本(如果想从最原始的数据处理,可以使用脚本 `./demo/quick_start/data/proc_from_raw_data/get_data.sh`)。 +将评论分为好评(正样本)和差评(负样本)两类。[源码](https://github.com/PaddlePaddle/Paddle)的`demo/quick_start`里提供了下载已经预处理数据的脚本(如果想从最原始的数据处理,可以使用脚本 `./demo/quick_start/data/proc_from_raw_data/get_data.sh`)。 ```bash cd demo/quick_start @@ -141,7 +141,7 @@ PyDataProvider2。 我们将以基本的逻辑回归网络作为起点,并逐渐展示更加深入的功能。更详细的网络配置 连接请参考Layer文档。 -所有配置在[源码](https://github.com/baidu/Paddle)`demo/quick_start`目录,首先列举逻辑回归网络。 +所有配置在[源码](https://github.com/PaddlePaddle/Paddle)`demo/quick_start`目录,首先列举逻辑回归网络。 ### 逻辑回归模型(Logistic Regression) -- GitLab