提交 4c2b3b6e 编写于 作者: Y Yu Yang 提交者: GitHub

Merge pull request #484 from PaddlePaddle/release/v0.9.0

Release v0.9.0
......@@ -13,8 +13,6 @@
# The document of clang-format is
# http://clang.llvm.org/docs/ClangFormat.html
# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
#
# TODO(yuyang18): Add python and other language code style
---
Language: Cpp
BasedOnStyle: Google
......@@ -22,8 +20,9 @@ IndentWidth: 2
TabWidth: 2
ContinuationIndentWidth: 4
AccessModifierOffset: -2 # The private/protected/public has no indent in class
PointerAlignment: Left # int* p/int& p, not int *p/int &p
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
...
......@@ -3,4 +3,8 @@ build/
*.user
.vscode
.idea
\ No newline at end of file
.idea
.project
.cproject
.pydevproject
Makefile
- repo: https://github.com/Lucas-C/pre-commit-hooks.git
sha: c25201a00e6b0514370501050cf2a8538ac12270
hooks:
- id: remove-crlf
- repo: https://github.com/reyoung/mirrors-yapf.git
sha: v0.13.2
hooks:
- id: yapf
- repo: https://github.com/pre-commit/pre-commit-hooks
sha: 4ef03c4223ad322c7adaa6c6c0efb26b57df3b71
hooks:
- id: check-added-large-files
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
- id: end-of-file-fixer
# TODO(yuyang): trailing whitespace has some bugs on markdown
# files now, please not add it to pre-commit hook now
# - id: trailing-whitespace
#
# TODO(yuyang): debug-statements not fit for Paddle, because
# not all of our python code is runnable. Some are used for
# documenation
# - id: debug-statements
[style]
based_on_style = pep8
column_limit = 80
......@@ -2,9 +2,17 @@ language: cpp
cache: ccache
sudo: required
dist: trusty
os:
- linux
- osx
env:
- JOB=DOCS
- JOB=BUILD_AND_TEST
matrix:
exclude:
- os: osx
env: JOB=DOCS # Only generate documentation in linux
addons:
apt:
packages:
......@@ -27,9 +35,22 @@ addons:
- libgoogle-glog-dev
- libgflags-dev
- libgtest-dev
- curl
- lcov
- graphviz
- swig
before_install:
- pip install wheel protobuf sphinx breathe recommonmark
- sudo paddle/scripts/travis/before_install.sh
- |
if [ ${JOB} == "BUILD_AND_TEST" ]; then
if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)'
then
echo "Only markdown docs were updated, stopping build process."
exit
fi
fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
- pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy
script:
- paddle/scripts/travis/main.sh
notifications:
......
......@@ -2,14 +2,14 @@ cmake_minimum_required(VERSION 2.8)
project(paddle CXX C)
set(PADDLE_MAJOR_VERSION 0)
set(PADDLE_MINOR_VERSION 8)
set(PADDLE_PATCH_VERSION 0b1)
set(PADDLE_MINOR_VERSION 9)
set(PADDLE_PATCH_VERSION 0)
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
include(package)
include(swig)
find_package(SWIG 2.0)
find_package(CUDA QUIET)
find_package(Protobuf REQUIRED)
find_package(PythonLibs 2.7 REQUIRED)
......@@ -40,6 +40,9 @@ option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
option(ON_TRAVIS "Running test on travis-ci or not." OFF)
option(ON_COVERALLS "Generating code coverage data on coveralls or not." OFF)
option(COVERALLS_UPLOAD "Uploading the generated coveralls json." ON)
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
......@@ -49,11 +52,16 @@ endif()
include(enableCXX11)
include(cpplint)
include(ccache)
if(WITH_RDMA)
include(rdma)
endif()
include(util)
include(flags)
include(cudnn)
include(FindPythonModule)
include(check_packages)
include(swig)
include(coveralls)
# add PaddlePaddle version
if(DEFINED ENV{PADDLE_VERSION})
......@@ -65,6 +73,19 @@ else()
Subversion_WC_INFO(${PROJ_ROOT} Project)
add_definitions(-DPADDLE_VERSION=${Project_WC_REVISION})
endif()
elseif(EXISTS ${PROJ_ROOT}/.git/)
find_package(Git REQUIRED)
execute_process(
COMMAND ${GIT_EXECUTABLE} log -1 --format=%H
WORKING_DIRECTORY ${PROJ_ROOT}
OUTPUT_VARIABLE GIT_SHA1
RESULT_VARIABLE GIT_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT ${GIT_RESULT})
add_definitions(-DPADDLE_VERSION=\"${GIT_SHA1}\")
else()
message(WARNING "Cannot add paddle version from git tag")
endif()
endif()
endif()
......@@ -74,11 +95,24 @@ if(NOT WITH_GPU)
add_definitions(-DHPPL_STUB_FUNC)
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
else()
if(${CUDA_VERSION_MAJOR} GREATER 6)
if(COMPILER_SUPPORT_CXX11)
LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
endif()
endif()
# TODO(yuyang18): Change it to remove std=c++11 in cuda compile.
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
if(NOT CUDNN_FOUND)
message(FATAL_ERROR "Paddle need cudnn to compile")
endif()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-g -O3 --use_fast_math")
if(WITH_AVX)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
else(WITH_AVX)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
endif(WITH_AVX)
if(WITH_DSO)
set(CUDA_LIBRARIES "")
......@@ -91,7 +125,7 @@ else()
endif(NOT WITH_GPU)
if(WITH_DOUBLE)
add_definitions(-DPADDLE_TYPE_DOUBLE -DHPPL_TYPE_DOUBLE)
add_definitions(-DPADDLE_TYPE_DOUBLE)
set(ACCURACY double)
else(WITH_DOUBLE)
set(ACCURACY float)
......@@ -102,11 +136,11 @@ if(NOT WITH_TIMER)
endif(NOT WITH_TIMER)
if(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
else(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse3")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}")
endif(WITH_AVX)
if(WITH_PYTHON)
......@@ -116,12 +150,15 @@ else(WITH_PYTHON)
add_definitions(-DPADDLE_NO_PYTHON)
endif(WITH_PYTHON)
if(NOT WITH_RDMA)
add_definitions(-DPADDLE_DISABLE_RDMA)
endif()
if(WITH_RDMA)
include_directories("${RDMA_INC_DIR}")
else(WITH_RDMA)
add_definitions(-DPADDLE_DISABLE_RDMA)
endif(WITH_RDMA)
if(WITH_GLOG)
add_definitions(-DPADDLE_USE_GLOG)
include_directories(${LIBGLOG_INCLUDE_DIR})
endif()
if(WITH_GFLAGS)
......
Thank you for contributing to PaddlePaddle. Submitting an issue is a great help for us.
Both Chinese and English issues are welcome.
It's hard to solve a problem when important details are missing.
Before submitting the issue, look over the following criteria before handing your request in.
- [ ] Was there a similar issue submitted or resolved before ? You could search issue in the github.
- [ ] Did you retrieve your issue from widespread search engines ?
- [ ] Is my description of the issue clear enough to reproduce this problem?
* If some errors occurred, we need details about `how do you run your code?`, `what system do you use?`, `Are you using GPU or not?`, etc.
* If you use an recording [asciinema](https://asciinema.org/) to show what you are doing to make it happen, that's awesome! We could help you solve the problem more quickly.
- [ ] Is my description of the issue use the github markdown correctly?
* Please use the proper markdown syntaxes for styling all forms of writing, e.g, source code, error information, etc.
* Check out [this page](https://guides.github.com/features/mastering-markdown/) to find out much more about markdown.
# PaddlePaddle
[![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)
Welcome to the PaddlePaddle GitHub.
The software will be released on Sept. 30 with full documentation and installation support.
[![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)
[![Coverage Status](https://coveralls.io/repos/github/baidu/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/baidu/Paddle?branch=develop)
[![Join the chat at https://gitter.im/PaddlePaddle/Deep_Learning](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/PaddlePaddle/Deep_Learning?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![License](https://img.shields.io/badge/license-Apache%202.0-green.svg)](LICENSE)
A pre-release version is available now for those who are eager to take a look.
Welcome to the PaddlePaddle GitHub.
PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use,
efficient, flexible and scalable deep learning platform, which is originally
developed by Baidu scientists and engineers for the purpose of applying deep
learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/baidu/Paddle/releases) to track the latest feature of PaddlePaddle.
## Features
- **Flexibility**
PaddlePaddle supports a wide range of neural network architectures and
optimization algorithms. It is easy to configure complex models such as
neural machine translation model with attention mechanism or complex memory
connection.
PaddlePaddle supports a wide range of neural network architectures and
optimization algorithms. It is easy to configure complex models such as
neural machine translation model with attention mechanism or complex memory
connection.
- **Efficiency**
In order to unleash the power of heterogeneous computing resource,
optimization occurs at different levels of PaddlePaddle, including
computing, memory, architecture and communication. The following are some
examples:
1. Optimized math operations through SSE/AVX intrinsics, BLAS libraries
(e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
2. Highly optimized recurrent networks which can handle **variable-length**
sequence without padding.
3. Optimized local and distributed training for models with high dimensional
sparse data.
In order to unleash the power of heterogeneous computing resource,
optimization occurs at different levels of PaddlePaddle, including
computing, memory, architecture and communication. The following are some
examples:
- Optimized math operations through SSE/AVX intrinsics, BLAS libraries
(e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
- Highly optimized recurrent networks which can handle **variable-length**
sequence without padding.
- Optimized local and distributed training for models with high dimensional
sparse data.
- **Scalability**
With PaddlePaddle, it is easy to use many CPUs/GPUs and machines to speed
up your training. PaddlePaddle can achieve high throughput and performance
via optimized communication.
With PaddlePaddle, it is easy to use many CPUs/GPUs and machines to speed
up your training. PaddlePaddle can achieve high throughput and performance
via optimized communication.
- **Connected to Products**
In addition, PaddlePaddle is also designed to be easily deployable. At Baidu,
PaddlePaddle has been deployed into products or service with a vast number
of users, including ad click-through rate (CTR) prediction, large-scale image
classification, optical character recognition(OCR), search ranking, computer
virus detection, recommendation, etc. It is widely utilized in products at
Baidu and it has achieved a significant impact. We hope you can also exploit
the capability of PaddlePaddle to make a huge impact for your product.
In addition, PaddlePaddle is also designed to be easily deployable. At Baidu,
PaddlePaddle has been deployed into products or service with a vast number
of users, including ad click-through rate (CTR) prediction, large-scale image
classification, optical character recognition(OCR), search ranking, computer
virus detection, recommendation, etc. It is widely utilized in products at
Baidu and it has achieved a significant impact. We hope you can also exploit
the capability of PaddlePaddle to make a huge impact for your product.
## Installation
See [Installation Guide](http://paddlepaddle.org/doc/build/) to install from pre-built package or build from the source code. (Note: The installation packages are still in pre-release state and your experience of installation may not be smooth.).
Check out the [Install Guide](http://paddlepaddle.org/doc/build/) to install from
pre-built packages (**docker image**, **deb package**) or
directly build on **Linux** and **Mac OS X** from the source code.
## Documentation
- [Chinese Documentation](http://paddlepaddle.org/doc_cn/) <br>
Both [English Docs](http://paddlepaddle.org/doc/) and [Chinese Docs](http://paddlepaddle.org/doc_cn/) are provided for our users and developers.
- [Quick Start](http://paddlepaddle.org/doc/demo/quick_start/index_en) <br>
You can follow the quick start tutorial to learn how use PaddlePaddle
......@@ -81,9 +88,9 @@ See [Installation Guide](http://paddlepaddle.org/doc/build/) to install from pre
- [Source Code Documents](http://paddlepaddle.org/doc/source/) <br>
## Ask Questions
If you want to ask questions and discuss about methods and models, welcome
to send email to paddle-dev@baidu.com. Framework development discussions and
Please join the [**gitter chat**](https://gitter.im/PaddlePaddle/Deep_Learning) or send email to
**paddle-dev@baidu.com** to ask questions and talk about methods and models.
Framework development discussions and
bug reports are collected on [Issues](https://github.com/baidu/paddle/issues).
## Copyright and License
......
# Release v0.9.0
## New Features:
* New Layers
* bilinear interpolation layer.
* spatial pyramid-pool layer.
* de-convolution layer.
* maxout layer.
* Support rectangle padding, stride, window and input for Pooling Operation.
* Add —job=time in trainer, which can be used to print time info without compiler option -WITH_TIMER=ON.
* Expose cost_weight/nce_layer in `trainer_config_helpers`
* Add FAQ, concepts, h-rnn docs.
* Add Bidi-LSTM and DB-LSTM to quick start demo @alvations
* Add usage track scripts.
## Improvements
* Add Travis-CI for Mac OS X. Enable swig unittest in Travis-CI. Skip Travis-CI when only docs are changed.
* Add code coverage tools.
* Refine convolution layer to speedup and reduce GPU memory.
* Speed up PyDataProvider2
* Add ubuntu deb package build scripts.
* Make Paddle use git-flow branching model.
* PServer support no parameter blocks.
## Bug Fixes
* add zlib link to py_paddle
* add input sparse data check for sparse layer at runtime
* Bug fix for sparse matrix multiplication
* Fix floating-point overflow problem of tanh
* Fix some nvcc compile options
* Fix a bug in yield dictionary in DataProvider
* Fix SRL hang when exit.
# Release v0.8.0beta.1
New features:
* Mac OSX is supported by source code. #138
* Both GPU and CPU versions of PaddlePaddle are supported.
* Support CUDA 8.0
* Enhance `PyDataProvider2`
* Add dictionary yield format. `PyDataProvider2` can yield a dictionary with key is data_layer's name, value is features.
* Add `min_pool_size` to control memory pool in provider.
* Add `deb` install package & docker image for no_avx machines.
* Especially for cloud computing and virtual machines
* Automatically disable `avx` instructions in cmake when machine's CPU don't support `avx` instructions.
* Add Parallel NN api in trainer_config_helpers.
* Add `travis ci` for Github
Bug fixes:
* Several bugs in trainer_config_helpers. Also complete the unittest for trainer_config_helpers
* Check if PaddlePaddle is installed when unittest.
* Fix bugs in GTX series GPU
* Fix bug in MultinomialSampler
Also more documentation was written since last release.
# Release v0.8.0beta.0
PaddlePaddle v0.8.0beta.0 release. The install package is not stable yet and it's a pre-release version.
......@@ -3,36 +3,55 @@
INCLUDE(CheckCXXSourceRuns)
SET(FIND_AVX_10)
SET(FIND_AVX_20)
SET(AVX_FLAGS)
SET(AVX_FOUND)
# Check AVX 2
SET(CMAKE_REQUIRED_FLAGS)
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
SET(CMAKE_REQUIRED_FLAGS "-mavx2")
ELSEIF(MSVC AND NOT CMAKE_CL_64) # reserve for WINDOWS
SET(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
set(MMX_FLAG "-mmmx")
set(SSE2_FLAG "-msse2")
set(SSE3_FLAG "-msse3")
SET(AVX_FLAG "-mavx")
SET(AVX2_FLAG "-mavx2")
ELSEIF(MSVC)
set(MMX_FLAG "/arch:MMX")
set(SSE2_FLAG "/arch:SSE2")
set(SSE3_FLAG "/arch:SSE3")
SET(AVX_FLAG "/arch:AVX")
SET(AVX2_FLAG "/arch:AVX2")
ENDIF()
# Check MMX
set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
#include <mmintrin.h>
int main()
{
__m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
__m256i result = _mm256_abs_epi32 (a);
_mm_setzero_si64();
return 0;
}" FIND_AVX_20)
}" MMX_FOUND)
# Check AVX
SET(CMAKE_REQUIRED_FLAGS)
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
SET(CMAKE_REQUIRED_FLAGS "-mavx")
ELSEIF(MSVC AND NOT CMAKE_CL_64)
SET(CMAKE_REQUIRED_FLAGS "/arch:AVX")
endif()
# Check SSE2
set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG})
CHECK_CXX_SOURCE_RUNS("
#include <emmintrin.h>
int main()
{
_mm_setzero_si128();
return 0;
}" SSE2_FOUND)
# Check SSE3
set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG})
CHECK_CXX_SOURCE_RUNS("
#include <pmmintrin.h>
int main()
{
__m128d a = _mm_set1_pd(6.28);
__m128d b = _mm_set1_pd(3.14);
__m128d result = _mm_addsub_pd(a, b);
result = _mm_movedup_pd(result);
return 0;
}" SSE3_FOUND)
# Check AVX
set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
......@@ -41,25 +60,17 @@ int main()
__m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
__m256 result = _mm256_add_ps (a, b);
return 0;
}" FIND_AVX_10)
IF(${FIND_AVX_20})
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
SET(AVX_FLAGS "${AVX_FLAGS} -mavx2")
ELSEIF(MSVC)
SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX2")
ENDIF()
ENDIF()
}" AVX_FOUND)
IF(${FIND_AVX_10})
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
SET(AVX_FLAGS "${AVX_FLAGS} -mavx")
ELSEIF(MSVC)
SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX")
ENDIF()
ENDIF()
# Check AVX 2
set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
__m256i result = _mm256_abs_epi32 (a);
return 0;
}" AVX2_FOUND)
IF("${FIND_AVX_10}" OR "${FIND_AVX_20}")
SET(AVX_FOUND TRUE)
MESSAGE(STATUS "Find CPU supports ${AVX_FLAGS}.")
ENDIF()
mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
# Find the CBlas libraries
# Find the CBlas and lapack libraries
#
# It will search MKL, atlas, OpenBlas, reference-cblas in order.
#
......@@ -17,10 +17,19 @@
## Find MKL First.
set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
find_path(MKL_INCLUDE_DIR mkl.h PATHS ${MKL_ROOT}/include)
find_library(MKL_CORE_LIB NAMES mkl_core PATHS ${MKL_ROOT}/lib)
find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS ${MKL_ROOT}/lib)
find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS ${MKL_ROOT}/lib)
find_path(MKL_INCLUDE_DIR mkl.h PATHS
${MKL_ROOT}/include)
find_path(MKL_INCLUDE_DIR mkl_lapacke.h PATHS
${MKL_ROOT}/include)
find_library(MKL_CORE_LIB NAMES mkl_core PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
......@@ -30,6 +39,7 @@ if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
${MKL_SEQUENTIAL_LIB}
${MKL_CORE_LIB})
add_definitions(-DPADDLE_USE_MKL)
message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
return() # return file.
endif()
......@@ -48,15 +58,19 @@ set(ATLAS_LIB_SEARCH_PATHS
)
find_path(ATLAS_INC_DIR NAMES cblas.h
PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
PATHS ${ATLAS_LIB_SEARCH_PATHS})
find_library(ATLAS_LIB NAMES atlas libatlas.so.3
find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
PATHS ${ATLAS_LIB_SEARCH_PATHS})
if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB)
set(CBLAS_PROVIDER ATLAS)
set(CBLAS_INC_DIR ${ATLAS_INC_DIR})
set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
set(CBLAS_LIBS ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
add_definitions(-DPADDLE_USE_ATLAS)
message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
return()
endif()
......@@ -76,6 +90,8 @@ set(OPENBLAS_LIB_SEARCH_PATHS
find_path(OPENBLAS_INC_DIR NAMES cblas.h
PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
find_library(OPENBLAS_LIB NAMES openblas
PATHS ${OPENBLAS_LIB_SEARCH_PATHS})
......@@ -83,6 +99,7 @@ if(OPENBLAS_INC_DIR AND OPENBLAS_LIB)
set(CBLAS_PROVIDER OPENBLAS)
set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR})
set(CBLAS_LIBS ${OPENBLAS_LIB})
message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
return()
endif()
......
# CMake script for code coverage.
# If _COVERALLS_UPLOAD is ON, it will upload json files to overalls.io automatically.
# Param _COVERAGE_SRCS A list of coverage source files.
# Param _COVERALLS_UPLOAD Upload the result to coveralls.
# Param _CMAKE_SCRIPT_PATH CMake script path.
function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH)
# clean previous gcov data.
file(REMOVE_RECURSE ${PROJECT_BINARY_DIR}/*.gcda)
# find curl for upload JSON soon.
if (_COVERALLS_UPLOAD)
find_program(CURL_EXECUTABLE curl)
if (NOT CURL_EXECUTABLE)
message(FATAL_ERROR "Coveralls: curl not found!")
endif()
endif()
# When passing a CMake list to an external process, the list
# will be converted from the format "1;2;3" to "1 2 3".
set(COVERAGE_SRCS "")
foreach (SINGLE_SRC ${_COVERAGE_SRCS})
set(COVERAGE_SRCS "${COVERAGE_SRCS}*${SINGLE_SRC}")
endforeach()
# query number of logical cores
cmake_host_system_information(RESULT core_size QUERY NUMBER_OF_LOGICAL_CORES)
# coveralls json file.
set(COVERALLS_FILE ${PROJECT_BINARY_DIR}/coveralls.json)
add_custom_target(coveralls_generate
# Run regress tests.
COMMAND ${CMAKE_CTEST_COMMAND}
-j ${core_size}
--output-on-failure
# Generate Gcov and translate it into coveralls JSON.
COMMAND ${CMAKE_COMMAND}
-DCOVERAGE_SRCS="${COVERAGE_SRCS}"
-DCOVERALLS_OUTPUT_FILE="${COVERALLS_FILE}"
-DCOV_PATH="${PROJECT_BINARY_DIR}"
-DPROJECT_ROOT="${PROJECT_SOURCE_DIR}"
-P "${_CMAKE_SCRIPT_PATH}/coverallsGcovJsons.cmake"
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
COMMENT "Coveralls: generating coveralls output..."
)
if (_COVERALLS_UPLOAD)
message("COVERALLS UPLOAD: ON")
# Upload the JSON to coveralls.
add_custom_target(coveralls_upload
COMMAND ${CURL_EXECUTABLE}
-S -F json_file=@${COVERALLS_FILE}
https://coveralls.io/api/v1/jobs
DEPENDS coveralls_generate
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
COMMENT "Coveralls: uploading coveralls output...")
add_custom_target(coveralls DEPENDS coveralls_upload)
else()
message("COVERALLS UPLOAD: OFF")
add_custom_target(coveralls DEPENDS coveralls_generate)
endif()
endfunction()
if(ON_COVERALLS)
set(CMAKE_BUILD_TYPE "Debug")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
set(EXCLUDE_DIRS
"demo/"
"build/"
"tests/"
".test_env/"
)
if(WITH_GPU)
file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" ".c" "*.cu")
else()
file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c")
endif()
# exclude trivial files in PADDLE_SOURCES
foreach(EXCLUDE_DIR ${EXCLUDE_DIRS})
foreach(TMP_PATH ${PADDLE_SOURCES})
string(FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND)
if(NOT ${EXCLUDE_DIR_FOUND} EQUAL -1)
list(REMOVE_ITEM PADDLE_SOURCES ${TMP_PATH})
endif()
endforeach(TMP_PATH)
endforeach()
# convert to absolute path
set(PADDLE_SRCS "")
foreach(PADDLE_SRC ${PADDLE_SOURCES})
set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}")
endforeach()
code_coverage(
"${PADDLE_SRCS}"
${COVERALLS_UPLOAD}
"${PROJECT_SOURCE_DIR}/cmake"
)
endif()
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# Copyright (C) 2014 Joakim Söderberg <joakim.soderberg@gmail.com>
#
# This is intended to be run by a custom target in a CMake project like this.
# 0. Compile program with coverage support.
# 1. Clear coverage data. (Recursively delete *.gcda in build dir)
# 2. Run the unit tests.
# 3. Run this script specifying which source files the coverage should be performed on.
#
# This script will then use gcov to generate .gcov files in the directory specified
# via the COV_PATH var. This should probably be the same as your cmake build dir.
#
# It then parses the .gcov files to convert them into the Coveralls JSON format:
# https://coveralls.io/docs/api
#
CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
# Since it's not possible to pass a CMake list properly in the
# "1;2;3" format to an external process, we have replaced the
# ";" with "*", so reverse that here so we get it back into the
# CMake list format.
string(REGEX REPLACE "\\*" ";" COVERAGE_SRCS ${COVERAGE_SRCS})
find_program(GCOV_EXECUTABLE gcov)
if (NOT GCOV_EXECUTABLE)
message(FATAL_ERROR "gcov not found! Aborting...")
endif()
find_package(Git)
# TODO: Add these git things to the coveralls json.
if (GIT_FOUND)
# Branch.
execute_process(
COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_BRANCH
OUTPUT_STRIP_TRAILING_WHITESPACE
)
macro (git_log_format FORMAT_CHARS VAR_NAME)
execute_process(
COMMAND ${GIT_EXECUTABLE} log -1 --pretty=format:%${FORMAT_CHARS}
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE ${VAR_NAME}
OUTPUT_STRIP_TRAILING_WHITESPACE
)
endmacro()
git_log_format(an GIT_AUTHOR_EMAIL)
git_log_format(ae GIT_AUTHOR_EMAIL)
git_log_format(cn GIT_COMMITTER_NAME)
git_log_format(ce GIT_COMMITTER_EMAIL)
git_log_format(B GIT_COMMIT_MESSAGE)
message("Git exe: ${GIT_EXECUTABLE}")
message("Git branch: ${GIT_BRANCH}")
message("Git author: ${GIT_AUTHOR_NAME}")
message("Git e-mail: ${GIT_AUTHOR_EMAIL}")
message("Git commiter name: ${GIT_COMMITTER_NAME}")
message("Git commiter e-mail: ${GIT_COMMITTER_EMAIL}")
message("Git commit message: ${GIT_COMMIT_MESSAGE}")
endif()
############################# Macros #########################################
#
# This macro converts from the full path format gcov outputs:
#
# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
#
# to the original source file path the .gcov is for:
#
# /path/to/project/root/subdir/the_file.c
#
macro(get_source_path_from_gcov_filename _SRC_FILENAME _GCOV_FILENAME)
# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
# ->
# #path#to#project#root#subdir#the_file.c.gcov
get_filename_component(_GCOV_FILENAME_WEXT ${_GCOV_FILENAME} NAME)
# #path#to#project#root#subdir#the_file.c.gcov -> /path/to/project/root/subdir/the_file.c
string(REGEX REPLACE "\\.gcov$" "" SRC_FILENAME_TMP ${_GCOV_FILENAME_WEXT})
string(REGEX REPLACE "\#" "/" SRC_FILENAME_TMP ${SRC_FILENAME_TMP})
set(${_SRC_FILENAME} "${SRC_FILENAME_TMP}")
endmacro()
##############################################################################
# Get the coverage data.
file(GLOB_RECURSE GCDA_FILES "${COV_PATH}" "*.gcda")
message("GCDA files:")
# Get a list of all the object directories needed by gcov
# (The directories the .gcda files and .o files are found in)
# and run gcov on those.
foreach(GCDA ${GCDA_FILES})
message("Process: ${GCDA}")
message("------------------------------------------------------------------------------")
get_filename_component(GCDA_DIR ${GCDA} PATH)
#
# The -p below refers to "Preserve path components",
# This means that the generated gcov filename of a source file will
# keep the original files entire filepath, but / is replaced with #.
# Example:
#
# /path/to/project/root/build/CMakeFiles/the_file.dir/subdir/the_file.c.gcda
# ------------------------------------------------------------------------------
# File '/path/to/project/root/subdir/the_file.c'
# Lines executed:68.34% of 199
# /path/to/project/root/subdir/the_file.c:creating '#path#to#project#root#subdir#the_file.c.gcov'
#
# If -p is not specified then the file is named only "the_file.c.gcov"
#
execute_process(
COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA}
WORKING_DIRECTORY ${GCDA_DIR}
)
endforeach()
# TODO: Make these be absolute path
file(GLOB_RECURSE ALL_GCOV_FILES "${COV_PATH}" "*.gcov")
# Get only the filenames to use for filtering.
#set(COVERAGE_SRCS_NAMES "")
#foreach (COVSRC ${COVERAGE_SRCS})
# get_filename_component(COVSRC_NAME ${COVSRC} NAME)
# message("${COVSRC} -> ${COVSRC_NAME}")
# list(APPEND COVERAGE_SRCS_NAMES "${COVSRC_NAME}")
#endforeach()
#
# Filter out all but the gcov files we want.
#
# We do this by comparing the list of COVERAGE_SRCS filepaths that the
# user wants the coverage data for with the paths of the generated .gcov files,
# so that we only keep the relevant gcov files.
#
# Example:
# COVERAGE_SRCS =
# /path/to/project/root/subdir/the_file.c
#
# ALL_GCOV_FILES =
# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
# /path/to/project/root/build/#path#to#project#root#subdir#other_file.c.gcov
#
# Result should be:
# GCOV_FILES =
# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
#
set(GCOV_FILES "")
#message("Look in coverage sources: ${COVERAGE_SRCS}")
message("\nFilter out unwanted GCOV files:")
message("===============================")
set(COVERAGE_SRCS_REMAINING ${COVERAGE_SRCS})
foreach (GCOV_FILE ${ALL_GCOV_FILES})
#
# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
# ->
# /path/to/project/root/subdir/the_file.c
get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE})
# Is this in the list of source files?
# TODO: We want to match against relative path filenames from the source file root...
list(FIND COVERAGE_SRCS ${GCOV_SRC_PATH} WAS_FOUND)
if (NOT WAS_FOUND EQUAL -1)
message("YES: ${GCOV_FILE}")
list(APPEND GCOV_FILES ${GCOV_FILE})
# We remove it from the list, so we don't bother searching for it again.
# Also files left in COVERAGE_SRCS_REMAINING after this loop ends should
# have coverage data generated from them (no lines are covered).
list(REMOVE_ITEM COVERAGE_SRCS_REMAINING ${GCOV_SRC_PATH})
else()
message("NO: ${GCOV_FILE}")
endif()
endforeach()
# TODO: Enable setting these
set(JSON_SERVICE_NAME "travis-ci")
set(JSON_SERVICE_JOB_ID $ENV{TRAVIS_JOB_ID})
set(JSON_TEMPLATE
"{
\"service_name\": \"\@JSON_SERVICE_NAME\@\",
\"service_job_id\": \"\@JSON_SERVICE_JOB_ID\@\",
\"source_files\": \@JSON_GCOV_FILES\@
}"
)
set(SRC_FILE_TEMPLATE
"{
\"name\": \"\@GCOV_SRC_REL_PATH\@\",
\"source_digest\": \"\@GCOV_CONTENTS_MD5\@\",
\"coverage\": \@GCOV_FILE_COVERAGE\@
}"
)
message("\nGenerate JSON for files:")
message("=========================")
set(JSON_GCOV_FILES "[")
# Read the GCOV files line by line and get the coverage data.
foreach (GCOV_FILE ${GCOV_FILES})
get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE})
file(RELATIVE_PATH GCOV_SRC_REL_PATH "${PROJECT_ROOT}" "${GCOV_SRC_PATH}")
# The new coveralls API doesn't need the entire source (Yay!)
# However, still keeping that part for now. Will cleanup in the future.
file(MD5 "${GCOV_SRC_PATH}" GCOV_CONTENTS_MD5)
message("MD5: ${GCOV_SRC_PATH} = ${GCOV_CONTENTS_MD5}")
# Loads the gcov file as a list of lines.
# (We first open the file and replace all occurences of [] with _
# because CMake will fail to parse a line containing unmatched brackets...
# also the \ to escaped \n in macros screws up things.)
# https://public.kitware.com/Bug/view.php?id=15369
file(READ ${GCOV_FILE} GCOV_CONTENTS)
string(REPLACE "[" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
string(REPLACE "]" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
string(REPLACE "\\" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
file(WRITE ${GCOV_FILE}_tmp "${GCOV_CONTENTS}")
file(STRINGS ${GCOV_FILE}_tmp GCOV_LINES)
list(LENGTH GCOV_LINES LINE_COUNT)
# Instead of trying to parse the source from the
# gcov file, simply read the file contents from the source file.
# (Parsing it from the gcov is hard because C-code uses ; in many places
# which also happens to be the same as the CMake list delimeter).
file(READ ${GCOV_SRC_PATH} GCOV_FILE_SOURCE)
string(REPLACE "\\" "\\\\" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
string(REGEX REPLACE "\"" "\\\\\"" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
string(REPLACE "\t" "\\\\t" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
string(REPLACE "\r" "\\\\r" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
string(REPLACE "\n" "\\\\n" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
# According to http://json.org/ these should be escaped as well.
# Don't know how to do that in CMake however...
#string(REPLACE "\b" "\\\\b" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
#string(REPLACE "\f" "\\\\f" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
#string(REGEX REPLACE "\u([a-fA-F0-9]{4})" "\\\\u\\1" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
# We want a json array of coverage data as a single string
# start building them from the contents of the .gcov
set(GCOV_FILE_COVERAGE "[")
set(GCOV_LINE_COUNT 1) # Line number for the .gcov.
set(DO_SKIP 0)
foreach (GCOV_LINE ${GCOV_LINES})
#message("${GCOV_LINE}")
# Example of what we're parsing:
# Hitcount |Line | Source
# " 8: 26: if (!allowed || (strlen(allowed) == 0))"
string(REGEX REPLACE
"^([^:]*):([^:]*):(.*)$"
"\\1;\\2;\\3"
RES
"${GCOV_LINE}")
# Check if we should exclude lines using the Lcov syntax.
string(REGEX MATCH "LCOV_EXCL_START" START_SKIP "${GCOV_LINE}")
string(REGEX MATCH "LCOV_EXCL_END" END_SKIP "${GCOV_LINE}")
string(REGEX MATCH "LCOV_EXCL_LINE" LINE_SKIP "${GCOV_LINE}")
set(RESET_SKIP 0)
if (LINE_SKIP AND NOT DO_SKIP)
set(DO_SKIP 1)
set(RESET_SKIP 1)
endif()
if (START_SKIP)
set(DO_SKIP 1)
message("${GCOV_LINE_COUNT}: Start skip")
endif()
if (END_SKIP)
set(DO_SKIP 0)
endif()
list(LENGTH RES RES_COUNT)
if (RES_COUNT GREATER 2)
list(GET RES 0 HITCOUNT)
list(GET RES 1 LINE)
list(GET RES 2 SOURCE)
string(STRIP ${HITCOUNT} HITCOUNT)
string(STRIP ${LINE} LINE)
# Lines with 0 line numbers are metadata and can be ignored.
if (NOT ${LINE} EQUAL 0)
if (DO_SKIP)
set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ")
else()
# Translate the hitcount into valid JSON values.
if (${HITCOUNT} STREQUAL "#####")
set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ")
elseif (${HITCOUNT} STREQUAL "-")
set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ")
else()
set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}${HITCOUNT}, ")
endif()
endif()
endif()
else()
message(WARNING "Failed to properly parse line (RES_COUNT = ${RES_COUNT}) ${GCOV_FILE}:${GCOV_LINE_COUNT}\n-->${GCOV_LINE}")
endif()
if (RESET_SKIP)
set(DO_SKIP 0)
endif()
math(EXPR GCOV_LINE_COUNT "${GCOV_LINE_COUNT}+1")
endforeach()
message("${GCOV_LINE_COUNT} of ${LINE_COUNT} lines read!")
# Advanced way of removing the trailing comma in the JSON array.
# "[1, 2, 3, " -> "[1, 2, 3"
string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE})
# Append the trailing ] to complete the JSON array.
set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]")
# Generate the final JSON for this file.
message("Generate JSON for file: ${GCOV_SRC_REL_PATH}...")
string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON)
set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ")
endforeach()
# Loop through all files we couldn't find any coverage for
# as well, and generate JSON for those as well with 0% coverage.
foreach(NOT_COVERED_SRC ${COVERAGE_SRCS_REMAINING})
# Loads the source file as a list of lines.
file(STRINGS ${NOT_COVERED_SRC} SRC_LINES)
set(GCOV_FILE_COVERAGE "[")
set(GCOV_FILE_SOURCE "")
foreach (SOURCE ${SRC_LINES})
set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ")
string(REPLACE "\\" "\\\\" SOURCE "${SOURCE}")
string(REGEX REPLACE "\"" "\\\\\"" SOURCE "${SOURCE}")
string(REPLACE "\t" "\\\\t" SOURCE "${SOURCE}")
string(REPLACE "\r" "\\\\r" SOURCE "${SOURCE}")
set(GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}${SOURCE}\\n")
endforeach()
# Remove trailing comma, and complete JSON array with ]
string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE})
set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]")
# Generate the final JSON for this file.
message("Generate JSON for non-gcov file: ${NOT_COVERED_SRC}...")
string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON)
set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ")
endforeach()
# Get rid of trailing comma.
string(REGEX REPLACE ",[ ]*$" "" JSON_GCOV_FILES ${JSON_GCOV_FILES})
set(JSON_GCOV_FILES "${JSON_GCOV_FILES}]")
# Generate the final complete JSON!
message("Generate final JSON...")
string(CONFIGURE ${JSON_TEMPLATE} JSON)
file(WRITE "${COVERALLS_OUTPUT_FILE}" "${JSON}")
message("###########################################################################")
message("Generated coveralls JSON containing coverage data:")
message("${COVERALLS_OUTPUT_FILE}")
message("###########################################################################")
......@@ -21,12 +21,6 @@ function(safe_set_flag is_c src_list flag_name)
endif()
if(${safe_name})
set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE)
if(is_c)
set(CUDA_NVCC_FLAGS
--compiler-options;${flag_name}
${CUDA_NVCC_FLAGS}
PARENT_SCOPE)
endif()
endif()
endfunction()
......@@ -40,6 +34,20 @@ macro(safe_set_cxxflag src_list flag_name)
safe_set_flag(OFF ${src_list} ${flag_name})
endmacro()
# helper macro to set nvcc flag
macro(safe_set_nvflag flag_name)
string(REPLACE "-" "_" safe_name ${flag_name})
string(REPLACE "=" "_" safe_name ${safe_name})
CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
if(${safe_name})
set(CUDA_NVCC_FLAGS
--compiler-options;${flag_name}
${CUDA_NVCC_FLAGS})
endif()
endmacro()
CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
if(NOT UINT64_MAX_EXISTS)
set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS)
......@@ -63,14 +71,44 @@ set(COMMON_FLAGS
-Wnon-virtual-dtor
-Wdelete-non-virtual-dtor
-Wno-unused-parameter
-Wno-unused-function
-Wno-error=literal-suffix
-Wno-error=unused-local-typedefs)
set(GPU_COMMON_FLAGS
-fPIC
-fno-omit-frame-pointer
-Wnon-virtual-dtor
-Wdelete-non-virtual-dtor
-Wno-unused-parameter
-Wno-unused-function
-Wno-error=literal-suffix
-Wno-error=unused-local-typedefs
-Wno-error=unused-function # Warnings in Numpy Header.
)
if (APPLE)
# On Mac OS X build fat binaries with x86_64 architectures by default.
set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
else()
set(GPU_COMMON_FLAGS
-Wall
-Wextra
-Werror
${GPU_COMMON_FLAGS})
endif()
foreach(flag ${COMMON_FLAGS})
safe_set_cflag(CMAKE_C_FLAGS ${flag})
safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
endforeach()
foreach(flag ${GPU_COMMON_FLAGS})
safe_set_nvflag(${flag})
endforeach()
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
......@@ -88,15 +126,15 @@ endfunction()
# Common gpu architectures: Kepler, Maxwell
foreach(capability 30 35 50)
list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
endforeach()
if (CUDA_VERSION VERSION_GREATER "7.0")
if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")
list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
endif()
# Modern gpu architectures: Pascal
if (CUDA_VERSION VERSION_GREATER "8.0")
if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
endif()
......
# user should download rdma first from subversion repository
# execute following instruction to download svn mannally
# svn co https://svn.baidu.com/sys/ip/trunk/rdma/sockrdmav1 rdma/
# svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/
# we use static output in svn repositories to avoid implict bugs from not standard runtime env.
set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
function(generate_rdma_links)
#redirect to current DIR to isolate the pollution from system runtime environment
#it can benifits unified control for different gcc environment.
#e.g, by default gcc48 did not refer /usr/lib64 which could contain low version
#runtime libraries that will crash process while loading it. That redirect trick
#can fix it.
execute_process(
COMMAND mkdir -p librdma
COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
endfunction(generate_rdma_links)
#check and set headers
find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
#check and set libs
find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
if(
RDMA_INC_SXISOCK AND
RDMA_INC_XIO AND
RDMA_INC_EVENT AND
RDMA_INC_NUMA AND
RDMA_LIB_SXISOCK AND
RDMA_LIB_XIO AND
RDMA_LIB_EVENT AND
RDMA_LIB_EVENT_CORE AND
RDMA_LIB_EVENT_EXTRA AND
RDMA_LIB_EVENT_PTHREADS AND
RDMA_LIB_NUMA
)
set(RDMA_INC_DIR
${RDMA_INC_SXISOCK}
${RDMA_INC_XIO}
${RDMA_INC_EVENT}
${RDMA_INC_NUMA})
set(RDMA_LIBS
${RDMA_LIB_SXISOCK}
${RDMA_LIB_XIO}
${RDMA_LIB_EVENT}
${RDMA_LIB_EVENT_CORE}
${RDMA_LIB_EVENT_EXTRA}
${RDMA_LIB_EVENT_PTHREADS}
${RDMA_LIB_NUMA}
)
set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
return()
endif()
#if this module is not called, RDMA_INC_DIR RDMA_LIBS will be null, so top module always refer this variable
message(FATAL_ERROR, "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
find_program(
SWIG_BINARY_PATH
swig)
if(${SWIG_BINARY_PATH} STREQUAL "SWIG_BINARY_PATH-NOTFOUND")
set(SWIG_FOUND OFF)
else()
set(SWIG_FOUND ON)
endif()
set(MIN_SWIG_VERSION 2)
if(SWIG_FOUND)
execute_process(COMMAND sh -c "${SWIG_BINARY_PATH} -version | grep Version | cut -f3 -d' '"
OUTPUT_VARIABLE _SWIG_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(${_SWIG_VERSION} VERSION_LESS ${MIN_SWIG_VERSION})
message("swig version ${MIN_SWIG_VERSION} or greater is needed for generating python api. "
"Only version ${_SWIG_VERSION} is found. Set SWIG_FOUND to FALSE")
set(SWIG_FOUND FALSE)
endif(${_SWIG_VERSION} VERSION_LESS ${MIN_SWIG_VERSION})
endif(SWIG_FOUND)
function(generate_python_api target_name)
add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
${PROJ_ROOT}/paddle/Paddle_wrap.cxx
......@@ -27,6 +5,7 @@ function(generate_python_api target_name)
COMMAND swig -python -c++ -outcurrentdir -I../ api/Paddle.swig
&& mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig
${PROJ_ROOT}/paddle/api/PaddleAPI.h
WORKING_DIRECTORY ${PROJ_ROOT}/paddle
COMMENT "Generate Python API from swig")
add_custom_target(${target_name} ALL DEPENDS
......
......@@ -67,6 +67,10 @@ endmacro()
#
# It will handle WITH_PYTHON/WITH_GLOG etc.
function(link_paddle_exe TARGET_NAME)
if(WITH_RDMA)
generate_rdma_links()
endif()
if(WITH_METRIC)
if(WITH_GPU)
set(METRIC_LIBS paddle_metric_learning paddle_dserver_lib metric metric_cpu)
......@@ -109,6 +113,12 @@ function(link_paddle_exe TARGET_NAME)
${ZLIB_LIBRARIES}
${INTERAL_LIBS}
${CMAKE_DL_LIBS})
if(WITH_RDMA)
target_link_libraries(${TARGET_NAME}
${RDMA_LD_FLAGS}
${RDMA_LIBS})
endif()
if(WITH_PYTHON)
target_link_libraries(${TARGET_NAME}
......@@ -178,9 +188,18 @@ macro(add_simple_unittest TARGET_NAME)
add_unittest(${TARGET_NAME} ${TARGET_NAME}.cpp)
endmacro()
macro(add_paddle_culib TARGET_NAME)
set(NVCC_FLAG ${CUDA_NVCC_FLAGS})
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--use_fast_math)
cuda_add_library(${TARGET_NAME} STATIC ${ARGN})
set(CUDA_NVCC_FLAGS ${NVCC_FLAG})
endmacro()
# Creates C resources file from files in given resource file
function(create_resources res_file output)
# Create empty output file
file(WRITE ${output} "")
# Get short filename
string(REGEX MATCH "([^/]+)$" filename ${res_file})
# Replace filename spaces & extension separator for C compatibility
string(REGEX REPLACE "\\.| |-" "_" filename ${filename})
# Read hex data from file
file(READ ${res_file} filedata HEX)
# Convert hex data for C compatibility
string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," filedata ${filedata})
# Append data to output file
file(APPEND ${output} "const unsigned char ${filename}[] = {${filedata}};\nconst unsigned ${filename}_size = sizeof(${filename});\n")
endfunction()
......@@ -5,3 +5,5 @@ plot.png
train.log
image_provider_copy_1.py
*pyc
train.list
test.list
文件模式从 100644 更改为 100755
......@@ -16,7 +16,6 @@ import numpy as np
import sys
import os
import PIL.Image as Image
"""
Usage: python process_cifar input_dir output_dir
"""
......@@ -30,6 +29,7 @@ def mkdir_not_exist(path):
if not os.path.exists(path):
os.mkdir(path)
def create_dir_structure(output_dir):
"""
Create the directory structure for the directory.
......@@ -39,8 +39,8 @@ def create_dir_structure(output_dir):
mkdir_not_exist(os.path.join(output_dir, "train"))
mkdir_not_exist(os.path.join(output_dir, "test"))
def convert_batch(batch_path, label_set, label_map,
output_dir, data_split):
def convert_batch(batch_path, label_set, label_map, output_dir, data_split):
"""
Convert CIFAR batch to the structure of Paddle format.
batch_path: the batch to be converted.
......@@ -67,11 +67,23 @@ if __name__ == '__main__':
output_dir = sys.argv[2]
num_batch = 5
create_dir_structure(output_dir)
label_map = {0: "airplane", 1: "automobile", 2: "bird", 3: "cat", 4: "deer",
5: "dog", 6: "frog", 7: "horse", 8: "ship", 9: "truck"}
label_map = {
0: "airplane",
1: "automobile",
2: "bird",
3: "cat",
4: "deer",
5: "dog",
6: "frog",
7: "horse",
8: "ship",
9: "truck"
}
labels = {}
for i in range(1, num_batch + 1):
convert_batch(os.path.join(input_dir, "data_batch_%d" % i), labels,
label_map, output_dir, "train")
convert_batch(os.path.join(input_dir, "test_batch"), {},
label_map, output_dir, "test")
\ No newline at end of file
convert_batch(
os.path.join(input_dir, "data_batch_%d" % i), labels, label_map,
output_dir, "train")
convert_batch(
os.path.join(input_dir, "test_batch"), {}, label_map, output_dir,
"test")
......@@ -46,36 +46,41 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg,
settings.img_mean = image_util.load_meta(settings.meta_path,
settings.mean_img_size,
settings.img_size,
settings.color)
settings.img_size, settings.color)
settings.logger.info('Image size: %s', settings.img_size)
settings.logger.info('Meta path: %s', settings.meta_path)
settings.input_types = [
dense_vector(settings.img_raw_size), # image feature
integer_value(settings.num_classes)] # labels
integer_value(settings.num_classes)
] # labels
settings.logger.info('DataProvider Initialization finished')
@provider(init_hook=hook)
def processData(settings, file_name):
@provider(init_hook=hook, min_pool_size=0)
def processData(settings, file_list):
"""
The main function for loading data.
Load the batch, iterate all the images and labels in this batch.
file_name: the batch file name.
file_list: the batch file list.
"""
data = cPickle.load(io.open(file_name, 'rb'))
indexes = list(range(len(data['images'])))
if settings.is_train:
random.shuffle(indexes)
for i in indexes:
if settings.use_jpeg == 1:
img = image_util.decode_jpeg(data['images'][i])
else:
img = data['images'][i]
img_feat = image_util.preprocess_img(img, settings.img_mean,
settings.img_size, settings.is_train,
settings.color)
label = data['labels'][i]
yield img_feat.tolist(), int(label)
with open(file_list, 'r') as fdata:
lines = [line.strip() for line in fdata]
random.shuffle(lines)
for file_name in lines:
with io.open(file_name.strip(), 'rb') as file:
data = cPickle.load(file)
indexes = list(range(len(data['images'])))
if settings.is_train:
random.shuffle(indexes)
for i in indexes:
if settings.use_jpeg == 1:
img = image_util.decode_jpeg(data['images'][i])
else:
img = data['images'][i]
img_feat = image_util.preprocess_img(
img, settings.img_mean, settings.img_size,
settings.is_train, settings.color)
label = data['labels'][i]
yield img_feat.astype('float32'), int(label)
......@@ -16,17 +16,20 @@ import numpy as np
from PIL import Image
from cStringIO import StringIO
def resize_image(img, target_size):
"""
Resize an image so that the shorter edge has length target_size.
img: the input image to be resized.
target_size: the target resized image size.
"""
percent = (target_size/float(min(img.size[0], img.size[1])))
resized_size = int(round(img.size[0] * percent)), int(round(img.size[1] * percent))
percent = (target_size / float(min(img.size[0], img.size[1])))
resized_size = int(round(img.size[0] * percent)), int(
round(img.size[1] * percent))
img = img.resize(resized_size, Image.ANTIALIAS)
return img
def flip(im):
"""
Return the flipped image.
......@@ -38,6 +41,7 @@ def flip(im):
else:
return im[:, ::-1]
def crop_img(im, inner_size, color=True, test=True):
"""
Return cropped image.
......@@ -50,20 +54,22 @@ def crop_img(im, inner_size, color=True, test=True):
If True, crop the center of images.
"""
if color:
height, width = max(inner_size, im.shape[1]), max(inner_size, im.shape[2])
height, width = max(inner_size, im.shape[1]), max(inner_size,
im.shape[2])
padded_im = np.zeros((3, height, width))
startY = (height - im.shape[1]) / 2
startX = (width - im.shape[2]) / 2
endY, endX = startY + im.shape[1], startX + im.shape[2]
padded_im[:, startY: endY, startX: endX] = im
padded_im[:, startY:endY, startX:endX] = im
else:
im = im.astype('float32')
height, width = max(inner_size, im.shape[0]), max(inner_size, im.shape[1])
height, width = max(inner_size, im.shape[0]), max(inner_size,
im.shape[1])
padded_im = np.zeros((height, width))
startY = (height - im.shape[0]) / 2
startX = (width - im.shape[1]) / 2
endY, endX = startY + im.shape[0], startX + im.shape[1]
padded_im[startY: endY, startX: endX] = im
padded_im[startY:endY, startX:endX] = im
if test:
startY = (height - inner_size) / 2
startX = (width - inner_size) / 2
......@@ -72,19 +78,21 @@ def crop_img(im, inner_size, color=True, test=True):
startX = np.random.randint(0, width - inner_size + 1)
endY, endX = startY + inner_size, startX + inner_size
if color:
pic = padded_im[:, startY: endY, startX: endX]
pic = padded_im[:, startY:endY, startX:endX]
else:
pic = padded_im[startY: endY, startX: endX]
pic = padded_im[startY:endY, startX:endX]
if (not test) and (np.random.randint(2) == 0):
pic = flip(pic)
return pic
def decode_jpeg(jpeg_string):
np_array = np.array(Image.open(StringIO(jpeg_string)))
if len(np_array.shape) == 3:
np_array = np.transpose(np_array, (2, 0, 1))
return np_array
def preprocess_img(im, img_mean, crop_size, is_train, color=True):
"""
Does data augmentation for images.
......@@ -99,6 +107,7 @@ def preprocess_img(im, img_mean, crop_size, is_train, color=True):
pic -= img_mean
return pic.flatten()
def load_meta(meta_path, mean_img_size, crop_size, color=True):
"""
Return the loaded meta file.
......@@ -109,17 +118,18 @@ def load_meta(meta_path, mean_img_size, crop_size, color=True):
mean = np.load(meta_path)['data_mean']
border = (mean_img_size - crop_size) / 2
if color:
assert(mean_img_size * mean_img_size * 3 == mean.shape[0])
assert (mean_img_size * mean_img_size * 3 == mean.shape[0])
mean = mean.reshape(3, mean_img_size, mean_img_size)
mean = mean[:, border: border + crop_size,
border: border + crop_size].astype('float32')
mean = mean[:, border:border + crop_size, border:border +
crop_size].astype('float32')
else:
assert(mean_img_size * mean_img_size == mean.shape[0])
assert (mean_img_size * mean_img_size == mean.shape[0])
mean = mean.reshape(mean_img_size, mean_img_size)
mean = mean[border: border + crop_size,
border: border + crop_size].astype('float32')
mean = mean[border:border + crop_size, border:border +
crop_size].astype('float32')
return mean
def load_image(img_path, is_color=True):
"""
Load image and return.
......@@ -130,6 +140,7 @@ def load_image(img_path, is_color=True):
img.load()
return img
def oversample(img, crop_dims):
"""
image : iterable of (H x W x K) ndarrays
......@@ -152,50 +163,53 @@ def oversample(img, crop_dims):
for j in w_indices:
crops_ix[curr] = (i, j, i + crop_dims[0], j + crop_dims[1])
curr += 1
crops_ix[4] = np.tile(im_center, (1, 2)) + np.concatenate([
-crop_dims / 2.0,
crop_dims / 2.0
])
crops_ix[4] = np.tile(im_center, (1, 2)) + np.concatenate(
[-crop_dims / 2.0, crop_dims / 2.0])
crops_ix = np.tile(crops_ix, (2, 1))
# Extract crops
crops = np.empty((10 * len(img), crop_dims[0], crop_dims[1],
im_shape[-1]), dtype=np.float32)
crops = np.empty(
(10 * len(img), crop_dims[0], crop_dims[1], im_shape[-1]),
dtype=np.float32)
ix = 0
for im in img:
for crop in crops_ix:
crops[ix] = im[crop[0]:crop[2], crop[1]:crop[3], :]
ix += 1
crops[ix-5:ix] = crops[ix-5:ix, :, ::-1, :] # flip for mirrors
crops[ix - 5:ix] = crops[ix - 5:ix, :, ::-1, :] # flip for mirrors
return crops
class ImageTransformer:
def __init__(self, transpose = None,
channel_swap = None, mean = None, is_color = True):
def __init__(self,
transpose=None,
channel_swap=None,
mean=None,
is_color=True):
self.transpose = transpose
self.channel_swap = None
self.mean = None
self.is_color = is_color
self.is_color = is_color
def set_transpose(self, order):
def set_transpose(self, order):
if self.is_color:
assert 3 == len(order)
assert 3 == len(order)
self.transpose = order
def set_channel_swap(self, order):
def set_channel_swap(self, order):
if self.is_color:
assert 3 == len(order)
assert 3 == len(order)
self.channel_swap = order
def set_mean(self, mean):
# mean value, may be one value per channel
if mean.ndim == 1:
mean = mean[:, np.newaxis, np.newaxis]
else:
mean = mean[:, np.newaxis, np.newaxis]
else:
# elementwise mean
if self.is_color:
assert len(mean.shape) == 3
self.mean = mean
self.mean = mean
def transformer(self, data):
if self.transpose is not None:
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os,sys
import os, sys
import numpy as np
import logging
from PIL import Image
......@@ -24,9 +24,11 @@ from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.config_parser import parse_config
logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.getLogger().setLevel(logging.INFO)
class ImageClassifier():
def __init__(self,
train_conf,
......@@ -58,18 +60,19 @@ class ImageClassifier():
self.oversample = oversample
self.is_color = is_color
self.transformer = image_util.ImageTransformer(is_color = is_color)
self.transformer.set_transpose((2,0,1))
self.transformer = image_util.ImageTransformer(is_color=is_color)
self.transformer.set_transpose((2, 0, 1))
self.mean_file = mean_file
mean = np.load(self.mean_file)['data_mean']
mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1])
self.transformer.set_mean(mean) # mean pixel
self.transformer.set_mean(mean) # mean pixel
gpu = 1 if use_gpu else 0
conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (gpu)
conf = parse_config(train_conf, conf_args)
swig_paddle.initPaddle("--use_gpu=%d" % (gpu))
self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
assert isinstance(self.network, swig_paddle.GradientMachine)
self.network.loadParameters(self.model_dir)
......@@ -90,14 +93,14 @@ class ImageClassifier():
# image_util.resize_image: short side is self.resize_dim
image = image_util.resize_image(image, self.resize_dim)
image = np.array(image)
input = np.zeros((1, image.shape[0], image.shape[1], 3),
dtype=np.float32)
input = np.zeros(
(1, image.shape[0], image.shape[1], 3), dtype=np.float32)
input[0] = image.astype(np.float32)
input = image_util.oversample(input, self.crop_dims)
else:
image = image.resize(self.crop_dims, Image.ANTIALIAS)
input = np.zeros((1, self.crop_dims[0], self.crop_dims[1], 3),
dtype=np.float32)
input = np.zeros(
(1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32)
input[0] = np.array(image).astype(np.float32)
data_in = []
......@@ -133,22 +136,24 @@ class ImageClassifier():
lab = np.argsort(-prob)
logging.info("Label of %s is: %d", image, lab[0])
if __name__ == '__main__':
image_size=32
crop_size=32
multi_crop=True
config="vgg_16_cifar.py"
output_layer="__fc_layer_1__"
mean_path="data/cifar-out/batches/batches.meta"
model_path=sys.argv[1]
image=sys.argv[2]
use_gpu=bool(int(sys.argv[3]))
obj = ImageClassifier(train_conf=config,
model_dir=model_path,
resize_dim=image_size,
crop_dim=crop_size,
mean_file=mean_path,
use_gpu=use_gpu,
oversample=multi_crop)
image_size = 32
crop_size = 32
multi_crop = True
config = "vgg_16_cifar.py"
output_layer = "__fc_layer_1__"
mean_path = "data/cifar-out/batches/batches.meta"
model_path = sys.argv[1]
image = sys.argv[2]
use_gpu = bool(int(sys.argv[3]))
obj = ImageClassifier(
train_conf=config,
model_dir=model_path,
resize_dim=image_size,
crop_dim=crop_size,
mean_file=mean_path,
use_gpu=use_gpu,
oversample=multi_crop)
obj.predict(image, output_layer)
......@@ -19,22 +19,36 @@ from optparse import OptionParser
def option_parser():
parser = OptionParser(usage="usage: python preprcoess.py "\
"-i data_dir [options]")
parser.add_option("-i", "--input", action="store",
dest="input", help="Input data directory.")
parser.add_option("-s", "--size", action="store",
dest="size", help="Processed image size.")
parser.add_option("-c", "--color", action="store",
dest="color", help="whether to use color images.")
parser.add_option(
"-i",
"--input",
action="store",
dest="input",
help="Input data directory.")
parser.add_option(
"-s",
"--size",
action="store",
dest="size",
help="Processed image size.")
parser.add_option(
"-c",
"--color",
action="store",
dest="color",
help="whether to use color images.")
return parser.parse_args()
if __name__ == '__main__':
options, args = option_parser()
data_dir = options.input
processed_image_size = int(options.size)
color = options.color == "1"
data_creator = ImageClassificationDatasetCreater(data_dir,
processed_image_size,
color)
data_creator.num_per_batch = 1000
data_creator.overwrite = True
data_creator.create_batches()
options, args = option_parser()
data_dir = options.input
processed_image_size = int(options.size)
color = options.color == "1"
data_creator = ImageClassificationDatasetCreater(
data_dir, processed_image_size, color)
data_creator.train_list_name = "train.txt"
data_creator.test_list_name = "test.txt"
data_creator.num_per_batch = 1000
data_creator.overwrite = True
data_creator.create_batches()
......@@ -17,3 +17,6 @@ set -e
data_dir=./data/cifar-out
python preprocess.py -i $data_dir -s 32 -c 1
echo "data/cifar-out/batches/train.txt" > train.list
echo "data/cifar-out/batches/test.txt" > test.list
......@@ -24,7 +24,7 @@ paddle train \
--test_all_data_in_one_period=1 \
--use_gpu=1 \
--trainer_count=1 \
--num_passes=200 \
--num_passes=300 \
--save_dir=$output \
2>&1 | tee $log
......
......@@ -18,36 +18,38 @@ is_predict = get_config_arg("is_predict", bool, False)
####################Data Configuration ##################
if not is_predict:
data_dir='data/cifar-out/batches/'
meta_path=data_dir+'batches.meta'
args = {'meta':meta_path,'mean_img_size': 32,
'img_size': 32,'num_classes': 10,
'use_jpeg': 1,'color': "color"}
define_py_data_sources2(train_list=data_dir+"train.list",
test_list=data_dir+'test.list',
module='image_provider',
obj='processData',
args=args)
data_dir = 'data/cifar-out/batches/'
meta_path = data_dir + 'batches.meta'
args = {
'meta': meta_path,
'mean_img_size': 32,
'img_size': 32,
'num_classes': 10,
'use_jpeg': 1,
'color': "color"
}
define_py_data_sources2(
train_list="train.list",
test_list="train.list",
module='image_provider',
obj='processData',
args=args)
######################Algorithm Configuration #############
settings(
batch_size = 128,
learning_rate = 0.1 / 128.0,
learning_method = MomentumOptimizer(0.9),
regularization = L2Regularization(0.0005 * 128)
)
batch_size=128,
learning_rate=0.1 / 128.0,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * 128))
#######################Network Configuration #############
data_size=3*32*32
label_size=10
img = data_layer(name='image',
size=data_size)
# small_vgg is predined in trainer_config_helpers.network
predict = small_vgg(input_image=img,
num_channels=3,
num_classes=label_size)
data_size = 3 * 32 * 32
label_size = 10
img = data_layer(name='image', size=data_size)
# small_vgg is predefined in trainer_config_helpers.networks
predict = small_vgg(input_image=img, num_channels=3, num_classes=label_size)
if not is_predict:
lbl = data_layer(name="label", size=label_size)
......
This folder contains scripts used in PaddlePaddle introduction.
- use `bash train.sh` to train a simple linear regression model
- use `python evaluate_model.py` to read model parameters. You can see that `w` and `b` are very close to [2, 0.3].
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import *
import random
# define data types of input: 2 real numbers
@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False)
def process(settings, input_file):
for i in xrange(2000):
x = random.random()
yield [x], [2 * x + 0.3]
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Print model parameters in last model
Usage:
python evaluate_model.py
"""
import numpy as np
import os
def load(file_name):
with open(file_name, 'rb') as f:
f.read(16) # skip header for float type.
return np.fromfile(f, dtype=np.float32)
def main():
print 'w=%.6f, b=%.6f from pass 29' % (load('output/pass-00029/w'),
load('output/pass-00029/b'))
if __name__ == '__main__':
main()
#!/bin/bash
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
paddle train \
--config=trainer_config.py \
--save_dir=./output \
--num_passes=30 \
2>&1 |tee 'train.log'
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
# 1. read data. Suppose you saved above python code as dataprovider.py
data_file = 'empty.list'
with open(data_file, 'w') as f:
f.writelines(' ')
define_py_data_sources2(
train_list=data_file,
test_list=None,
module='dataprovider',
obj='process',
args={})
# 2. learning algorithm
settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
# 3. Network configuration
x = data_layer(name='x', size=1)
y = data_layer(name='y', size=1)
y_predict = fc_layer(
input=x,
param_attr=ParamAttr(name='w'),
size=1,
act=LinearActivation(),
bias_attr=ParamAttr(name='b'))
cost = regression_cost(input=y_predict, label=y)
outputs(cost)
data/raw_data
data/*.list
mnist_vgg_model
plot.png
train.log
*pyc
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
o = open("./" + "train.list", "w")
o.write("./data/raw_data/train" + "\n")
o.close()
o = open("./" + "test.list", "w")
o.write("./data/raw_data/t10k" + "\n")
o.close()
#!/usr/bin/env sh
# This scripts downloads the mnist data and unzips it.
set -e
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
rm -rf "$DIR/raw_data"
mkdir "$DIR/raw_data"
cd "$DIR/raw_data"
echo "Downloading..."
for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
do
if [ ! -e $fname ]; then
wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
gunzip ${fname}.gz
fi
done
cd $DIR
rm -f *.list
python generate_list.py
from paddle.trainer.PyDataProvider2 import *
# Define a py data provider
@provider(
input_types={'pixel': dense_vector(28 * 28),
'label': integer_value(10)})
def process(settings, filename): # settings is not used currently.
imgf = filename + "-images-idx3-ubyte"
labelf = filename + "-labels-idx1-ubyte"
f = open(imgf, "rb")
l = open(labelf, "rb")
f.read(16)
l.read(8)
# Define number of samples for train/test
if "train" in filename:
n = 60000
else:
n = 10000
for i in range(n):
label = ord(l.read(1))
pixels = []
for j in range(28 * 28):
pixels.append(float(ord(f.read(1))) / 255.0)
yield {"pixel": pixels, 'label': label}
f.close()
l.close()
#!/bin/bash
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
config=vgg_16_mnist.py
output=./mnist_vgg_model
log=train.log
paddle train \
--config=$config \
--dot_period=10 \
--log_period=100 \
--test_all_data_in_one_period=1 \
--use_gpu=0 \
--trainer_count=1 \
--num_passes=100 \
--save_dir=$output \
2>&1 | tee $log
python -m paddle.utils.plotcurve -i $log > plot.png
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
is_predict = get_config_arg("is_predict", bool, False)
####################Data Configuration ##################
if not is_predict:
data_dir = './data/'
define_py_data_sources2(
train_list=data_dir + 'train.list',
test_list=data_dir + 'test.list',
module='mnist_provider',
obj='process')
######################Algorithm Configuration #############
settings(
batch_size=128,
learning_rate=0.1 / 128.0,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * 128))
#######################Network Configuration #############
data_size = 1 * 28 * 28
label_size = 10
img = data_layer(name='pixel', size=data_size)
# small_vgg is predined in trainer_config_helpers.network
predict = small_vgg(input_image=img, num_channels=1, num_classes=label_size)
if not is_predict:
lbl = data_layer(name="label", size=label_size)
inputs(img, lbl)
outputs(classification_cost(input=predict, label=lbl))
else:
outputs(predict)
......@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Example:
python extract_para.py --preModel PREMODEL --preDict PREDICT \
......@@ -29,6 +28,7 @@ Options:
from optparse import OptionParser
import struct
def get_row_index(preDict, usrDict):
"""
Get the row positions for all words in user dictionary from pre-trained dictionary.
......@@ -47,7 +47,9 @@ def get_row_index(preDict, usrDict):
pos.append(index[word])
return pos
def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict, paraDim):
def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict,
paraDim):
"""
Extract desired parameters from a pretrained embedding model based on user dictionary
"""
......@@ -70,6 +72,7 @@ def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict, paraDim)
print "extract parameters finish, total", len(rowIndex), "lines"
fi.close()
def main():
"""
Main entry for running paraconvert.py
......@@ -78,19 +81,33 @@ def main():
"python %prog --preModel PREMODEL --preDict PREDICT" \
" --usrModel USRMODEL --usrDict USRDICT -d DIM"
parser = OptionParser(usage)
parser.add_option("--preModel", action="store", dest="preModel",
help="the name of pretrained embedding model")
parser.add_option("--preDict", action="store", dest="preDict",
help="the name of pretrained dictionary")
parser.add_option("--usrModel", action="store", dest="usrModel",
help="the name of output usr embedding model")
parser.add_option("--usrDict", action="store", dest="usrDict",
help="the name of user specified dictionary")
parser.add_option("-d", action="store", dest="dim",
help="dimension of parameter")
parser.add_option(
"--preModel",
action="store",
dest="preModel",
help="the name of pretrained embedding model")
parser.add_option(
"--preDict",
action="store",
dest="preDict",
help="the name of pretrained dictionary")
parser.add_option(
"--usrModel",
action="store",
dest="usrModel",
help="the name of output usr embedding model")
parser.add_option(
"--usrDict",
action="store",
dest="usrDict",
help="the name of user specified dictionary")
parser.add_option(
"-d", action="store", dest="dim", help="dimension of parameter")
(options, args) = parser.parse_args()
extract_parameters_by_usrDict(options.preModel, options.preDict,
options.usrModel, options.usrDict, int(options.dim))
extract_parameters_by_usrDict(options.preModel, options.preDict,
options.usrModel, options.usrDict,
int(options.dim))
if __name__ == '__main__':
main()
......@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Example:
python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM
......@@ -29,6 +28,7 @@ Options:
from optparse import OptionParser
import struct
def binary2text(input, output, paraDim):
"""
Convert a binary parameter file of embedding model to be a text file.
......@@ -76,12 +76,13 @@ def binary2text(input, output, paraDim):
fo.close()
print "binary2text finish, total", line, "lines"
def get_para_count(input):
"""
Compute the total number of embedding parameters in input text file.
input: the name of input text file
"""
numRows = 1
numRows = 1
paraDim = 0
with open(input) as f:
line = f.readline()
......@@ -90,6 +91,7 @@ def get_para_count(input):
numRows += 1
return numRows * paraDim
def text2binary(input, output, paddle_head=True):
"""
Convert a text parameter file of embedding model to be a binary file.
......@@ -123,6 +125,7 @@ def text2binary(input, output, paddle_head=True):
fo.close()
print "text2binary finish, total", count, "lines"
def main():
"""
Main entry for running paraconvert.py
......@@ -131,21 +134,26 @@ def main():
"python %prog --b2t -i INPUT -o OUTPUT -d DIM \n" \
"python %prog --t2b -i INPUT -o OUTPUT"
parser = OptionParser(usage)
parser.add_option("--b2t", action="store_true",
help="convert parameter file of embedding model from binary to text")
parser.add_option("--t2b", action="store_true",
help="convert parameter file of embedding model from text to binary")
parser.add_option("-i", action="store", dest="input",
help="input parameter file name")
parser.add_option("-o", action="store", dest="output",
help="output parameter file name")
parser.add_option("-d", action="store", dest="dim",
help="dimension of parameter")
parser.add_option(
"--b2t",
action="store_true",
help="convert parameter file of embedding model from binary to text")
parser.add_option(
"--t2b",
action="store_true",
help="convert parameter file of embedding model from text to binary")
parser.add_option(
"-i", action="store", dest="input", help="input parameter file name")
parser.add_option(
"-o", action="store", dest="output", help="output parameter file name")
parser.add_option(
"-d", action="store", dest="dim", help="dimension of parameter")
(options, args) = parser.parse_args()
if options.b2t:
binary2text(options.input, options.output, options.dim)
if options.t2b:
text2binary(options.input, options.output)
if __name__ == '__main__':
main()
......@@ -18,7 +18,5 @@ set -x
# download the dictionary and pretrained model
for file in baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb
do
# following is the google drive address
# you can also directly download from https://pan.baidu.com/s/1o8q577s
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/$file --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/$file
done
......@@ -26,16 +26,22 @@ from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.config_parser import parse_config
logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.getLogger().setLevel(logging.INFO)
class ImageClassifier():
def __init__(self, train_conf, model_dir=None,
resize_dim=256, crop_dim=224,
def __init__(self,
train_conf,
model_dir=None,
resize_dim=256,
crop_dim=224,
use_gpu=True,
mean_file=None,
output_layer=None,
oversample=False, is_color=True):
oversample=False,
is_color=True):
"""
train_conf: network configure.
model_dir: string, directory of model.
......@@ -62,24 +68,25 @@ class ImageClassifier():
assert isinstance(self.output_layer, basestring)
self.output_layer = self.output_layer.split(",")
self.transformer = image_util.ImageTransformer(is_color = is_color)
self.transformer.set_transpose((2,0,1))
self.transformer.set_channel_swap((2,1,0))
self.transformer = image_util.ImageTransformer(is_color=is_color)
self.transformer.set_transpose((2, 0, 1))
self.transformer.set_channel_swap((2, 1, 0))
self.mean_file = mean_file
if self.mean_file is not None:
mean = np.load(self.mean_file)['data_mean']
mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1])
self.transformer.set_mean(mean) # mean pixel
self.transformer.set_mean(mean) # mean pixel
else:
# if you use three mean value, set like:
# this three mean value is calculated from ImageNet.
self.transformer.set_mean(np.array([103.939,116.779,123.68]))
self.transformer.set_mean(np.array([103.939, 116.779, 123.68]))
conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (int(use_gpu))
conf = parse_config(train_conf, conf_args)
swig_paddle.initPaddle("--use_gpu=%d" % (int(use_gpu)))
self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
assert isinstance(self.network, swig_paddle.GradientMachine)
self.network.loadParameters(self.model_dir)
......@@ -105,14 +112,14 @@ class ImageClassifier():
# image_util.resize_image: short side is self.resize_dim
image = image_util.resize_image(image, self.resize_dim)
image = np.array(image)
input = np.zeros((1, image.shape[0], image.shape[1], 3),
dtype=np.float32)
input = np.zeros(
(1, image.shape[0], image.shape[1], 3), dtype=np.float32)
input[0] = image.astype(np.float32)
input = image_util.oversample(input, self.crop_dims)
else:
image = image.resize(self.crop_dims, Image.ANTIALIAS)
input = np.zeros((1, self.crop_dims[0], self.crop_dims[1], 3),
dtype=np.float32)
input = np.zeros(
(1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32)
input[0] = np.array(image).astype(np.float32)
data_in = []
......@@ -172,7 +179,7 @@ class ImageClassifier():
logging.info("Label of %s is: %d", image, lab[0])
return results
def extract(self, data_file, output_dir, batch_size = 10000):
def extract(self, data_file, output_dir, batch_size=10000):
"""
extract and save features of output layers, which are
specify in Outputs() in network configure.
......@@ -197,7 +204,7 @@ class ImageClassifier():
image_feature[file_name] = feature
sample_num += 1
if sample_num == batch_size:
batch_name = os.path.join(output_dir, 'batch_%d' %(batch_num))
batch_name = os.path.join(output_dir, 'batch_%d' % (batch_num))
self.save_file(image_feature, batch_name)
logging.info('Finish batch %d', batch_num)
batch_num += 1
......@@ -206,7 +213,7 @@ class ImageClassifier():
if idx % 1000 == 0:
logging.info('%d/%d, %s', idx, len(image_files), file_name)
if sample_num > 0:
batch_name = os.path.join(output_dir, 'batch_%d' %(batch_num))
batch_name = os.path.join(output_dir, 'batch_%d' % (batch_num))
self.save_file(image_feature, batch_name)
logging.info('Finish batch %d', batch_num)
logging.info('Done: make image feature batch')
......@@ -215,38 +222,64 @@ class ImageClassifier():
of = open(file, 'wb')
cPickle.dump(data, of, protocol=cPickle.HIGHEST_PROTOCOL)
def option_parser():
"""
Main entry for predciting
"""
usage = "%prog -c config -i data_list -w model_dir [options]"
parser = OptionParser(usage="usage: %s" % usage)
parser.add_option("-j", "--job",
action="store", dest="job_type",
help="job type: predict, extract\
parser.add_option(
"-j",
"--job",
action="store",
dest="job_type",
help="job type: predict, extract\
predict: predicting,\
extract: extract features")
parser.add_option("-c", "--conf",
action="store", dest="train_conf",
help="network config")
parser.add_option("-i", "--data",
action="store", dest="data_file",
help="image list")
parser.add_option("-w", "--model",
action="store", dest="model_path",
default=None, help="model path")
parser.add_option("-g", "--use_gpu", action="store",
dest="use_gpu", default=True,
help="Whether to use gpu mode.")
parser.add_option("-o", "--output_dir",
action="store", dest="output_dir",
default="output", help="output path")
parser.add_option("-m", "--mean", action="store",
dest="mean", default=None,
help="mean file.")
parser.add_option("-p", "--multi_crop", action="store_true",
dest="multi_crop", default=False,
help="Wether to use multiple crops on image.")
parser.add_option(
"-c",
"--conf",
action="store",
dest="train_conf",
help="network config")
parser.add_option(
"-i", "--data", action="store", dest="data_file", help="image list")
parser.add_option(
"-w",
"--model",
action="store",
dest="model_path",
default=None,
help="model path")
parser.add_option(
"-g",
"--use_gpu",
action="store",
dest="use_gpu",
default=True,
help="Whether to use gpu mode.")
parser.add_option(
"-o",
"--output_dir",
action="store",
dest="output_dir",
default="output",
help="output path")
parser.add_option(
"-m",
"--mean",
action="store",
dest="mean",
default=None,
help="mean file.")
parser.add_option(
"-p",
"--multi_crop",
action="store_true",
dest="multi_crop",
default=False,
help="Wether to use multiple crops on image.")
parser.add_option("-l", "--output_layer", action="store",
dest="output_layer", default=None,
help="--job=extract, specify layers to extract "\
......@@ -254,24 +287,26 @@ def option_parser():
"classification probability, output in resnet.py.")
return parser.parse_args()
def main():
"""
1. parse input arguments.
2. predicting or extract features according job type.
"""
options, args = option_parser()
obj = ImageClassifier(options.train_conf,
options.model_path,
use_gpu=options.use_gpu,
mean_file=options.mean,
output_layer=options.output_layer,
oversample=options.multi_crop)
obj = ImageClassifier(
options.train_conf,
options.model_path,
use_gpu=options.use_gpu,
mean_file=options.mean,
output_layer=options.output_layer,
oversample=options.multi_crop)
if options.job_type == "predict":
obj.predict(options.data_file)
elif options.job_type == "extract":
obj.extract(options.data_file,
options.output_dir)
obj.extract(options.data_file, options.output_dir)
if __name__ == '__main__':
main()
......@@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -16,8 +16,7 @@ from paddle.utils.image_util import *
from paddle.trainer.PyDataProvider2 import *
def hook(settings, image_size, crop_size, color, file_list,
is_train, **kwargs):
def hook(settings, image_size, crop_size, color, file_list, is_train, **kwargs):
"""
Description: Init with a list of data file
file_list is the name list of input files.
......@@ -58,7 +57,7 @@ def hook(settings, image_size, crop_size, color, file_list,
sz = settings.crop_size * settings.crop_size
settings.img_mean = np.zeros(sz * 3, dtype=np.single)
for idx, value in enumerate(settings.mean_value):
settings.img_mean[idx * sz: (idx + 1) * sz] = value
settings.img_mean[idx * sz:(idx + 1) * sz] = value
settings.img_mean = settings.img_mean.reshape(3, settings.crop_size,
settings.crop_size)
......@@ -69,7 +68,8 @@ def hook(settings, image_size, crop_size, color, file_list,
settings.input_types = [
dense_vector(settings.img_input_size), # image feature
integer_value(1)] # labels
integer_value(1)
] # labels
settings.logger.info('Image short side: %s', settings.img_size)
settings.logger.info('Crop size: %s', settings.crop_size)
......@@ -97,9 +97,6 @@ def processData(settings, file_list):
# swap channel
if settings.is_swap_channel:
img = img[settings.swap_channel, :, :]
img_feat = preprocess_img(img,
settings.img_mean,
settings.crop_size,
settings.is_train,
settings.color)
img_feat = preprocess_img(img, settings.img_mean, settings.crop_size,
settings.is_train, settings.color)
yield img_feat.tolist(), int(lab.strip())
......@@ -24,9 +24,7 @@ echo "Downloading ResNet models..."
for file in resnet_50.tar.gz resnet_101.tar.gz resnet_152.tar.gz mean_meta_224.tar.gz
do
# following is the google drive address
# you can also directly download from https://pan.baidu.com/s/1o8q577s
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/imagenet/$file --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/model_zoo/imagenet/$file
tar -xvf $file
rm $file
done
......
......@@ -17,9 +17,11 @@ import sys
import cPickle
import logging
logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.getLogger().setLevel(logging.INFO)
def load_feature_c(file):
"""
Load feature extracted by C++ interface.
......@@ -30,14 +32,15 @@ def load_feature_c(file):
f = open(file, 'r')
for line in f:
sample = []
for slot in line.strip().split(";"):
fea = [float(val) for val in slot.strip().split()]
for slot in line.strip().split(";"):
fea = [float(val) for val in slot.strip().split()]
if fea:
sample.append(fea)
features.append(sample)
f.close()
return features
def load_feature_py(feature_dir):
"""
Load feature extracted by python interface.
......@@ -54,6 +57,7 @@ def load_feature_py(feature_dir):
logging.info('Load feature file %s', file_name)
return features
if __name__ == '__main__':
print load_feature_py(sys.argv[1])
print load_feature_py(sys.argv[1])
#print load_feature_c(sys.argv[1])
......@@ -13,7 +13,6 @@
# limitations under the License.
from paddle.trainer_config_helpers import *
"""
paper: https://arxiv.org/abs/1512.03385
"""
......@@ -28,15 +27,19 @@ if not is_predict and data_provider:
# mean.meta size : 3 x 224 x 224.
# If you use three mean value, set like:
# "mean_value:103.939,116.779,123.68;"
args={
args = {
'mean_meta': "model/mean_meta_224/mean.meta",
'image_size': 224, 'crop_size': 224,
'color': True,'swap_channel:': [2, 1, 0]}
define_py_data_sources2(train_list,
'example/test.list',
module="example.image_list_provider",
obj="processData",
args=args)
'image_size': 224,
'crop_size': 224,
'color': True,
'swap_channel:': [2, 1, 0]
}
define_py_data_sources2(
train_list,
'example/test.list',
module="example.image_list_provider",
obj="processData",
args=args)
batch_size = 1
learning_rate = 0.1 / batch_size
......@@ -54,12 +57,16 @@ Settings(
learning_method='momentum',
learning_rate_decay_a=0.5,
learning_rate_decay_b=1200000 * 10,
learning_rate_schedule="discexp",
)
learning_rate_schedule="discexp", )
def conv_bn_layer(name, input, filter_size, num_filters,
stride, padding, channels=None,
def conv_bn_layer(name,
input,
filter_size,
num_filters,
stride,
padding,
channels=None,
active_type=ReluActivation()):
"""
A wrapper for conv layer with batch normalization layers.
......@@ -67,19 +74,18 @@ def conv_bn_layer(name, input, filter_size, num_filters,
conv layer has no activation.
"""
tmp = img_conv_layer(name=name + "_conv",
input=input,
filter_size=filter_size,
num_channels=channels,
num_filters=num_filters,
stride=stride,
padding=padding,
act=LinearActivation(),
bias_attr=False)
return batch_norm_layer(name=name + "_bn",
input=tmp,
act=active_type,
use_global_stats=is_test)
tmp = img_conv_layer(
name=name + "_conv",
input=input,
filter_size=filter_size,
num_channels=channels,
num_filters=num_filters,
stride=stride,
padding=padding,
act=LinearActivation(),
bias_attr=False)
return batch_norm_layer(
name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test)
def bottleneck_block(name, input, num_filters1, num_filters2):
......@@ -88,29 +94,31 @@ def bottleneck_block(name, input, num_filters1, num_filters2):
Last conv_bn_layer has no activation.
Addto layer has activation of relu.
"""
last_name = conv_bn_layer(name=name + '_branch2a',
input=input,
filter_size=1,
num_filters=num_filters1,
stride=1,
padding=0)
last_name = conv_bn_layer(name=name + '_branch2b',
input=last_name,
filter_size=3,
num_filters=num_filters1,
stride=1,
padding=1)
last_name = conv_bn_layer(name=name + '_branch2c',
input=last_name,
filter_size=1,
num_filters=num_filters2,
stride=1,
padding=0,
active_type=LinearActivation())
return addto_layer(name=name + "_addto",
input=[input, last_name],
act=ReluActivation())
last_name = conv_bn_layer(
name=name + '_branch2a',
input=input,
filter_size=1,
num_filters=num_filters1,
stride=1,
padding=0)
last_name = conv_bn_layer(
name=name + '_branch2b',
input=last_name,
filter_size=3,
num_filters=num_filters1,
stride=1,
padding=1)
last_name = conv_bn_layer(
name=name + '_branch2c',
input=last_name,
filter_size=1,
num_filters=num_filters2,
stride=1,
padding=0,
active_type=LinearActivation())
return addto_layer(
name=name + "_addto", input=[input, last_name], act=ReluActivation())
def mid_projection(name, input, num_filters1, num_filters2, stride=2):
......@@ -123,38 +131,41 @@ def mid_projection(name, input, num_filters1, num_filters2, stride=2):
branch2x: bottleneck building block, shortcuts are identity.
"""
# stride = 2
branch1 = conv_bn_layer(name=name + '_branch1',
input=input,
filter_size=1,
num_filters=num_filters2,
stride=stride,
padding=0,
active_type=LinearActivation())
last_name = conv_bn_layer(name=name + '_branch2a',
input=input,
filter_size=1,
num_filters=num_filters1,
stride=stride,
padding=0)
last_name = conv_bn_layer(name=name + '_branch2b',
input=last_name,
filter_size=3,
num_filters=num_filters1,
stride=1,
padding=1)
last_name = conv_bn_layer(name=name + '_branch2c',
input=last_name,
filter_size=1,
num_filters=num_filters2,
stride=1,
padding=0,
active_type=LinearActivation())
return addto_layer(name=name + "_addto",
input=[branch1, last_name],
act=ReluActivation())
branch1 = conv_bn_layer(
name=name + '_branch1',
input=input,
filter_size=1,
num_filters=num_filters2,
stride=stride,
padding=0,
active_type=LinearActivation())
last_name = conv_bn_layer(
name=name + '_branch2a',
input=input,
filter_size=1,
num_filters=num_filters1,
stride=stride,
padding=0)
last_name = conv_bn_layer(
name=name + '_branch2b',
input=last_name,
filter_size=3,
num_filters=num_filters1,
stride=1,
padding=1)
last_name = conv_bn_layer(
name=name + '_branch2c',
input=last_name,
filter_size=1,
num_filters=num_filters2,
stride=1,
padding=0,
active_type=LinearActivation())
return addto_layer(
name=name + "_addto", input=[branch1, last_name], act=ReluActivation())
def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
......@@ -168,67 +179,67 @@ def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
# For ImageNet
# conv1: 112x112
img = data_layer(name='input', size=224 * 224 * 3)
tmp = conv_bn_layer("conv1", img,
filter_size=7,
channels=3,
num_filters=64,
stride=2,
padding=3)
tmp = conv_bn_layer(
"conv1",
img,
filter_size=7,
channels=3,
num_filters=64,
stride=2,
padding=3)
tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)
# conv2_x: 56x56
tmp = mid_projection(name="res2_1",
input=tmp,
num_filters1=64,
num_filters2=256,
stride=1)
tmp = mid_projection(
name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
for i in xrange(2, res2_num + 1, 1):
tmp = bottleneck_block(name="res2_" + str(i),
input=tmp,
num_filters1=64,
num_filters2=256)
tmp = bottleneck_block(
name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)
# conv3_x: 28x28
tmp = mid_projection(name="res3_1",
input=tmp,
num_filters1=128,
num_filters2=512)
tmp = mid_projection(
name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
for i in xrange(2, res3_num + 1, 1):
tmp = bottleneck_block(name="res3_" + str(i),
input=tmp, num_filters1=128,
num_filters2=512)
tmp = bottleneck_block(
name="res3_" + str(i),
input=tmp,
num_filters1=128,
num_filters2=512)
# conv4_x: 14x14
tmp = mid_projection(name="res4_1", input=tmp,
num_filters1=256, num_filters2=1024)
tmp = mid_projection(
name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
for i in xrange(2, res4_num + 1, 1):
tmp = bottleneck_block(name="res4_" + str(i),
input=tmp,
num_filters1=256,
num_filters2=1024)
tmp = bottleneck_block(
name="res4_" + str(i),
input=tmp,
num_filters1=256,
num_filters2=1024)
# conv5_x: 7x7
tmp = mid_projection(name="res5_1", input=tmp,
num_filters1=512, num_filters2=2048)
tmp = mid_projection(
name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
for i in xrange(2, res5_num + 1, 1):
tmp = bottleneck_block(name="res5_" + str(i),
input=tmp, num_filters1=512,
num_filters2=2048)
tmp = img_pool_layer(name='avgpool',
input=tmp,
pool_size=7,
stride=1,
pool_type=AvgPooling())
output = fc_layer(name='output',
input=tmp,
size=1000,
act=SoftmaxActivation())
tmp = bottleneck_block(
name="res5_" + str(i),
input=tmp,
num_filters1=512,
num_filters2=2048)
tmp = img_pool_layer(
name='avgpool',
input=tmp,
pool_size=7,
stride=1,
pool_type=AvgPooling())
output = fc_layer(
name='output', input=tmp, size=1000, act=SoftmaxActivation())
if not is_predict:
classification_cost(input=output, label=data_layer(name='label',
size=1))
classification_cost(
input=output, label=data_layer(
name='label', size=1))
def res_net_50():
......
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import itertools
import random
from paddle.trainer.config_parser import parse_config
from py_paddle import swig_paddle as api
from py_paddle import DataProviderConverter
from paddle.trainer.PyDataProvider2 \
import integer_value, integer_value_sequence, sparse_binary_vector
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument(
"--train_data", type=str, required=False, help="train data file")
parser.add_argument("--test_data", type=str, help="test data file")
parser.add_argument(
"--config", type=str, required=True, help="config file name")
parser.add_argument("--dict_file", required=True, help="dictionary file")
parser.add_argument(
"--seq", default=1, type=int, help="whether use sequence training")
parser.add_argument(
"--use_gpu", default=0, type=int, help="whether use GPU for training")
parser.add_argument(
"--trainer_count",
default=1,
type=int,
help="Number of threads for training")
parser.add_argument(
"--num_passes", default=5, type=int, help="Number of training passes")
return parser.parse_args()
UNK_IDX = 0
def load_data(file_name, word_dict):
with open(file_name, 'r') as f:
for line in f:
label, comment = line.strip().split('\t')
words = comment.split()
word_slot = [word_dict.get(w, UNK_IDX) for w in words]
yield word_slot, int(label)
def load_dict(dict_file):
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
return word_dict
def main():
options = parse_arguments()
api.initPaddle("--use_gpu=%s" % options.use_gpu,
"--trainer_count=%s" % options.trainer_count)
word_dict = load_dict(options.dict_file)
train_dataset = list(load_data(options.train_data, word_dict))
if options.test_data:
test_dataset = list(load_data(options.test_data, word_dict))
else:
test_dataset = None
trainer_config = parse_config(options.config,
"dict_file=%s" % options.dict_file)
# No need to have data provider for trainer
trainer_config.ClearField('data_config')
trainer_config.ClearField('test_data_config')
# create a GradientMachine from the model configuratin
model = api.GradientMachine.createFromConfigProto(
trainer_config.model_config)
# create a trainer for the gradient machine
trainer = api.Trainer.create(trainer_config, model)
# create a data converter which converts data to PaddlePaddle
# internal format
input_types = [
integer_value_sequence(len(word_dict)) if options.seq else
sparse_binary_vector(len(word_dict)), integer_value(2)
]
converter = DataProviderConverter(input_types)
batch_size = trainer_config.opt_config.batch_size
trainer.startTrain()
for train_pass in xrange(options.num_passes):
trainer.startTrainPass()
random.shuffle(train_dataset)
for pos in xrange(0, len(train_dataset), batch_size):
batch = itertools.islice(train_dataset, pos, pos + batch_size)
size = min(batch_size, len(train_dataset) - pos)
trainer.trainOneDataBatch(size, converter(batch))
trainer.finishTrainPass()
if test_dataset:
trainer.startTestPeriod()
for pos in xrange(0, len(test_dataset), batch_size):
batch = itertools.islice(test_dataset, pos, pos + batch_size)
size = min(batch_size, len(test_dataset) - pos)
trainer.testOneDataBatch(size, converter(batch))
trainer.finishTestPeriod()
trainer.finishTrain()
if __name__ == '__main__':
main()
#!/bin/bash
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
# Note: if using trainer_config.emb.py, trainer_config.cnn.py
# or trainer_config.lstm.py, you need to change --seq to --seq=1
# because they are sequence models.
python api_train.py \
--config=trainer_config.lr.py \
--trainer_count=2 \
--num_passes=15 \
--use_gpu=0 \
--seq=0 \
--train_data=data/train.txt \
--test_data=data/test.txt \
--dict_file=data/dict.txt \
2>&1 | tee 'train.log'
......@@ -17,6 +17,7 @@ from paddle.trainer.PyDataProvider2 import *
# id of the word not in dictionary
UNK_IDX = 0
# initializer is called by the framework during initialization.
# It allows the user to describe the data types and setup the
# necessary data structure for later use.
......@@ -38,7 +39,9 @@ def initializer(settings, dictionary, **kwargs):
# The second input is an integer. It represents the category id of the
# sample. 2 means there are two labels in the dataset.
# (1 for positive and 0 for negative)
integer_value(2)]
integer_value(2)
]
# Delaring a data provider. It has an initializer 'data_initialzer'.
# It will cache the generated data of the first pass in memory, so that
......@@ -69,9 +72,8 @@ def process(settings, file_name):
def predict_initializer(settings, dictionary, **kwargs):
settings.word_dict = dictionary
settings.input_types = [
sparse_binary_vector(len(dictionary))
]
settings.input_types = [sparse_binary_vector(len(dictionary))]
# Declaring a data provider for prediction. The difference with process
# is that label is not generated.
......@@ -79,6 +81,6 @@ def predict_initializer(settings, dictionary, **kwargs):
def process_predict(settings, file_name):
with open(file_name, 'r') as f:
for line in f:
comment = line.strip()
comment = line.strip().split()
word_vector = [settings.word_dict.get(w, UNK_IDX) for w in comment]
yield word_vector
......@@ -16,6 +16,7 @@ from paddle.trainer.PyDataProvider2 import *
UNK_IDX = 0
def initializer(settings, dictionary, **kwargs):
settings.word_dict = dictionary
settings.input_types = [
......@@ -23,7 +24,8 @@ def initializer(settings, dictionary, **kwargs):
# The value of the integers range from 0 to len(dictrionary)-1
integer_value_sequence(len(dictionary)),
# Define the second input for label id
integer_value(2)]
integer_value(2)
]
@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
......@@ -39,7 +41,8 @@ def process(settings, file_name):
def predict_initializer(settings, dictionary, **kwargs):
settings.word_dict = dictionary
settings.input_types = [
integer_value(len(dictionary), seq_type=SequenceType.SEQUENCE)
integer_value(
len(dictionary), seq_type=SequenceType.SEQUENCE)
]
......@@ -47,6 +50,6 @@ def predict_initializer(settings, dictionary, **kwargs):
def process_predict(settings, file_name):
with open(file_name, 'r') as f:
for line in f:
comment = line.strip()
comment = line.strip().split()
word_slot = [settings.word_dict.get(w, UNK_IDX) for w in comment]
yield word_slot
......@@ -13,7 +13,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
1. (remove HTML before or not)tokensizing
2. pos sample : rating score 5; neg sample: rating score 1-2.
......@@ -35,7 +34,8 @@ import multiprocessing
batch_size = 5000
word_count = {}
num_tokenize = max(1, multiprocessing.cpu_count() - 2) # parse + tokenize + save
num_tokenize = max(1,
multiprocessing.cpu_count() - 2) # parse + tokenize + save
max_queue_size = 8
parse_queue = Queue(maxsize=max_queue_size + num_tokenize)
tokenize_queue = Queue(maxsize=max_queue_size + num_tokenize)
......
......@@ -20,13 +20,22 @@
set -e
export LC_ALL=C
UNAME_STR=`uname`
if [[ ${UNAME_STR} == 'Linux' ]]; then
SHUF_PROG='shuf'
else
SHUF_PROG='gshuf'
fi
mkdir -p data/tmp
python preprocess.py -i data/reviews_Electronics_5.json.gz
# uniq and shuffle
cd data/tmp
echo 'uniq and shuffle...'
cat pos_*|sort|uniq|shuf> pos.shuffed
cat neg_*|sort|uniq|shuf> neg.shuffed
cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed
cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed
min_len=`sed -n '$=' neg.shuffed`
test_num=$((min_len/10))
......@@ -40,8 +49,8 @@ head -n$train_num neg.shuffed >train.neg
tail -n$test_num pos.shuffed >test.pos
tail -n$test_num neg.shuffed >test.neg
cat train.pos train.neg|shuf>../train.txt
cat test.pos test.neg|shuf>../test.txt
cat train.pos train.neg | ${SHUF_PROG} >../train.txt
cat test.pos test.neg | ${SHUF_PROG} >../test.txt
cd -
echo 'data/train.txt' > data/train.list
......
......@@ -18,11 +18,14 @@ cfg=trainer_config.lr.py
#cfg=trainer_config.emb.py
#cfg=trainer_config.cnn.py
#cfg=trainer_config.lstm.py
#cfg=trainer_config.bidi-lstm.py
#cfg=trainer_config.db-lstm.py
#cfg=trainer_config.resnet-lstm.py
paddle train \
--config=$cfg \
--save_dir=./output \
--trainer_count=4 \
--log_period=20 \
--log_period=100 \
--num_passes=15 \
--use_gpu=false \
--show_parameter_stats_period=100 \
......
# edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
dict_file = "./data/dict.txt"
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
bias_attr = ParamAttr(initial_std=0., l2_rate=0.)
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
bi_lstm = bidirectional_lstm(input=emb, size=128)
dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
output = fc_layer(
input=dropout, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
outputs([maxid, output])
else:
label = data_layer(name="label", size=2)
cls = classification_cost(input=output, label=label)
outputs(cls)
......@@ -27,11 +27,12 @@ is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
......@@ -39,8 +40,7 @@ settings(
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
gradient_clipping_threshold=25)
data = data_layer(name="word", size=len(word_dict))
embedding = embedding_layer(input=data, size=128)
......
# edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
dict_file = "./data/dict.txt"
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
bias_attr = ParamAttr(initial_std=0., l2_rate=0.)
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
hidden_0 = mixed_layer(size=128, input=[full_matrix_projection(input=emb)])
lstm_0 = lstmemory(input=hidden_0, layer_attr=ExtraAttr(drop_rate=0.1))
input_layers = [hidden_0, lstm_0]
for i in range(1, 8):
fc = fc_layer(input=input_layers, size=128)
lstm = lstmemory(
input=fc,
layer_attr=ExtraAttr(drop_rate=0.1),
reverse=(i % 2) == 1, )
input_layers = [fc, lstm]
lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(
input=lstm_last, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
outputs([maxid, output])
else:
label = data_layer(name="label", size=2)
cls = classification_cost(input=output, label=label)
outputs(cls)
......@@ -27,18 +27,16 @@ is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer()
)
batch_size=batch_size, learning_rate=2e-3, learning_method=AdamOptimizer())
data = data_layer(name="word", size=len(word_dict))
embedding = embedding_layer(input=data, size=128)
......
......@@ -16,7 +16,7 @@
from paddle.trainer_config_helpers import *
dict_file = "./data/dict.txt"
dict_file = get_config_arg('dict_file', str, "./data/dict.txt")
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
......@@ -32,11 +32,12 @@ process = 'process' if not is_predict else 'process_predict'
# We need to use different process for training and prediction.
# For training, the input data includes both word IDs and labels.
# For prediction, the input data only includs word Ids.
define_py_data_sources2(train_list=trn,
test_list=tst,
module="dataprovider_bow",
obj=process,
args={"dictionary": word_dict})
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_bow",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
......@@ -44,8 +45,7 @@ settings(
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
gradient_clipping_threshold=25)
# Define the data for text features. The size of the data layer is the number
# of words in the dictionary.
......@@ -63,7 +63,6 @@ if not is_predict:
label = data_layer(name="label", size=2)
# Define cross-entropy classification loss and error.
classification_cost(input=output, label=label)
cls = classification_cost(input=output, label=label)
outputs(cls)
else:
......
......@@ -27,11 +27,12 @@ is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
......@@ -39,24 +40,14 @@ settings(
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
gradient_clipping_threshold=25)
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
fc = fc_layer(input=emb, size=512,
act=LinearActivation(),
bias_attr=bias_attr,
layer_attr=ExtraAttr(drop_rate=0.1))
lstm = lstmemory(input=fc, act=TanhActivation(),
bias_attr=bias_attr,
layer_attr=ExtraAttr(drop_rate=0.25))
lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(input=lstm_last, size=2,
bias_attr=bias_attr,
act=SoftmaxActivation())
lstm = simple_lstm(
input=emb, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.25))
lstm_max = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(input=lstm_max, size=2, act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
outputs([maxid, output])
......
# edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This configuration is a demonstration of how to implement the stacked LSTM
with residual connections, i.e. an LSTM layer takes the sum of the hidden states
and inputs of the previous LSTM layer instead of only the hidden states.
This architecture is from:
Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi,
Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey,
Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser,
Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens,
George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith, Jason Riesa,
Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes, Jeffrey Dean. 2016.
Google's Neural Machine Translation System: Bridging the Gap between Human and
Machine Translation. In arXiv https://arxiv.org/pdf/1609.08144v2.pdf
Different from the architecture described in the paper, we use a stack single
direction LSTM layers as the first layer instead of bi-directional LSTM. Also,
since this is a demo code, to reduce computation time, we stacked 4 layers
instead of 8 layers.
"""
from paddle.trainer_config_helpers import *
dict_file = "./data/dict.txt"
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
lstm = simple_lstm(input=emb, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.1))
previous_input, previous_hidden_state = emb, lstm
for i in range(3):
# The input to the current layer is the sum of the hidden state
# and input of the previous layer.
current_input = addto_layer(input=[previous_input, previous_hidden_state])
hidden_state = simple_lstm(input=current_input, size=128,
lstm_cell_attr=ExtraAttr(drop_rate=0.1))
previous_input, previous_hidden_state = current_input, hidden_state
lstm = previous_hidden_state
lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(input=lstm_last, size=2,
bias_attr=bias_attr,
act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
outputs([maxid, output])
else:
label = data_layer(name="label", size=2)
cls = classification_cost(input=output, label=label)
outputs(cls)
......@@ -21,8 +21,9 @@ def meta_to_header(meta, name):
yield integer_value(each_meta['max'])
elif each_meta['type'] == 'embedding':
is_seq = each_meta['seq'] == 'sequence'
yield integer_value(len(each_meta['dict']),
seq_type=SequenceType.SEQUENCE if is_seq
else SequenceType.NO_SEQUENCE)
yield integer_value(
len(each_meta['dict']),
seq_type=SequenceType.SEQUENCE
if is_seq else SequenceType.NO_SEQUENCE)
elif each_meta['type'] == 'one_hot_dense':
yield dense_vector(len(each_meta['dict']))
......@@ -14,4 +14,3 @@
"fields": ["id", "title", "genres"]
}
}
......@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
config_generator.py
......@@ -29,10 +28,7 @@ import json
import docopt
import copy
DEFAULT_FILE = {
"type": "split",
"delimiter": ","
}
DEFAULT_FILE = {"type": "split", "delimiter": ","}
DEFAULT_FIELD = {
"id": {
......@@ -107,19 +103,16 @@ def main(filename, fmt):
field = copy.deepcopy(DEFAULT_FIELD[field_key])
field['pos'] = pos
fields.append(field)
obj[k] = {
"file": file_dict,
"fields": fields
}
meta = {
"meta": obj
}
obj[k] = {"file": file_dict, "fields": fields}
meta = {"meta": obj}
# print meta
if fmt == 'json':
def formatter(x):
import json
return json.dumps(x, indent=2)
elif fmt == 'yaml':
def formatter(x):
import yaml
return yaml.safe_dump(x, default_flow_style=False)
......
......@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess Movielens dataset, to get movie/user object.
......@@ -66,8 +65,8 @@ class SortedIDGenerator(object):
self.__key_set__.add(key)
def finish_scan(self, compare=None, key=None, reverse=False):
self.__key_set__ = sorted(list(self.__key_set__), cmp=compare,
key=key, reverse=reverse)
self.__key_set__ = sorted(
list(self.__key_set__), cmp=compare, key=key, reverse=reverse)
self.dict = dict()
for idx, each_key in enumerate(self.__key_set__):
self.dict[each_key] = idx
......@@ -207,11 +206,10 @@ class EmbeddingFieldParser(object):
self.dict = EmbeddingFieldParser.CharBasedEmbeddingDict(
self.seq_type == EmbeddingFieldParser.SEQUENCE)
elif config['dict']['type'] == 'split':
self.dict = SplitEmbeddingDict(
config['dict'].get('delimiter', ','))
self.dict = SplitEmbeddingDict(config['dict'].get('delimiter', ','))
elif config['dict']['type'] == 'whole_content':
self.dict = EmbeddingFieldParser.WholeContentDict(
config['dict']['sort'])
self.dict = EmbeddingFieldParser.WholeContentDict(config['dict'][
'sort'])
else:
print config
assert False
......@@ -333,8 +331,8 @@ class ContentExtractorFactory(object):
return PositionContentExtractor(config['pos'])
else:
extra_args = config['regex']
return RegexPositionContentExtractor(pos=config['pos'],
**extra_args)
return RegexPositionContentExtractor(
pos=config['pos'], **extra_args)
class MetaFile(object):
......@@ -364,9 +362,10 @@ class MetaFile(object):
metas = map(lambda x: x.meta_field(), field_parsers)
# print metas
key_index = filter(lambda x: x is not None, map(
lambda (idx, meta): idx if 'is_key' in meta and meta['is_key']
else None, enumerate(metas)))[0]
key_index = filter(
lambda x: x is not None,
map(lambda (idx, meta): idx if 'is_key' in meta and meta['is_key'] else None,
enumerate(metas)))[0]
key_map = []
for i in range(min(key_index, len(metas))):
......@@ -374,12 +373,7 @@ class MetaFile(object):
for i in range(key_index + 1, len(metas)):
key_map.append(i)
obj = {
'__meta__': {
'raw_meta': metas,
'feature_map': key_map
}
}
obj = {'__meta__': {'raw_meta': metas, 'feature_map': key_map}}
for each_block in reader.read():
idx = field_parsers[key_index].parse(each_block)
......
......@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Separate movielens 1m dataset to train/test file.
......
......@@ -15,6 +15,7 @@
from paddle.trainer.PyDataProvider2 import *
import common_utils # parse
def hook(settings, meta, **kwargs):
"""
Init hook is invoked before process data. It will set obj.slots and store
......@@ -41,6 +42,7 @@ def hook(settings, meta, **kwargs):
settings.input_types = headers
settings.meta = meta
@provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, filename):
with open(filename, 'r') as f:
......
......@@ -28,7 +28,8 @@ if __name__ == '__main__':
model_path = sys.argv[1]
swig_paddle.initPaddle('--use_gpu=0')
conf = parse_config("trainer_config.py", "is_predict=1")
network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
assert isinstance(network, swig_paddle.GradientMachine)
network.loadParameters(model_path)
with open('./data/meta.bin', 'rb') as f:
......@@ -39,11 +40,12 @@ if __name__ == '__main__':
while True:
movie_id = int(raw_input("Input movie_id: "))
user_id = int(raw_input("Input user_id: "))
movie_meta = meta['movie'][movie_id] # Query Data From Meta.
movie_meta = meta['movie'][movie_id] # Query Data From Meta.
user_meta = meta['user'][user_id]
data = [movie_id - 1]
data.extend(movie_meta)
data.append(user_id - 1)
data.extend(user_meta)
print "Prediction Score is %.2f" % ((network.forwardTest(
cvt.convert([data]))[0]['value'][0][0] + 5) / 2)
print "Prediction Score is %.2f" % (
(network.forwardTest(cvt.convert([data]))[0]['value'][0][0] + 5)
/ 2)
......@@ -27,8 +27,8 @@ with open(META_FILE, 'rb') as f:
# load meta file
meta = pickle.load(f)
settings(batch_size=1600, learning_rate=1e-3,
learning_method=RMSPropOptimizer())
settings(
batch_size=1600, learning_rate=1e-3, learning_method=RMSPropOptimizer())
def construct_feature(name):
......@@ -59,11 +59,10 @@ def construct_feature(name):
slot_name = each_meta.get('name', '%s_id' % name)
if type_name == 'id':
slot_dim = each_meta['max']
embedding = embedding_layer(input=data_layer(slot_name,
size=slot_dim),
size=256)
fusion.append(fc_layer(input=embedding,
size=256))
embedding = embedding_layer(
input=data_layer(
slot_name, size=slot_dim), size=256)
fusion.append(fc_layer(input=embedding, size=256))
elif type_name == 'embedding':
is_seq = each_meta['seq'] == 'sequence'
slot_dim = len(each_meta['dict'])
......@@ -71,17 +70,14 @@ def construct_feature(name):
embedding = embedding_layer(input=din, size=256)
if is_seq:
fusion.append(
text_conv_pool(input=embedding, context_len=5,
hidden_size=256))
text_conv_pool(
input=embedding, context_len=5, hidden_size=256))
else:
fusion.append(fc_layer(input=embedding,
size=256))
fusion.append(fc_layer(input=embedding, size=256))
elif type_name == 'one_hot_dense':
slot_dim = len(each_meta['dict'])
hidden = fc_layer(input=data_layer(slot_name, slot_dim),
size=256)
fusion.append(fc_layer(input=hidden,
size=256))
hidden = fc_layer(input=data_layer(slot_name, slot_dim), size=256)
fusion.append(fc_layer(input=hidden, size=256))
return fc_layer(name="%s_fusion" % name, input=fusion, size=256)
......@@ -90,10 +86,16 @@ movie_feature = construct_feature("movie")
user_feature = construct_feature("user")
similarity = cos_sim(a=movie_feature, b=user_feature)
if not is_predict:
outputs(regression_cost(input=similarity,
label=data_layer('rating', size=1)))
define_py_data_sources2('data/train.list', 'data/test.list', module='dataprovider',
obj='process', args={'meta': meta})
outputs(
regression_cost(
input=similarity, label=data_layer(
'rating', size=1)))
define_py_data_sources2(
'data/train.list',
'data/test.list',
module='dataprovider',
obj='process',
args={'meta': meta})
else:
outputs(similarity)
*.pyc
train.log
data/feature
data/conll05st-release/
data/src.dict
data/test.wsj.props
data/test.wsj.seq_pair
data/test.wsj.words
data/tgt.dict
output
......@@ -26,9 +26,9 @@ def hook(settings, word_dict, label_dict, **kwargs):
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(2),
integer_value_sequence(len(label_dict))]
integer_value_sequence(len(word_dict)), integer_value_sequence(2),
integer_value_sequence(len(label_dict))
]
@provider(init_hook=hook)
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import os
import sys
......@@ -42,7 +41,7 @@ if not is_predict:
label_dict[w] = i
if is_test:
train_list_file = None
train_list_file = None
#define data provider
define_py_data_sources2(
......
......@@ -41,22 +41,16 @@ class Prediction():
len_dict = len(self.dict)
len_label = len(self.labels)
conf = parse_config(
train_conf,
'dict_len=' + str(len_dict) +
',label_len=' + str(len_label) +
',is_predict=True')
conf = parse_config(train_conf, 'dict_len=' + str(len_dict) +
',label_len=' + str(len_label) + ',is_predict=True')
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
self.network.loadParameters(model_dir)
slots = [
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(2)
integer_value_sequence(len_dict), integer_value_sequence(len_dict),
integer_value_sequence(len_dict), integer_value_sequence(len_dict),
integer_value_sequence(len_dict), integer_value_sequence(2)
]
self.converter = DataProviderConverter(slots)
......@@ -110,8 +104,8 @@ class Prediction():
len_sen = len(sen.split())
line_labels = lab[index:index + len_sen]
index += len_sen
fout.write(sen + '\t' + ' '.join([self.labels_reverse[
i] for i in line_labels]) + '\n')
fout.write(sen + '\t' + ' '.join(
[self.labels_reverse[i] for i in line_labels]) + '\n')
def option_parser():
......
......@@ -18,7 +18,7 @@ set -e
function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
sort | head -n 1
sort -n | head -n 1
}
log=train.log
......@@ -26,7 +26,6 @@ LOG=`get_best_pass $log`
LOG=(${LOG})
best_model_path="output/pass-${LOG[1]}"
config_file=db_lstm.py
dict_file=./data/src.dict
label_file=./data/tgt.dict
......
......@@ -18,7 +18,7 @@ set -e
function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
sort | head -n 1
sort -n | head -n 1
}
log=train.log
......@@ -36,5 +36,5 @@ paddle train \
--job=test \
--use_gpu=false \
--config_args=is_test=1 \
--test_all_data_in_one_period=1 \
2>&1 | tee 'test.log'
......@@ -24,4 +24,3 @@ paddle train \
--show_parameter_stats_period=10 \
--test_all_data_in_one_period=1 \
2>&1 | tee 'train.log'
......@@ -17,8 +17,8 @@ from paddle.trainer.PyDataProvider2 import *
def hook(settings, dictionary, **kwargs):
settings.word_dict = dictionary
settings.input_types = [
integer_value_sequence(len(settings.word_dict)),
integer_value(2)]
integer_value_sequence(len(settings.word_dict)), integer_value(2)
]
settings.logger.info('dict len : %d' % (len(settings.word_dict)))
......@@ -29,6 +29,7 @@ def process(settings, file_name):
label, comment = line.strip().split('\t\t')
label = int(label)
words = comment.split()
word_slot = [settings.word_dict[w] for w in words if w in
settings.word_dict]
word_slot = [
settings.word_dict[w] for w in words if w in settings.word_dict
]
yield word_slot, label
......@@ -18,14 +18,14 @@ from optparse import OptionParser
from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import integer_value_sequence
from paddle.trainer.config_parser import parse_config
"""
Usage: run following command to show help message.
python predict.py -h
"""
class SentimentPrediction():
def __init__(self, train_conf, dict_file, model_dir=None, label_file = None):
def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
"""
train_conf: trainer configure.
dict_file: word dictionary file name.
......@@ -44,10 +44,11 @@ class SentimentPrediction():
self.load_label(label_file)
conf = parse_config(train_conf, "is_predict=1")
self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
self.network.loadParameters(self.model_dir)
slots = [integer_value_sequence(self.dict_dim)]
self.converter = DataProviderConverter(slots)
input_types = [integer_value_sequence(self.dict_dim)]
self.converter = DataProviderConverter(input_types)
def load_dict(self):
"""
......@@ -61,7 +62,7 @@ class SentimentPrediction():
"""
Load label.
"""
self.label={}
self.label = {}
for v in open(label_file, 'r'):
self.label[int(v.split('\t')[1])] = v.split('\t')[0]
......@@ -72,7 +73,9 @@ class SentimentPrediction():
with open(data_file, 'r') as fdata:
for line in fdata:
words = line.strip().split()
word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
word_slot = [
self.word_dict[w] for w in words if w in self.word_dict
]
if not word_slot:
print "all words are not in dictionary: %s", line
continue
......@@ -89,25 +92,48 @@ class SentimentPrediction():
if self.label is None:
print("%s: predicting label is %d" % (data_file, lab[0][0]))
else:
print("%s: predicting label is %s" % (data_file, self.label[lab[0][0]]))
print("%s: predicting label is %s" %
(data_file, self.label[lab[0][0]]))
def option_parser():
usage = "python predict.py -n config -w model_dir -d dictionary -i input_file "
parser = OptionParser(usage="usage: %s [options]" % usage)
parser.add_option("-n", "--tconf", action="store",
dest="train_conf", help="network config")
parser.add_option("-d", "--dict", action="store",
dest="dict_file",help="dictionary file")
parser.add_option("-b", "--label", action="store",
dest="label", default=None,
help="dictionary file")
parser.add_option("-i", "--data", action="store",
dest="data", help="data file to predict")
parser.add_option("-w", "--model", action="store",
dest="model_path", default=None,
help="model path")
parser.add_option(
"-n",
"--tconf",
action="store",
dest="train_conf",
help="network config")
parser.add_option(
"-d",
"--dict",
action="store",
dest="dict_file",
help="dictionary file")
parser.add_option(
"-b",
"--label",
action="store",
dest="label",
default=None,
help="dictionary file")
parser.add_option(
"-i",
"--data",
action="store",
dest="data",
help="data file to predict")
parser.add_option(
"-w",
"--model",
action="store",
dest="model_path",
default=None,
help="model path")
return parser.parse_args()
def main():
options, args = option_parser()
train_conf = options.train_conf
......@@ -119,5 +145,6 @@ def main():
predict = SentimentPrediction(train_conf, dict_file, model_path, label)
predict.predict(data)
if __name__ == '__main__':
main()
......@@ -22,13 +22,13 @@ from os.path import join as join_path
from optparse import OptionParser
from paddle.utils.preprocess_util import *
"""
Usage: run following command to show help message.
python preprocess.py -h
"""
def save_dict(dict, filename, is_reverse = True):
def save_dict(dict, filename, is_reverse=True):
"""
Save dictionary into file.
dict: input dictionary.
......@@ -39,9 +39,10 @@ def save_dict(dict, filename, is_reverse = True):
f = open(filename, 'w')
for k, v in sorted(dict.items(), key=operator.itemgetter(1),\
reverse=is_reverse):
f.write('%s\t%s\n'%(k, v))
f.write('%s\t%s\n' % (k, v))
f.close()
def tokenize(sentences):
"""
Use tokenizer.perl to tokenize input sentences.
......@@ -58,6 +59,7 @@ def tokenize(sentences):
toks = tok_text.split('\n')[:-1]
return toks
def read_lines(path):
"""
path: String, file path.
......@@ -71,12 +73,17 @@ def read_lines(path):
seqs.append(line)
return seqs
class SentimentDataSetCreate():
"""
A class to process data for sentiment analysis task.
"""
def __init__(self, data_path, output_path,
use_okenizer = True, multi_lines = False):
def __init__(self,
data_path,
output_path,
use_okenizer=True,
multi_lines=False):
"""
data_path: string, traing and testing dataset path
output_path: string, output path, store processed dataset
......@@ -164,23 +171,17 @@ class SentimentDataSetCreate():
# Preprocess train data.
train_data, train_lab_set = self.data_list(self.train_dir)
print "processing train set..."
file_lists = self.save_data(train_data,
"train",
self.batch_size,
True,
True)
file_lists = self.save_data(train_data, "train", self.batch_size, True,
True)
save_list(file_lists, self.train_list)
# If have test data path, preprocess test data.
if os.path.exists(self.test_dir):
test_data, test_lab_set = self.data_list(self.test_dir)
assert(train_lab_set == test_lab_set)
assert (train_lab_set == test_lab_set)
print "processing test set..."
file_lists = self.save_data(test_data,
"test",
self.batch_size,
False,
self.dict_with_test)
file_lists = self.save_data(test_data, "test", self.batch_size,
False, self.dict_with_test)
save_list(file_lists, self.test_list)
# save labels set.
......@@ -191,7 +192,9 @@ class SentimentDataSetCreate():
save_dict(self.word_count, self.dict_file, True)
self.dict_size = len(self.word_count)
def save_data(self, data, prefix = "",
def save_data(self,
data,
prefix="",
batch_size=50000,
is_shuffle=False,
build_dict=False):
......@@ -205,7 +208,8 @@ class SentimentDataSetCreate():
return: list of batch names
"""
if is_shuffle and self.multi_lines:
return self.save_data_multi_lines(data, prefix, batch_size, build_dict)
return self.save_data_multi_lines(data, prefix, batch_size,
build_dict)
if is_shuffle:
random.shuffle(data)
......@@ -213,7 +217,7 @@ class SentimentDataSetCreate():
batch_names = []
for i in range(num_batches):
batch_name = join_path(self.output_path,
"%s_part_%03d" %(prefix, i))
"%s_part_%03d" % (prefix, i))
begin = i * batch_size
end = min((i + 1) * batch_size, len(data))
# read a batch of data
......@@ -246,7 +250,9 @@ class SentimentDataSetCreate():
data_list = tokenize(data_list)
return label_list, data_list
def save_data_multi_lines(self, data, prefix = "",
def save_data_multi_lines(self,
data,
prefix="",
batch_size=50000,
build_dict=False):
"""
......@@ -274,14 +280,14 @@ class SentimentDataSetCreate():
self.create_dict(data_list)
length = len(label_list)
perm_list = np.array([ i for i in xrange(length) ])
perm_list = np.array([i for i in xrange(length)])
random.shuffle(perm_list)
num_batches = int(math.ceil(length / float(batch_size)))
batch_names = []
for i in range(num_batches):
batch_name = join_path(self.output_path,
"%s_part_%03d" %(prefix, i))
"%s_part_%03d" % (prefix, i))
begin = i * batch_size
end = min((i + 1) * batch_size, length)
sub_label = [label_list[perm_list[i]] for i in range(begin, end)]
......@@ -304,35 +310,50 @@ class SentimentDataSetCreate():
f.write('%s\t\t%s\n' % (lab, seq))
f.close()
def option_parser():
parser = OptionParser(usage="usage: python preprcoess.py "\
"-i data_dir [options]")
parser.add_option("-i", "--data", action="store",
dest="input", help="Input data directory.")
parser.add_option("-o", "--output", action="store",
dest="output", default=None,
help="Output directory.")
parser.add_option("-t", "--tokenizer", action="store",
dest="use_tokenizer", default=True,
help="Whether to use tokenizer.")
parser.add_option(
"-i",
"--data",
action="store",
dest="input",
help="Input data directory.")
parser.add_option(
"-o",
"--output",
action="store",
dest="output",
default=None,
help="Output directory.")
parser.add_option(
"-t",
"--tokenizer",
action="store",
dest="use_tokenizer",
default=True,
help="Whether to use tokenizer.")
parser.add_option("-m", "--multi_lines", action="store",
dest="multi_lines", default=False,
help="If input text files have multi lines and they "\
"need to be shuffled, you should set -m True,")
return parser.parse_args()
def main():
options, args = option_parser()
data_dir=options.input
output_dir=options.output
use_tokenizer=options.use_tokenizer
multi_lines=options.multi_lines
data_dir = options.input
output_dir = options.output
use_tokenizer = options.use_tokenizer
multi_lines = options.multi_lines
if output_dir is None:
outname = os.path.basename(options.input)
output_dir = join_path(os.path.dirname(data_dir), 'pre-' + outname)
data_creator = SentimentDataSetCreate(data_dir, output_dir,
use_tokenizer, multi_lines)
data_creator = SentimentDataSetCreate(data_dir, output_dir, use_tokenizer,
multi_lines)
data_creator.create_dataset()
if __name__ == '__main__':
main()
......@@ -47,10 +47,12 @@ def sentiment_data(data_dir=None,
for i, line in enumerate(open(dict_file, 'r')):
word_dict[line.split('\t')[0]] = i
define_py_data_sources2(train_list, test_list,
module="dataprovider",
obj="process",
args={'dictionary': word_dict})
define_py_data_sources2(
train_list,
test_list,
module="dataprovider",
obj="process",
args={'dictionary': word_dict})
return dict_dim, class_dim
......@@ -64,8 +66,7 @@ def bidirectional_lstm_net(input_dim,
emb = embedding_layer(input=data, size=emb_dim)
bi_lstm = bidirectional_lstm(input=emb, size=lstm_dim)
dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
output = fc_layer(input=dropout, size=class_dim,
act=SoftmaxActivation())
output = fc_layer(input=dropout, size=class_dim, act=SoftmaxActivation())
if not is_predict:
lbl = data_layer("label", 1)
......@@ -109,27 +110,36 @@ def stacked_lstm_net(input_dim,
data = data_layer("word", input_dim)
emb = embedding_layer(input=data, size=emb_dim)
fc1 = fc_layer(input=emb, size=hid_dim, act=linear,
bias_attr=bias_attr)
lstm1 = lstmemory(input=fc1, act=relu, bias_attr=bias_attr,
layer_attr=layer_attr)
fc1 = fc_layer(input=emb, size=hid_dim, act=linear, bias_attr=bias_attr)
lstm1 = lstmemory(
input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
inputs = [fc1, lstm1]
for i in range(2, stacked_num + 1):
fc = fc_layer(input=inputs, size=hid_dim, act=linear,
param_attr=para_attr, bias_attr=bias_attr)
lstm = lstmemory(input=fc, reverse=(i % 2) == 0, act=relu,
bias_attr=bias_attr, layer_attr=layer_attr)
fc = fc_layer(
input=inputs,
size=hid_dim,
act=linear,
param_attr=para_attr,
bias_attr=bias_attr)
lstm = lstmemory(
input=fc,
reverse=(i % 2) == 0,
act=relu,
bias_attr=bias_attr,
layer_attr=layer_attr)
inputs = [fc, lstm]
fc_last = pooling_layer(input=inputs[0], pooling_type=MaxPooling())
lstm_last = pooling_layer(input=inputs[1], pooling_type=MaxPooling())
output = fc_layer(input=[fc_last, lstm_last], size=class_dim,
act=SoftmaxActivation(),
bias_attr=bias_attr, param_attr=para_attr)
output = fc_layer(
input=[fc_last, lstm_last],
size=class_dim,
act=SoftmaxActivation(),
bias_attr=bias_attr,
param_attr=para_attr)
if is_predict:
outputs(output)
else:
outputs(
classification_cost(input=output, label=data_layer('label', 1)))
outputs(classification_cost(input=output, label=data_layer('label', 1)))
......@@ -17,7 +17,7 @@ set -e
function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
sort | head -n 1
sort -n | head -n 1
}
log=train.log
......
......@@ -20,20 +20,19 @@ is_test = get_config_arg('is_test', bool, False)
# whether this config is used for prediction
is_predict = get_config_arg('is_predict', bool, False)
data_dir = "./data/pre-imdb"
data_dir = "./data/pre-imdb"
dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict)
################## Algorithm Config #####################
settings(
batch_size=128,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
batch_size=128,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
#################### Network Config ######################
stacked_lstm_net(dict_dim, class_dim=class_dim,
stacked_num=3, is_predict=is_predict)
stacked_lstm_net(
dict_dim, class_dim=class_dim, stacked_num=3, is_predict=is_predict)
# bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
......@@ -16,9 +16,7 @@ set -e
set -x
# download the in-house paraphrase dataset
# following is the google drive address
# you can also directly download from https://pan.baidu.com/s/1o8q577s
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/paraphrase.tar.gz --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/paraphrase.tar.gz
# untar the dataset
tar -zxvf paraphrase.tar.gz
......
......@@ -16,9 +16,7 @@ set -e
set -x
# download the pretrained model
# following is the google drive address
# you can also directly download from https://pan.baidu.com/s/1o8q577s
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/wmt14_model.tar.gz --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz
# untar the model
tar -zxvf wmt14_model.tar.gz
......
......@@ -30,14 +30,14 @@ def hook(settings, src_dict, trg_dict, file_list, **kwargs):
if settings.job_mode:
settings.trg_dict = trg_dict
settings.slots = [
integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(settings.trg_dict)),
integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(settings.trg_dict)),
integer_value_sequence(len(settings.trg_dict))
]
settings.logger.info("trg dict len : %d" % (len(settings.trg_dict)))
else:
settings.slots = [
integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(open(file_list[0], "r").readlines()))
]
......@@ -62,8 +62,7 @@ def process(settings, file_name):
if settings.job_mode:
trg_seq = line_split[1] # one target sequence
trg_words = trg_seq.split()
trg_ids = [settings.trg_dict.get(w, UNK_IDX)
for w in trg_words]
trg_ids = [settings.trg_dict.get(w, UNK_IDX) for w in trg_words]
# remove sequence whose length > 80 in training mode
if len(src_ids) > 80 or len(trg_ids) > 80:
......
......@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Example:
python preprocess.py -i INPUT [-d DICTSIZE] [-m]
......@@ -24,12 +23,13 @@ Options:
-m --mergeDict merge source and target dictionary
"""
import os
import sys
import sys
import string
from optparse import OptionParser
from paddle.utils.preprocess_util import save_list, DatasetCreater
class SeqToSeqDatasetCreater(DatasetCreater):
"""
A class to process data for sequence to sequence application.
......@@ -75,7 +75,7 @@ class SeqToSeqDatasetCreater(DatasetCreater):
if not os.path.exists(output):
os.system(cmd + '> ' + output)
def build_dict(self, file_path, dict_path, dict_size = -1):
def build_dict(self, file_path, dict_path, dict_size=-1):
"""
Create the dictionary for the file, Note that
1. Valid characters include all printable characters
......@@ -99,20 +99,23 @@ class SeqToSeqDatasetCreater(DatasetCreater):
for word in words:
if word not in dictory:
dictory[word] = 1
else:
else:
dictory[word] += 1
output = open(dict_path, "w+")
output.write('<s>\n<e>\n<unk>\n')
count = 3
for key, value in sorted(dictory.items(), key = lambda d:d[1], reverse = True):
for key, value in sorted(
dictory.items(), key=lambda d: d[1], reverse=True):
output.write(key + "\n")
count += 1
if count == dict_size:
break
self.dict_size = count
def create_dataset(self, dict_size = -1, mergeDict = False,
suffixes = ['.src', '.trg']):
def create_dataset(self,
dict_size=-1,
mergeDict=False,
suffixes=['.src', '.trg']):
"""
Create seqToseq dataset
"""
......@@ -135,13 +138,14 @@ class SeqToSeqDatasetCreater(DatasetCreater):
# checkout dataset should be parallel corpora
suffix_len = len(suffixes[0])
for dataset in dataset_list:
file_list = os.listdir(dataset)
if len(file_list) % 2 == 1:
raise RuntimeError("dataset should be parallel corpora")
file_list.sort()
for i in range(0, len(file_list), 2):
if file_list[i][:-suffix_len] != file_list[i + 1][:-suffix_len]:
raise RuntimeError("source and target file name should be equal")
file_list = os.listdir(dataset)
if len(file_list) % 2 == 1:
raise RuntimeError("dataset should be parallel corpora")
file_list.sort()
for i in range(0, len(file_list), 2):
if file_list[i][:-suffix_len] != file_list[i + 1][:-suffix_len]:
raise RuntimeError(
"source and target file name should be equal")
# cat all the files with the same suffix in dataset
for suffix in suffixes:
......@@ -155,16 +159,18 @@ class SeqToSeqDatasetCreater(DatasetCreater):
list = ['train.list', 'test.list', 'gen.list']
for dataset in dataset_list:
outname = os.path.basename(dataset)
self.concat_file(dataset, outname + suffixes[0],
self.concat_file(dataset, outname + suffixes[0],
outname + suffixes[1], dir_list[id], outname)
save_list([os.path.join(dir_list[id], outname)],
save_list([os.path.join(dir_list[id], outname)],
os.path.join(self.output_path, list[id]))
id += 1
# build dictionary for train data
dict = ['src.dict', 'trg.dict']
dict_path = [os.path.join(self.output_path, dict[0]),
os.path.join(self.output_path, dict[1])]
dict_path = [
os.path.join(self.output_path, dict[0]),
os.path.join(self.output_path, dict[1])
]
if mergeDict:
outname = os.path.join(train_dir, train_dataset.split('/')[-1])
print 'build src dictionary for train data'
......@@ -173,22 +179,30 @@ class SeqToSeqDatasetCreater(DatasetCreater):
os.system('cp ' + dict_path[0] + ' ' + dict_path[1])
else:
outname = os.path.join(train_dataset, self.train_dir_name)
for id in range(0,2):
for id in range(0, 2):
suffix = suffixes[id]
print 'build ' + suffix[1:] + ' dictionary for train data'
self.build_dict(outname + suffix, dict_path[id], dict_size)
print 'dictionary size is', self.dict_size
def main():
usage = "usage: \n" \
"python %prog -i INPUT [-d DICTSIZE] [-m]"
parser = OptionParser(usage)
parser.add_option("-i", action="store", dest="input",
help="input original dataset path")
parser.add_option("-d", action="store", dest="dictsize",
help="specified word count of dictionary")
parser.add_option("-m", "--mergeDict", action="store_true", dest="mergeDict",
help="merge source and target dictionary")
parser.add_option(
"-i", action="store", dest="input", help="input original dataset path")
parser.add_option(
"-d",
action="store",
dest="dictsize",
help="specified word count of dictionary")
parser.add_option(
"-m",
"--mergeDict",
action="store_true",
dest="mergeDict",
help="merge source and target dictionary")
(options, args) = parser.parse_args()
if options.input[-1] == os.path.sep:
options.input = options.input[:-1]
......@@ -200,5 +214,6 @@ def main():
data_creator = SeqToSeqDatasetCreater(options.input, output_path)
data_creator.create_dataset(dictsize, options.mergeDict)
if __name__ == "__main__":
main();
main()
......@@ -50,16 +50,21 @@ def seq_to_seq_data(data_dir,
trg_dict = None
else:
train_list = os.path.join(data_dir, train_list)
test_list = os.path.join(data_dir,test_list)
test_list = os.path.join(data_dir, test_list)
define_py_data_sources2(train_list, test_list,
module = "dataprovider",
obj = "process",
args = {"src_dict": src_dict,
"trg_dict": trg_dict})
define_py_data_sources2(
train_list,
test_list,
module="dataprovider",
obj="process",
args={"src_dict": src_dict,
"trg_dict": trg_dict})
return {"src_dict_path": src_lang_dict, "trg_dict_path": trg_lang_dict,
"gen_result": gen_result}
return {
"src_dict_path": src_lang_dict,
"trg_dict_path": trg_lang_dict,
"gen_result": gen_result
}
def gru_encoder_decoder(data_conf,
......@@ -90,51 +95,55 @@ def gru_encoder_decoder(data_conf,
size=word_vector_dim,
param_attr=ParamAttr(name='_source_language_embedding'))
src_forward = simple_gru(input=src_embedding, size=encoder_size)
src_backward = simple_gru(input=src_embedding,
size=encoder_size,
reverse=True)
src_backward = simple_gru(
input=src_embedding, size=encoder_size, reverse=True)
encoded_vector = concat_layer(input=[src_forward, src_backward])
with mixed_layer(size=decoder_size) as encoded_proj:
encoded_proj += full_matrix_projection(encoded_vector)
encoded_proj += full_matrix_projection(input=encoded_vector)
backward_first = first_seq(input=src_backward)
with mixed_layer(size=decoder_size,
act=TanhActivation(), ) as decoder_boot:
decoder_boot += full_matrix_projection(backward_first)
with mixed_layer(
size=decoder_size,
act=TanhActivation(), ) as decoder_boot:
decoder_boot += full_matrix_projection(input=backward_first)
def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
decoder_mem = memory(name='gru_decoder',
size=decoder_size,
boot_layer=decoder_boot)
decoder_mem = memory(
name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
context = simple_attention(encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem, )
context = simple_attention(
encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem, )
with mixed_layer(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += full_matrix_projection(context)
decoder_inputs += full_matrix_projection(current_word)
gru_step = gru_step_layer(name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
with mixed_layer(size=target_dict_dim,
bias_attr=True,
act=SoftmaxActivation()) as out:
decoder_inputs += full_matrix_projection(input=context)
decoder_inputs += full_matrix_projection(input=current_word)
gru_step = gru_step_layer(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
with mixed_layer(
size=target_dict_dim, bias_attr=True,
act=SoftmaxActivation()) as out:
out += full_matrix_projection(input=gru_step)
return out
decoder_group_name = "decoder_group"
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
group_inputs = [
StaticInput(
input=encoded_vector, is_seq=True), StaticInput(
input=encoded_proj, is_seq=True)
]
if not is_generating:
trg_embedding = embedding_layer(
input=data_layer(name='target_language_word',
size=target_dict_dim),
input=data_layer(
name='target_language_word', size=target_dict_dim),
size=word_vector_dim,
param_attr=ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
......@@ -144,12 +153,12 @@ def gru_encoder_decoder(data_conf,
# while encoded source sequence is accessed to as an unbounded memory.
# Here, the StaticInput defines a read-only memory
# for the recurrent_group.
decoder = recurrent_group(name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
decoder = recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
lbl = data_layer(name='target_language_next_word',
size=target_dict_dim)
lbl = data_layer(name='target_language_next_word', size=target_dict_dim)
cost = classification_cost(input=decoder, label=lbl)
outputs(cost)
else:
......@@ -168,16 +177,19 @@ def gru_encoder_decoder(data_conf,
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
beam_gen = beam_search(name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0,
eos_id=1,
beam_size=beam_size,
max_length=max_length)
seqtext_printer_evaluator(input=beam_gen,
id_input=data_layer(name="sent_id", size=1),
dict_file=trg_dict_path,
result_file=gen_trans_file)
beam_gen = beam_search(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0,
eos_id=1,
beam_size=beam_size,
max_length=max_length)
seqtext_printer_evaluator(
input=beam_gen,
id_input=data_layer(
name="sent_id", size=1),
dict_file=trg_dict_path,
result_file=gen_trans_file)
outputs(beam_gen)
#!/bin/bash
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd $DIR
wget http://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz
wget http://www.cnts.ua.ac.be/conll2000/chunking/test.txt.gz
此差异已折叠。
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
import math
define_py_data_sources2(
train_list="data/train.list",
test_list="data/test.list",
module="dataprovider",
obj="process")
batch_size = 1
settings(
learning_method=MomentumOptimizer(),
batch_size=batch_size,
regularization=L2Regularization(batch_size * 1e-4),
average_window=0.5,
learning_rate=1e-1,
learning_rate_decay_a=1e-5,
learning_rate_decay_b=0.25, )
num_label_types = 23
def get_simd_size(size):
return int(math.ceil(float(size) / 8)) * 8
# Currently, in order to use sparse_update=True,
# the size has to be aligned.
num_label_types = get_simd_size(num_label_types)
features = data_layer(name="features", size=76328)
word = data_layer(name="word", size=6778)
pos = data_layer(name="pos", size=44)
chunk = data_layer(name="chunk", size=num_label_types)
crf_input = fc_layer(
input=features,
size=num_label_types,
act=LinearActivation(),
bias_attr=False,
param_attr=ParamAttr(
initial_std=0, sparse_update=True))
crf = crf_layer(
input=crf_input,
label=chunk,
param_attr=ParamAttr(
name="crfw", initial_std=0), )
crf_decoding = crf_decoding_layer(
size=num_label_types,
input=crf_input,
label=chunk,
param_attr=ParamAttr(name="crfw"), )
sum_evaluator(
name="error",
input=crf_decoding, )
chunk_evaluator(
name="chunk_f1",
input=[crf_decoding, chunk],
chunk_scheme="IOB",
num_chunk_types=11, )
inputs(word, pos, chunk, features)
outputs(crf)
# Sequence Tagging
This demo is a sequence model for assigning tags to each token in a sentence. The task is described at <a href = "http://www.cnts.ua.ac.be/conll2000/chunking">CONLL2000 Text Chunking</a> task.
## Download data
```bash
cd demo/sequence_tagging
./data/get_data.sh
```
## Train model
```bash
cd demo/sequence_tagging
./train.sh
```
## Model description
We provide two models. One is a linear CRF model (linear_crf.py) with is equivalent to the one at <a href="http://leon.bottou.org/projects/sgd#stochastic_gradient_crfs">leon.bottou.org/projects/sgd</a>. The second one is a stacked bidirectional RNN and CRF model (rnn_crf.py).
<center>
<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
<thead>
<th scope="col" class="left">Model name</th>
<th scope="col" class="left">Number of parameters</th>
<th scope="col" class="left">F1 score</th>
</thead>
<tbody>
<tr>
<td class="left">linear_crf</td>
<td class="left"> 1.8M </td>
<td class="left"> 0.937</td>
</tr>
<tr>
<td class="left">rnn_crf</td>
<td class="left"> 960K </td>
<td class="left">0.941</td>
</tr>
</tbody>
</table>
</center>
<br>
此差异已折叠。
#!/bin/bash
paddle train \
--config rnn_crf.py \
--parallel_nn=1 \
--use_gpu=1 \
--dot_period=10 \
--log_period=1000 \
--test_period=0 \
--num_passes=10
#!/bin/bash
paddle train \
--config linear_crf.py \
--use_gpu=0 \
--dot_period=100 \
--log_period=10000 \
--test_period=0 \
--num_passes=10
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册