提交 05dc854c 编写于 作者: X xiemoyuan

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into optimize_error_message

...@@ -63,8 +63,29 @@ if(WIN32) ...@@ -63,8 +63,29 @@ if(WIN32)
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
# Rewrite every "/MD" occurrence to "/MT" in each C/C++ flag string (so "/MDd"
# becomes "/MTd" as well). Variables that do not mention "/MD" are left
# untouched.
foreach(crt_var IN ITEMS
    CMAKE_C_FLAGS
    CMAKE_C_FLAGS_DEBUG
    CMAKE_C_FLAGS_RELEASE
    CMAKE_C_FLAGS_MINSIZEREL
    CMAKE_C_FLAGS_RELWITHDEBINFO
    CMAKE_CXX_FLAGS
    CMAKE_CXX_FLAGS_DEBUG
    CMAKE_CXX_FLAGS_RELEASE
    CMAKE_CXX_FLAGS_MINSIZEREL
    CMAKE_CXX_FLAGS_RELWITHDEBINFO)
  # ${crt_var} expands to the variable *name*; if() dereferences it itself.
  if(${crt_var} MATCHES "/MD")
    # The pattern holds no regex metacharacters, so a literal replace suffices.
    string(REPLACE "/MD" "/MT" ${crt_var} "${${crt_var}}")
  endif()
endforeach()
endif() endif()
# Windows build: turn off compiler warnings.
# Step 1: replace any explicit /W1../W4 warning level with /W0 in every
# per-configuration C and C++ flag string.
foreach(warn_var IN ITEMS
    CMAKE_C_FLAGS
    CMAKE_C_FLAGS_DEBUG
    CMAKE_C_FLAGS_RELEASE
    CMAKE_C_FLAGS_MINSIZEREL
    CMAKE_C_FLAGS_RELWITHDEBINFO
    CMAKE_CXX_FLAGS
    CMAKE_CXX_FLAGS_DEBUG
    CMAKE_CXX_FLAGS_RELEASE
    CMAKE_CXX_FLAGS_MINSIZEREL
    CMAKE_CXX_FLAGS_RELWITHDEBINFO)
  string(REGEX REPLACE "/W[1-4]" " /W0 " ${warn_var} "${${warn_var}}")
endforeach()
# Step 2: append /w to the configuration-independent C++ and C flags.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /w")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /w")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")
message(STATUS "Using parallel compiling (/MP)") message(STATUS "Using parallel compiling (/MP)")
......
...@@ -22,23 +22,8 @@ SET(CRYPTOPP_TAG CRYPTOPP_8_2_0) ...@@ -22,23 +22,8 @@ SET(CRYPTOPP_TAG CRYPTOPP_8_2_0)
IF(WIN32) IF(WIN32)
SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib" CACHE FILEPATH "cryptopp library." FORCE) SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib" CACHE FILEPATH "cryptopp library." FORCE)
# Capture the current C++ flags for the cryptopp build.
# NOTE(review): this snapshot is taken *before* the /MD -> /MT rewrite below,
# so CRYPTOPP_CMAKE_CXX_FLAGS keeps whatever runtime switch CMAKE_CXX_FLAGS
# held at this point - confirm that is intended.
set(CRYPTOPP_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")

# Append the /MT (release) and /MTd (debug) runtime switches.
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")

# Flag variables whose "/MD" occurrences are rewritten to "/MT".
set(CompilerFlags
    CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
    CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE)
foreach(crt_var IN LISTS CompilerFlags)
  string(REPLACE "/MD" "/MT" ${crt_var} "${${crt_var}}")
endforeach()
ELSE(WIN32) ELSE(WIN32)
SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a" CACHE FILEPATH "cryptopp library." FORCE) SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a" CACHE FILEPATH "cryptopp library." FORCE)
SET(CRYPTOPP_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
ENDIF(WIN32) ENDIF(WIN32)
set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS} set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}
...@@ -48,7 +33,7 @@ set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS} ...@@ -48,7 +33,7 @@ set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}
-DCMAKE_INSTALL_LIBDIR=${CRYPTOPP_INSTALL_DIR}/lib -DCMAKE_INSTALL_LIBDIR=${CRYPTOPP_INSTALL_DIR}/lib
-DCMAKE_INSTALL_PREFIX=${CRYPTOPP_INSTALL_DIR} -DCMAKE_INSTALL_PREFIX=${CRYPTOPP_INSTALL_DIR}
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_CXX_FLAGS=${CRYPTOPP_CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
...@@ -90,20 +90,6 @@ macro(safe_set_nvflag flag_name) ...@@ -90,20 +90,6 @@ macro(safe_set_nvflag flag_name)
endif() endif()
endmacro() endmacro()
# safe_set_static_flag()
#
# Rewrites "/MD" to "/MT" (and therefore "/MDd" to "/MTd") in every C and C++
# flag variable of the calling scope. When BUILD_SHARED_LIBS is true the flags
# are left as they are.
#
# Takes no arguments. Being a macro, it modifies the caller's variables
# directly.
macro(safe_set_static_flag)
  # Guard with if() instead of return(): a macro body is expanded in the
  # caller's scope, so return() here would leave the *calling* function or
  # file and silently skip everything after the macro invocation.
  if(NOT BUILD_SHARED_LIBS)
    foreach(flag_var
        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
      if(${flag_var} MATCHES "/MD")
        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
      endif()
    endforeach()
  endif()
endmacro()
CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS) CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
if(NOT UINT64_MAX_EXISTS) if(NOT UINT64_MAX_EXISTS)
...@@ -229,20 +215,3 @@ endforeach() ...@@ -229,20 +215,3 @@ endforeach()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")
if(WIN32)
  # Apply the static-CRT flag rewrite first when MSVC_STATIC_CRT is set.
  if(MSVC_STATIC_CRT)
    safe_set_static_flag()
  endif()

  # Windows build: turn off warnings.
  # Replace any explicit /W1../W4 level with /W0 in every flag string ...
  foreach(warn_var IN ITEMS
      CMAKE_C_FLAGS
      CMAKE_C_FLAGS_DEBUG
      CMAKE_C_FLAGS_RELEASE
      CMAKE_C_FLAGS_MINSIZEREL
      CMAKE_C_FLAGS_RELWITHDEBINFO
      CMAKE_CXX_FLAGS
      CMAKE_CXX_FLAGS_DEBUG
      CMAKE_CXX_FLAGS_RELEASE
      CMAKE_CXX_FLAGS_MINSIZEREL
      CMAKE_CXX_FLAGS_RELWITHDEBINFO)
    string(REGEX REPLACE "/W[1-4]" " /W0 " ${warn_var} "${${warn_var}}")
  endforeach()
  # ... then append /w to the configuration-independent C++ and C flags.
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /w")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /w")
endif()
...@@ -24,7 +24,7 @@ set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_d ...@@ -24,7 +24,7 @@ set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_d
# so the generation of static lib is temporarily turned off. # so the generation of static lib is temporarily turned off.
if(WIN32) if(WIN32)
#todo: remove the option #todo: remove the option
option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." OFF) option(WITH_STATIC_LIB "Compile demo with static/shared library, default use dynamic." OFF)
if(NOT PYTHON_EXECUTABLE) if(NOT PYTHON_EXECUTABLE)
FIND_PACKAGE(PythonInterp REQUIRED) FIND_PACKAGE(PythonInterp REQUIRED)
endif() endif()
...@@ -165,25 +165,22 @@ copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR}) ...@@ -165,25 +165,22 @@ copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR})
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
if(WIN32) if(WIN32)
if(WITH_STATIC_LIB) if(WITH_STATIC_LIB)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib) set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib
${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.*)
else() else()
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.dll set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.dll
${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.lib) ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.lib)
endif() endif()
copy(inference_lib_dist
SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
else(WIN32) else(WIN32)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*) set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
endif(WIN32) copy(inference_lib_dist
if(WIN32 AND NOT WITH_STATIC_LIB)
copy(inference_lib_dist
SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
else()
copy(inference_lib_dist
SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib} SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
endif() endif(WIN32)
copy(inference_lib_dist copy(inference_lib_dist
SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
...@@ -211,12 +208,12 @@ add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps}) ...@@ -211,12 +208,12 @@ add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps})
set(dst_dir "${PADDLE_INSTALL_DIR}/paddle/fluid") set(dst_dir "${PADDLE_INSTALL_DIR}/paddle/fluid")
set(module "inference") set(module "inference")
if(WIN32 AND NOT WITH_STATIC_LIB) if(WIN32)
copy(fluid_lib_dist copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib} SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
) )
else() else()
copy(fluid_lib_dist copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib} SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
......
...@@ -127,7 +127,8 @@ function(op_library TARGET) ...@@ -127,7 +127,8 @@ function(op_library TARGET)
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
"sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op" "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op") "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
"fused_bn_add_activation_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}") if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1) set(pybind_flag 1)
endif() endif()
......
...@@ -44,10 +44,11 @@ add_subdirectory(api) ...@@ -44,10 +44,11 @@ add_subdirectory(api)
set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor
zero_copy_tensor reset_tensor_array zero_copy_tensor reset_tensor_array
analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg}) analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg})
if(WIN32) # TODO(xingzhaolong, jiweibo): remove this and create_static_lib(paddle_fluid) on windows GPU
if(WIN32 AND WITH_GPU)
cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_API}) cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_API})
else() else()
create_static_lib(paddle_fluid ${fluid_modules} ${STATIC_INFERENCE_API}) create_static_lib(paddle_fluid ${fluid_modules} ${STATIC_INFERENCE_API})
endif() endif()
if(NOT APPLE AND NOT WIN32) if(NOT APPLE AND NOT WIN32)
......
...@@ -1048,6 +1048,7 @@ void AnalysisPredictor::SaveOptimModel(const std::string &dir) { ...@@ -1048,6 +1048,7 @@ void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
template <> template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<AnalysisConfig>( std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<AnalysisConfig>(
const AnalysisConfig &config) { const AnalysisConfig &config) {
LOG(WARNING) << "Deprecated. Please use CreatePredictor instead.";
return CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>( return CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config); config);
} }
......
...@@ -373,6 +373,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -373,6 +373,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
template <> template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<NativeConfig>( std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<NativeConfig>(
const NativeConfig &config) { const NativeConfig &config) {
LOG(WARNING) << "Deprecated. Please use CreatePredictor instead.";
return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
} }
......
...@@ -51,8 +51,8 @@ if (WIN32) ...@@ -51,8 +51,8 @@ if (WIN32)
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
safe_set_static_flag()
if (WITH_STATIC_LIB) if (WITH_STATIC_LIB)
safe_set_static_flag()
add_definitions(-DSTATIC_LIB) add_definitions(-DSTATIC_LIB)
endif() endif()
endif() endif()
...@@ -136,7 +136,7 @@ else() ...@@ -136,7 +136,7 @@ else()
set(DEPS ${DEPS} set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB} ${MATH_LIB} ${MKLDNN_LIB}
glog gflags_static libprotobuf xxhash ${EXTERNAL_LIB}) glog gflags_static libprotobuf xxhash ${EXTERNAL_LIB})
set(DEPS ${DEPS} libcmt shlwapi.lib) set(DEPS ${DEPS} shlwapi.lib)
endif(NOT WIN32) endif(NOT WIN32)
if(WITH_GPU) if(WITH_GPU)
......
...@@ -6,7 +6,7 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode ...@@ -6,7 +6,7 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
DATA_DIR=$4 # dataset DATA_DIR=$4 # dataset
TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, default to /usr/local/TensorRT/include TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, default to /usr/local/TensorRT/include
TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib
MSVC_STATIC_CRT=$7
inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir
cd `dirname $0` cd `dirname $0`
...@@ -66,43 +66,54 @@ mkdir -p build ...@@ -66,43 +66,54 @@ mkdir -p build
cd build cd build
rm -rf * rm -rf *
if [ $(echo `uname` | grep "Win") != "" ]; then for WITH_STATIC_LIB in ON OFF; do
# -----simple_on_word2vec on windows----- if [ $(echo `uname` | grep "Win") != "" ]; then
cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ # TODO(xingzhaolong, jiweibo): remove this if windows GPU library is ready.
-DWITH_MKL=$TURN_ON_MKL \ if [ $TEST_GPU_CPU == ON] && [ $WITH_STATIC_LIB ==ON ]; then
-DDEMO_NAME=simple_on_word2vec \ return 0
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=OFF
msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
Release/simple_on_word2vec.exe \
--dirname=$DATA_DIR/word2vec/word2vec.inference.model \
--use_gpu=False
if [ $? -ne 0 ]; then
echo "simple_on_word2vec demo runs fail."
exit 1
fi
# -----vis_demo on windows-----
rm -rf *
cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
-DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=vis_demo \
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=OFF
msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
for vis_demo_name in $vis_demo_list; do
Release/vis_demo.exe \
--modeldir=$DATA_DIR/$vis_demo_name/model \
--data=$DATA_DIR/$vis_demo_name/data.txt \
--refer=$DATA_DIR/$vis_demo_name/result.txt \
--use_gpu=False
if [ $? -ne 0 ]; then
echo "vis demo $vis_demo_name runs fail."
exit 1
fi fi
done
else # -----simple_on_word2vec on windows-----
for WITH_STATIC_LIB in ON OFF; do cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
-DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=simple_on_word2vec \
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-DMSVC_STATIC_CRT=$MSVC_STATIC_CRT
msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
for use_gpu in $use_gpu_list; do
Release/simple_on_word2vec.exe \
--dirname=$DATA_DIR/word2vec/word2vec.inference.model \
--use_gpu=$use_gpu
if [ $? -ne 0 ]; then
echo "simple_on_word2vec demo runs fail."
exit 1
fi
done
# -----vis_demo on windows-----
rm -rf *
cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
-DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=vis_demo \
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-DMSVC_STATIC_CRT=$MSVC_STATIC_CRT
msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
for use_gpu in $use_gpu_list; do
for vis_demo_name in $vis_demo_list; do
Release/vis_demo.exe \
--modeldir=$DATA_DIR/$vis_demo_name/model \
--data=$DATA_DIR/$vis_demo_name/data.txt \
--refer=$DATA_DIR/$vis_demo_name/result.txt \
--use_gpu=$use_gpu
if [ $? -ne 0 ]; then
echo "vis demo $vis_demo_name runs fail."
exit 1
fi
done
done
else
# -----simple_on_word2vec on linux/mac----- # -----simple_on_word2vec on linux/mac-----
rm -rf * rm -rf *
cmake .. -DPADDLE_LIB=${inference_install_dir} \ cmake .. -DPADDLE_LIB=${inference_install_dir} \
...@@ -123,7 +134,6 @@ else ...@@ -123,7 +134,6 @@ else
fi fi
done done
fi fi
# ---------vis_demo on linux/mac--------- # ---------vis_demo on linux/mac---------
rm -rf * rm -rf *
cmake .. -DPADDLE_LIB=${inference_install_dir} \ cmake .. -DPADDLE_LIB=${inference_install_dir} \
...@@ -145,7 +155,6 @@ else ...@@ -145,7 +155,6 @@ else
fi fi
done done
done done
# --------tensorrt mobilenet on linux/mac------ # --------tensorrt mobilenet on linux/mac------
if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then
rm -rf * rm -rf *
...@@ -167,6 +176,6 @@ else ...@@ -167,6 +176,6 @@ else
exit 1 exit 1
fi fi
fi fi
done fi
fi done
set +x set +x
...@@ -17,11 +17,7 @@ ...@@ -17,11 +17,7 @@
#if defined(_WIN32) #if defined(_WIN32)
#ifndef PD_INFER_DECL #ifndef PD_INFER_DECL
#ifdef PADDLE_DLL_INFERENCE #ifdef PADDLE_DLL_INFERENCE
#ifndef PADDLE_ON_INFERENCE
#define PD_INFER_DECL
#else
#define PD_INFER_DECL __declspec(dllexport) #define PD_INFER_DECL __declspec(dllexport)
#endif // PADDLE_ON_INFERENCE
#else #else
#define PD_INFER_DECL __declspec(dllimport) #define PD_INFER_DECL __declspec(dllimport)
#endif // PADDLE_DLL_INFERENCE #endif // PADDLE_DLL_INFERENCE
......
...@@ -131,7 +131,9 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config, ...@@ -131,7 +131,9 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config,
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
input_names.size(), in_size, input_names.size(), in_size,
paddle::platform::errors::InvalidArgument( paddle::platform::errors::InvalidArgument(
"The number of input and the number of model's input must match.")); "The number of input and the number of model's input must match. The "
"number of input is %d, the number of model's input is %d.",
input_names.size(), in_size));
for (int i = 0; i < in_size; ++i) { for (int i = 0; i < in_size; ++i) {
auto input_t = predictor->GetInputTensor(inputs[i].name); auto input_t = predictor->GetInputTensor(inputs[i].name);
std::vector<int> tensor_shape; std::vector<int> tensor_shape;
......
...@@ -47,7 +47,9 @@ void Init(const std::vector<std::string> argv) { ...@@ -47,7 +47,9 @@ void Init(const std::vector<std::string> argv) {
void ReadBinaryFile(const std::string& filename, std::string* contents) { void ReadBinaryFile(const std::string& filename, std::string* contents) {
std::ifstream fin(filename, std::ios::in | std::ios::binary); std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename); PADDLE_ENFORCE_EQ(
fin.is_open(), true,
platform::errors::Unavailable("Failed to open file %s.", filename));
fin.seekg(0, std::ios::end); fin.seekg(0, std::ios::end);
contents->clear(); contents->clear();
contents->resize(fin.tellg()); contents->resize(fin.tellg());
...@@ -133,9 +135,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor, ...@@ -133,9 +135,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
std::unique_ptr<framework::ProgramDesc> main_program( std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str)); new framework::ProgramDesc(program_desc_str));
PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), PADDLE_ENFORCE_EQ(
"model version %ld is not supported.", framework::IsProgramVersionSupported(main_program->Version()), true,
main_program->Version()); platform::errors::Unavailable("Model version %ld is not supported.",
main_program->Version()));
// model_from_memory is false in separate parameters. // model_from_memory is false in separate parameters.
LoadPersistables(executor, scope, *main_program, dirname, "", LoadPersistables(executor, scope, *main_program, dirname, "",
...@@ -151,9 +154,10 @@ std::unique_ptr<framework::ProgramDesc> Load( ...@@ -151,9 +154,10 @@ std::unique_ptr<framework::ProgramDesc> Load(
std::unique_ptr<framework::ProgramDesc> main_program( std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str)); new framework::ProgramDesc(program_desc_str));
PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), PADDLE_ENFORCE_EQ(
"model version %ld is not supported.", framework::IsProgramVersionSupported(main_program->Version()), true,
main_program->Version()); platform::errors::Unavailable("Model version %ld is not supported.",
main_program->Version()));
LoadPersistables(executor, scope, *main_program, "", param_filename, LoadPersistables(executor, scope, *main_program, "", param_filename,
false /* model_from_memory */); false /* model_from_memory */);
...@@ -165,9 +169,10 @@ std::unique_ptr<framework::ProgramDesc> LoadFromMemory( ...@@ -165,9 +169,10 @@ std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
const std::string& prog_buffer, const std::string& param_buffer) { const std::string& prog_buffer, const std::string& param_buffer) {
std::unique_ptr<framework::ProgramDesc> main_program( std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(prog_buffer)); new framework::ProgramDesc(prog_buffer));
PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), PADDLE_ENFORCE_EQ(
"model version %ld is not supported.", framework::IsProgramVersionSupported(main_program->Version()), true,
main_program->Version()); platform::errors::Unavailable("Model version %ld is not supported.",
main_program->Version()));
LoadPersistables(executor, scope, *main_program, "", param_buffer, LoadPersistables(executor, scope, *main_program, "", param_buffer,
true /* model_filename */); true /* model_filename */);
......
...@@ -27,8 +27,8 @@ PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name, ...@@ -27,8 +27,8 @@ PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
Has(plugin_type), true, Has(plugin_type), true,
platform::errors::NotFound( platform::errors::NotFound("TensorRT plugin type `%s` does not exists.",
"trt plugin type %s does not exists, check it.", plugin_type)); plugin_type));
auto plugin = plugin_registry_[plugin_type](serial_data, serial_length); auto plugin = plugin_registry_[plugin_type](serial_data, serial_length);
owned_plugins_.emplace_back(plugin); owned_plugins_.emplace_back(plugin);
......
...@@ -103,12 +103,11 @@ struct Serializer<std::vector<T>, ...@@ -103,12 +103,11 @@ struct Serializer<std::vector<T>,
DeserializeValue(buffer, buffer_size, &size); DeserializeValue(buffer, buffer_size, &size);
value->resize(size); value->resize(size);
size_t nbyte = value->size() * sizeof(T); size_t nbyte = value->size() * sizeof(T);
PADDLE_ENFORCE_GE( PADDLE_ENFORCE_GE(*buffer_size, nbyte,
*buffer_size, nbyte, platform::errors::InvalidArgument(
platform::errors::InvalidArgument("Expect buffer size >= value size in " "Insufficient data in buffer, expect contains %d "
"trt plugin deserialization, but got " "byte, but actually only contains %d byte.",
"buffer size = %d, value size = %d.", *buffer_size, nbyte));
*buffer_size, nbyte));
std::memcpy(value->data(), *buffer, nbyte); std::memcpy(value->data(), *buffer, nbyte);
reinterpret_cast<char const*&>(*buffer) += nbyte; reinterpret_cast<char const*&>(*buffer) += nbyte;
*buffer_size -= nbyte; *buffer_size -= nbyte;
......
...@@ -46,7 +46,9 @@ struct Registry { ...@@ -46,7 +46,9 @@ struct Registry {
template <typename ItemChild> template <typename ItemChild>
void Register(const std::string& name) { void Register(const std::string& name) {
PADDLE_ENFORCE_EQ(items_.count(name), 0); PADDLE_ENFORCE_EQ(items_.count(name), 0,
platform::errors::AlreadyExists(
"Item `%s` has beed registered.", name));
items_[name] = new ItemChild; items_[name] = new ItemChild;
} }
......
...@@ -69,12 +69,18 @@ class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -69,12 +69,18 @@ class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<float>("alpha", "The scale of Original Embedding.") AddAttr<float>("alpha", "The scale of Original Embedding.")
.SetDefault(1.0f) .SetDefault(1.0f)
.AddCustomChecker([](const float& alpha) { .AddCustomChecker([](const float& alpha) {
PADDLE_ENFORCE(alpha >= 0.0f, "'alpha' must be above 0.0."); PADDLE_ENFORCE_GE(
alpha, 0.0f,
platform::errors::InvalidArgument(
"Attribute 'alpha' must be greater than or equal to 0.0."));
}); });
AddAttr<float>("beta", "The scale of Position Embedding.") AddAttr<float>("beta", "The scale of Position Embedding.")
.SetDefault(1.0f) .SetDefault(1.0f)
.AddCustomChecker([](const float& beta) { .AddCustomChecker([](const float& beta) {
PADDLE_ENFORCE(beta >= 0.0f, "'beta' must be between 0.0."); PADDLE_ENFORCE_GE(
beta, 0.0f,
platform::errors::InvalidArgument(
"Attribute 'beta' must be greater than or equal to 0.0."));
}); });
AddComment(R"DOC( AddComment(R"DOC(
Add Position Encoding Operator. Add Position Encoding Operator.
......
...@@ -76,7 +76,10 @@ class AssignValueKernel : public framework::OpKernel<T> { ...@@ -76,7 +76,10 @@ class AssignValueKernel : public framework::OpKernel<T> {
value_name = "int64_values"; value_name = "int64_values";
break; break;
default: default:
PADDLE_THROW("Unsupported dtype for assign_value_op: %d", dtype); PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported data type(code %d) for AssignValue operator, only "
"supports bool, int32, float32 and int64.",
dtype));
break; break;
} }
CopyVecotorToTensor<T>(value_name, out, ctx); CopyVecotorToTensor<T>(value_name, out, ctx);
......
...@@ -33,29 +33,37 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> { ...@@ -33,29 +33,37 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
auto out_vars = context.MultiOutputVar("Output"); auto out_vars = context.MultiOutputVar("Output");
PADDLE_ENFORCE_GT(in_var_names.size(), static_cast<size_t>(0), PADDLE_ENFORCE_GT(in_var_names.size(), static_cast<size_t>(0),
"The CoalesceTensorOp has no input."); platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ( "The CoalesceTensor operator has no input."));
in_var_names.size(), out_var_names.size(), PADDLE_ENFORCE_EQ(in_var_names.size(), out_var_names.size(),
"The number of CoalesceTensorOp's input and output is not match."); platform::errors::InvalidArgument(
"The number of CoalesceTensor operator's input and "
"output is not match, "
"input number is %u, output number is %u.",
in_var_names.size(), out_var_names.size()));
// Input & Output check: only support LoDTensor // Input & Output check: only support LoDTensor
for (size_t i = 0; i < in_var_names.size(); ++i) { for (size_t i = 0; i < in_var_names.size(); ++i) {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
in_vars[i], in_vars[i],
"The input variable %s of CoalesceTensorOp does not exist.", platform::errors::NotFound("The input variable %s of CoalesceTensor "
in_var_names[i]); "operator does not exist.",
in_var_names[i]));
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
out_vars[i], out_vars[i],
"The output variable %s of CoalesceTensorOp does not exist.", platform::errors::NotFound("The output variable %s of CoalesceTensor "
out_var_names[i]); "operator does not exist.",
PADDLE_ENFORCE_EQ( out_var_names[i]));
in_vars[i]->IsType<framework::LoDTensor>(), true, PADDLE_ENFORCE_EQ(in_vars[i]->IsType<framework::LoDTensor>(), true,
"The input variable %s of CoalesceTensorOp is not LoDTensor.", platform::errors::InvalidArgument(
in_var_names[i]); "The input variable %s of CoalesceTensor operator "
PADDLE_ENFORCE_EQ( "is not LoDTensor.",
out_vars[i]->IsType<framework::LoDTensor>(), true, in_var_names[i]));
"The output variable %s of CoalesceTensorOp is not LoDTensor.", PADDLE_ENFORCE_EQ(out_vars[i]->IsType<framework::LoDTensor>(), true,
in_var_names[i]); platform::errors::InvalidArgument(
"The output variable %s of CoalesceTensor operator "
"is not LoDTensor.",
in_var_names[i]));
} }
auto in_tensors = context.MultiInput<framework::LoDTensor>("Input"); auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
...@@ -64,7 +72,10 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> { ...@@ -64,7 +72,10 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
for (size_t i = 0; i < in_var_names.size(); ++i) { for (size_t i = 0; i < in_var_names.size(); ++i) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_names[i], out_var_names[i], in_var_names[i], out_var_names[i],
"The input and output variable of CoalesceTensorOp is different."); platform::errors::InvalidArgument(
"The input and output variable of CoalesceTensor operator is "
"different, %dth input is %s, %dth output is %s.",
i, in_var_names[i], i, out_var_names[i]));
} }
} else { } else {
// Init the output as input // Init the output as input
...@@ -134,16 +145,25 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> { ...@@ -134,16 +145,25 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
const std::vector<const framework::LoDTensor *> &lod_tensors, const std::vector<const framework::LoDTensor *> &lod_tensors,
const std::vector<std::string> var_names, size_t *numel, const std::vector<std::string> var_names, size_t *numel,
const size_t &size_of_dtype, const platform::Place &place) const { const size_t &size_of_dtype, const platform::Place &place) const {
PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size()); PADDLE_ENFORCE_EQ(
lod_tensors.size(), var_names.size(),
platform::errors::InvalidArgument(
"The number of input tensor and variable does not match, the "
"number of input tensor is %u, the number of input variable is %u.",
lod_tensors.size(), var_names.size()));
*numel = 0; *numel = 0;
std::stringstream ss; std::stringstream ss;
ss << "alloc_space_for_vars: "; ss << "alloc_space_for_vars: ";
for (size_t i = 0; i < var_names.size(); ++i) { for (size_t i = 0; i < var_names.size(); ++i) {
PADDLE_ENFORCE_EQ(lod_tensors[i]->IsInitialized(), true, PADDLE_ENFORCE_EQ(lod_tensors[i]->IsInitialized(), true,
"%s is not initialized.", var_names[i]); platform::errors::InvalidArgument(
"Tensor `%s` is not initialized.", var_names[i]));
auto size = lod_tensors[i]->numel(); auto size = lod_tensors[i]->numel();
PADDLE_ENFORCE_GT(size, 0); PADDLE_ENFORCE_GT(
size, 0,
platform::errors::InvalidArgument(
"The number of tensor `%s`'s elements is 0.", var_names[i]));
ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims() ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
<< ") " << ") "
<< " addres:" << lod_tensors[i]->data<void>() << ", "; << " addres:" << lod_tensors[i]->data<void>() << ", ";
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/concat_op.h" #include "paddle/fluid/operators/concat_op.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -78,7 +79,8 @@ class ConcatOp : public framework::OperatorWithKernel { ...@@ -78,7 +79,8 @@ class ConcatOp : public framework::OperatorWithKernel {
} }
} }
if (flag == 0) { if (flag == 0) {
PADDLE_THROW("All Inputs of Concat OP are Empty!"); PADDLE_THROW(platform::errors::InvalidArgument(
"All Inputs of Concat OP are Empty!"));
} }
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
if (platform::CanMKLDNNBeUsed(ctx)) { if (platform::CanMKLDNNBeUsed(ctx)) {
......
...@@ -162,7 +162,20 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> { ...@@ -162,7 +162,20 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
workspace_size = GetWorkspaceSize(args, algo); workspace_size = GetWorkspaceSize(args, algo);
if (workspace_size > workspace_size_limit) { if (workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
workspace_size_limit = workspace_size; workspace_size_limit = workspace_size;
#else
VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<< workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm(
args.handle, args.idesc.desc(), args.wdesc.desc(),
args.cdesc.desc(), args.odesc.desc(),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
#endif
} }
#else #else
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
...@@ -291,8 +304,23 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> { ...@@ -291,8 +304,23 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
#endif #endif
workspace_size = GetWorkspaceSize(args, algo); workspace_size = GetWorkspaceSize(args, algo);
if (workspace_size > workspace_size_limit) { if (workspace_size > workspace_size_limit) {
workspace_size_limit = workspace_size;
has_got_workspace_size = false; has_got_workspace_size = false;
#if CUDNN_VERSION >= 8000
// There is no cudnnGetConvolutionBackwardDataAlgorithm in CUDNN 8
// version.
workspace_size_limit = workspace_size;
#else
VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<< workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
args.handle, args.wdesc.desc(), args.odesc.desc(),
args.cdesc.desc(), args.idesc.desc(),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
#endif
} }
#else #else
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
......
...@@ -45,10 +45,8 @@ class DequantizeMaxAbsOp : public framework::OperatorWithKernel { ...@@ -45,10 +45,8 @@ class DequantizeMaxAbsOp : public framework::OperatorWithKernel {
: OperatorWithKernel(type, inputs, outputs, attrs) {} : OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DequantizeMaxAbs");
"Input(X) of DequantizeMaxAbsOp should not be null."); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DequantizeMaxAbs");
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
"Output(Out) of DequantizeMaxAbsOp should not be null.");
ctx->ShareDim("X", /*->*/ "Out"); ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
......
...@@ -532,7 +532,8 @@ static int count_contours(polygon_node *polygon) { ...@@ -532,7 +532,8 @@ static int count_contours(polygon_node *polygon) {
} }
static void add_left(polygon_node *p, double x, double y) { static void add_left(polygon_node *p, double x, double y) {
PADDLE_ENFORCE_NOT_NULL(p); PADDLE_ENFORCE_NOT_NULL(p, paddle::platform::errors::InvalidArgument(
"Input polygon node is nullptr."));
vertex_node *nv = NULL; vertex_node *nv = NULL;
/* Create a new vertex node and set its fields */ /* Create a new vertex node and set its fields */
...@@ -588,7 +589,8 @@ static void add_right(polygon_node *p, double x, double y) { ...@@ -588,7 +589,8 @@ static void add_right(polygon_node *p, double x, double y) {
} }
static void merge_right(polygon_node *p, polygon_node *q, polygon_node *list) { static void merge_right(polygon_node *p, polygon_node *q, polygon_node *list) {
PADDLE_ENFORCE_NOT_NULL(p); PADDLE_ENFORCE_NOT_NULL(p, paddle::platform::errors::InvalidArgument(
"Input polygon node is nullptr."));
polygon_node *target = NULL; polygon_node *target = NULL;
/* Label contour as external */ /* Label contour as external */
...@@ -664,7 +666,8 @@ void add_vertex(vertex_node **t, double x, double y) { ...@@ -664,7 +666,8 @@ void add_vertex(vertex_node **t, double x, double y) {
} }
void gpc_vertex_create(edge_node *e, int p, int s, double x, double y) { void gpc_vertex_create(edge_node *e, int p, int s, double x, double y) {
PADDLE_ENFORCE_NOT_NULL(e); PADDLE_ENFORCE_NOT_NULL(e, paddle::platform::errors::InvalidArgument(
"Input edge node is nullptr."));
add_vertex(&(e->outp[p]->v[s]), x, y); add_vertex(&(e->outp[p]->v[s]), x, y);
e->outp[p]->active++; e->outp[p]->active++;
} }
...@@ -693,7 +696,8 @@ static bbox *create_contour_bboxes(gpc_polygon *p) { ...@@ -693,7 +696,8 @@ static bbox *create_contour_bboxes(gpc_polygon *p) {
gpc_malloc<bbox>(box, p->num_contours * sizeof(bbox), gpc_malloc<bbox>(box, p->num_contours * sizeof(bbox),
const_cast<char *>("Bounding box creation")); const_cast<char *>("Bounding box creation"));
PADDLE_ENFORCE_NOT_NULL(box); PADDLE_ENFORCE_NOT_NULL(box, paddle::platform::errors::ResourceExhausted(
"Failed to malloc box memory."));
/* Construct contour bounding boxes */ /* Construct contour bounding boxes */
for (c = 0; c < p->num_contours; c++) { for (c = 0; c < p->num_contours; c++) {
...@@ -857,7 +861,9 @@ void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) { ...@@ -857,7 +861,9 @@ void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) {
/* Create an extended hole array */ /* Create an extended hole array */
gpc_malloc<int>(extended_hole, (p->num_contours + 1) * sizeof(int), gpc_malloc<int>(extended_hole, (p->num_contours + 1) * sizeof(int),
const_cast<char *>("contour hole addition")); const_cast<char *>("contour hole addition"));
PADDLE_ENFORCE_NOT_NULL(extended_hole); PADDLE_ENFORCE_NOT_NULL(extended_hole,
paddle::platform::errors::ResourceExhausted(
"Failed to malloc extended hole memory."));
/* Create an extended contour array */ /* Create an extended contour array */
gpc_malloc<gpc_vertex_list>(extended_contour, gpc_malloc<gpc_vertex_list>(extended_contour,
...@@ -975,7 +981,9 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, ...@@ -975,7 +981,9 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
/* Build scanbeam table from scanbeam tree */ /* Build scanbeam table from scanbeam tree */
gpc_malloc<double>(sbt, sbt_entries * sizeof(double), gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
const_cast<char *>("sbt creation")); const_cast<char *>("sbt creation"));
PADDLE_ENFORCE_NOT_NULL(sbt); PADDLE_ENFORCE_NOT_NULL(sbt, paddle::platform::errors::ResourceExhausted(
"Failed to malloc scanbeam table memory."));
build_sbt(&scanbeam, sbt, sbtree); build_sbt(&scanbeam, sbt, sbtree);
scanbeam = 0; scanbeam = 0;
free_sbtree(&sbtree); free_sbtree(&sbtree);
...@@ -1017,7 +1025,9 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, ...@@ -1017,7 +1025,9 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
e0 = aet; e0 = aet;
e1 = aet; e1 = aet;
/* Set up bundle fields of first edge */ /* Set up bundle fields of first edge */
PADDLE_ENFORCE_NOT_NULL(aet); PADDLE_ENFORCE_NOT_NULL(aet, paddle::platform::errors::InvalidArgument(
"Edge node AET is nullptr."));
aet->bundle[ABOVE][aet->type] = (aet->top.y != yb); aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
aet->bundle[ABOVE][!aet->type] = 0; aet->bundle[ABOVE][!aet->type] = 0;
aet->bstate[ABOVE] = UNBUNDLED; aet->bstate[ABOVE] = UNBUNDLED;
...@@ -1612,7 +1622,8 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, ...@@ -1612,7 +1622,8 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
/* Build scanbeam table from scanbeam tree */ /* Build scanbeam table from scanbeam tree */
gpc_malloc<double>(sbt, sbt_entries * sizeof(double), gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
const_cast<char *>("sbt creation")); const_cast<char *>("sbt creation"));
PADDLE_ENFORCE_NOT_NULL(sbt); PADDLE_ENFORCE_NOT_NULL(sbt, paddle::platform::errors::ResourceExhausted(
"Failed to malloc scanbeam table memory."));
build_sbt(&scanbeam, sbt, sbtree); build_sbt(&scanbeam, sbt, sbtree);
scanbeam = 0; scanbeam = 0;
free_sbtree(&sbtree); free_sbtree(&sbtree);
...@@ -1650,7 +1661,8 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, ...@@ -1650,7 +1661,8 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
e1 = aet; e1 = aet;
/* Set up bundle fields of first edge */ /* Set up bundle fields of first edge */
PADDLE_ENFORCE_NOT_NULL(aet); PADDLE_ENFORCE_NOT_NULL(aet, paddle::platform::errors::InvalidArgument(
"Edge node AET is nullptr."));
aet->bundle[ABOVE][aet->type] = (aet->top.y != yb); aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
aet->bundle[ABOVE][!aet->type] = 0; aet->bundle[ABOVE][!aet->type] = 0;
aet->bstate[ABOVE] = UNBUNDLED; aet->bstate[ABOVE] = UNBUNDLED;
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <tuple>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/diag_v2_op.h" #include "paddle/fluid/operators/diag_v2_op.h"
...@@ -58,6 +59,17 @@ class DiagV2CUDAKernel : public framework::OpKernel<T> { ...@@ -58,6 +59,17 @@ class DiagV2CUDAKernel : public framework::OpKernel<T> {
auto out_dims = out->dims(); auto out_dims = out->dims();
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
// Computes the launch configuration for a 1-D kernel that processes `size`
// elements and returns it as a (block_size, grid_size) tuple.
// `size` must be positive: block_size is used as a divisor below, and both
// visible call sites only invoke this lambda inside `if (size > 0)`.
auto GetBlockGridSize = [&dev_ctx](int64_t size) {
// Threads per block: one per element, capped by the device's
// GetMaxThreadsPerBlock() value.
const int64_t block_size =
std::min(size, static_cast<int64_t>(dev_ctx.GetMaxThreadsPerBlock()));
int64_t max_threads = dev_ctx.GetMaxPhysicalThreadCount();
// Upper bound on blocks: ceil(max_threads / block_size), but never below 1.
const int64_t max_blocks = std::max(((max_threads - 1) / block_size + 1),
static_cast<int64_t>(1));
// Blocks needed to cover `size` elements (ceil division), capped by
// max_blocks.
const int64_t grid_size =
std::min(max_blocks, (size + block_size - 1) / block_size);
return std::tuple<int64_t, int64_t>{block_size, grid_size};
};
if (x_dims.size() == 1) { if (x_dims.size() == 1) {
float padding_value = context.Attr<float>("padding_value"); float padding_value = context.Attr<float>("padding_value");
math::SetConstant<DeviceContext, T> set_padding_value; math::SetConstant<DeviceContext, T> set_padding_value;
...@@ -67,26 +79,23 @@ class DiagV2CUDAKernel : public framework::OpKernel<T> { ...@@ -67,26 +79,23 @@ class DiagV2CUDAKernel : public framework::OpKernel<T> {
auto size = (offset > 0) ? x_length + offset : x_length - offset; auto size = (offset > 0) ? x_length + offset : x_length - offset;
const int& x_stride = ComputeStride(0, x_dims); const int& x_stride = ComputeStride(0, x_dims);
if (size > 0) { if (size > 0) {
const int block_num = std::min(static_cast<int>(size),
dev_ctx.GetMaxPhysicalThreadCount());
int size_ = static_cast<int>(size);
int block_num_ = static_cast<int>(block_num);
const int grid_num =
std::min(1024, (size_ + block_num_ - 1) / block_num_);
const auto& out_stride_0 = ComputeStride(0, out_dims); const auto& out_stride_0 = ComputeStride(0, out_dims);
const auto& out_stride_1 = ComputeStride(1, out_dims); const auto& out_stride_1 = ComputeStride(1, out_dims);
auto start = auto start =
(offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0); (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0);
PasteDiagonalKernel<T><<<grid_num, block_num, 0, dev_ctx.stream()>>>( std::tuple<int64_t, int64_t> block_grid_size = GetBlockGridSize(size);
out_data, x_data, start, x_length, out_stride_0 + out_stride_1,
x_stride); PasteDiagonalKernel<
T><<<std::get<1>(block_grid_size), std::get<0>(block_grid_size), 0,
dev_ctx.stream()>>>(out_data, x_data, start, x_length,
out_stride_0 + out_stride_1, x_stride);
} }
} else { } else {
const int& x_stride_0 = ComputeStride(0, x_dims); const int& x_stride_0 = ComputeStride(0, x_dims);
const int& x_stride_1 = ComputeStride(1, x_dims); const int& x_stride_1 = ComputeStride(1, x_dims);
int size; int64_t size;
if (offset > 0) { if (offset > 0) {
size = std::min(x_dims[0], x_dims[1] - offset); size = std::min(x_dims[0], x_dims[1] - offset);
} else { } else {
...@@ -94,18 +103,15 @@ class DiagV2CUDAKernel : public framework::OpKernel<T> { ...@@ -94,18 +103,15 @@ class DiagV2CUDAKernel : public framework::OpKernel<T> {
} }
if (size > 0) { if (size > 0) {
const int block_num = std::min(static_cast<int>(size),
dev_ctx.GetMaxPhysicalThreadCount());
int size_ = static_cast<int>(size);
int block_num_ = static_cast<int>(block_num);
const int grid_num =
std::min(1024, (size_ + block_num_ - 1) / block_num_);
auto start = (offset >= 0 ? offset * x_stride_1 : -offset * x_stride_0); auto start = (offset >= 0 ? offset * x_stride_1 : -offset * x_stride_0);
const auto& out_stride_0 = ComputeStride(0, out_dims); const auto& out_stride_0 = ComputeStride(0, out_dims);
ExtractDiagonalKernel<T><<<grid_num, block_num, 0, dev_ctx.stream()>>>( std::tuple<int64_t, int64_t> block_grid_size = GetBlockGridSize(size);
out_data, x_data, start, size, x_stride_0 + x_stride_1,
out_stride_0); ExtractDiagonalKernel<
T><<<std::get<1>(block_grid_size), std::get<0>(block_grid_size), 0,
dev_ctx.stream()>>>(out_data, x_data, start, size,
x_stride_0 + x_stride_1, out_stride_0);
} }
} }
} }
......
...@@ -74,8 +74,12 @@ void AsyncCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, ...@@ -74,8 +74,12 @@ void AsyncCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx,
} else { } else {
recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_));
} }
InitParams();
} }
// Parameter initialization hook invoked at the end of InitImpl(); it simply
// delegates to RecvNoBarrier().
void AsyncCommunicator::InitParams() {
  RecvNoBarrier();
}
AsyncCommunicator::~AsyncCommunicator() { AsyncCommunicator::~AsyncCommunicator() {
running_ = false; running_ = false;
if (main_thread_) main_thread_->join(); if (main_thread_) main_thread_->join();
...@@ -157,16 +161,18 @@ void AsyncCommunicator::MainThread() { ...@@ -157,16 +161,18 @@ void AsyncCommunicator::MainThread() {
} }
while (running_) { while (running_) {
int meet = Meet(); int batches = BatchesCounter();
VLOG(1) << "async_meet: " << meet; if (batches > 0) {
SendGlobalStep(batches);
SendGlobalStep(meet); SendByCommunicator(batches);
SendByCommunicator(meet); BarrierSend();
BarrierSend(); RecvByCommunicator();
RecvByCommunicator(); BarrierRecv();
BarrierRecv(); BarrierWeakUp();
BarrierWeakUp(); } else {
VLOG(1) << "get nothing from sending queue, will skip send/recv";
}
} }
VLOG(1) << "communicator stopped, send thread exit"; VLOG(1) << "communicator stopped, send thread exit";
} }
...@@ -187,7 +193,7 @@ void AsyncCommunicator::RecvNoBarrier() { ...@@ -187,7 +193,7 @@ void AsyncCommunicator::RecvNoBarrier() {
auto &var_name = iter.first; auto &var_name = iter.first;
VLOG(4) << "recv var " << var_name; VLOG(4) << "recv var " << var_name;
auto recv_functor = distributed::ParameterRecv<float>(); auto recv_functor = distributed::ParameterRecv<float>();
recv_functor(iter.second, *recv_scope_, false); recv_functor(iter.second, *recv_scope_);
}; };
task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task))); task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task)));
} }
...@@ -197,7 +203,7 @@ void AsyncCommunicator::RecvNoBarrier() { ...@@ -197,7 +203,7 @@ void AsyncCommunicator::RecvNoBarrier() {
} }
} }
int AsyncCommunicator::Meet() { int AsyncCommunicator::BatchesCounter() {
auto &step_queue = send_varname_to_queue_.at(STEP_COUNTER); auto &step_queue = send_varname_to_queue_.at(STEP_COUNTER);
size_t merged_var_num = 0; size_t merged_var_num = 0;
...@@ -316,7 +322,7 @@ void HalfAsyncCommunicator::Clean() { ...@@ -316,7 +322,7 @@ void HalfAsyncCommunicator::Clean() {
} }
} }
int HalfAsyncCommunicator::Meet() { int HalfAsyncCommunicator::BatchesCounter() {
while (running_) { while (running_) {
if (barrier_counter_.load() >= barrier_trigger_.load() && if (barrier_counter_.load() >= barrier_trigger_.load() &&
barrier_trigger_.load() != 0) { barrier_trigger_.load() != 0) {
...@@ -443,7 +449,7 @@ void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, ...@@ -443,7 +449,7 @@ void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx,
old_scope_.reset(new Scope()); old_scope_.reset(new Scope());
pserver_scope_.reset(new Scope()); pserver_scope_.reset(new Scope());
Init(); InitParams();
} }
void GeoCommunicator::Send(const std::vector<std::string> &var_names, void GeoCommunicator::Send(const std::vector<std::string> &var_names,
...@@ -626,9 +632,7 @@ void GeoCommunicator::RecvByCommunicator() { ...@@ -626,9 +632,7 @@ void GeoCommunicator::RecvByCommunicator() {
if (recv_ctx.is_sparse) { if (recv_ctx.is_sparse) {
RecvSparse(var_name); RecvSparse(var_name);
} else { } else {
VLOG(1) << "recv dense " << var_name << " begin";
RecvDense(var_name); RecvDense(var_name);
VLOG(1) << "recv dense " << var_name << " done";
} }
}; };
tasks.emplace_back(send_threadpool_->enqueue(std::move(recv_task))); tasks.emplace_back(send_threadpool_->enqueue(std::move(recv_task)));
...@@ -696,7 +700,7 @@ void GeoCommunicator::RecvDense(const std::string &varname) { ...@@ -696,7 +700,7 @@ void GeoCommunicator::RecvDense(const std::string &varname) {
auto &ctx = recv_varname_to_ctx_.at(varname); auto &ctx = recv_varname_to_ctx_.at(varname);
auto recv = distributed::ParameterRecv<float>(); auto recv = distributed::ParameterRecv<float>();
recv(ctx, *pserver_scope_, true); recv(ctx, *pserver_scope_);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
var_psrever->IsInitialized(), true, var_psrever->IsInitialized(), true,
...@@ -721,7 +725,7 @@ void GeoCommunicator::RecvDense(const std::string &varname) { ...@@ -721,7 +725,7 @@ void GeoCommunicator::RecvDense(const std::string &varname) {
t_timestamp->data<float>()); t_timestamp->data<float>());
} }
void GeoCommunicator::Init() { void GeoCommunicator::InitParams() {
std::vector<std::future<void>> tasks; std::vector<std::future<void>> tasks;
tasks.reserve(recv_varname_to_ctx_.size()); tasks.reserve(recv_varname_to_ctx_.size());
...@@ -744,12 +748,17 @@ void GeoCommunicator::Init() { ...@@ -744,12 +748,17 @@ void GeoCommunicator::Init() {
} }
void GeoCommunicator::InitDense(const std::string varname) { void GeoCommunicator::InitDense(const std::string varname) {
auto *var = old_scope_->Var(varname);
var->GetMutable<framework::LoDTensor>();
auto &ctx = recv_varname_to_ctx_.at(varname); auto &ctx = recv_varname_to_ctx_.at(varname);
auto recv = distributed::ParameterRecv<float>(); auto recv = distributed::ParameterRecv<float>();
recv(ctx, *old_scope_); recv(ctx, *recv_scope_);
auto *global_var = recv_scope_->FindVar(varname);
global_var->GetMutable<framework::LoDTensor>();
auto *old_var = old_scope_->Var(varname);
old_var->GetMutable<framework::LoDTensor>();
framework::CopyVariable(*global_var, old_var);
VLOG(1) << "init dense variable " << varname << " done"; VLOG(1) << "init dense variable " << varname << " done";
} }
...@@ -781,22 +790,41 @@ void GeoCommunicator::InitSparse() { ...@@ -781,22 +790,41 @@ void GeoCommunicator::InitSparse() {
LargeScaleKV::Init(metas); LargeScaleKV::Init(metas);
for (size_t i = 0; i < metas.size(); i++) { for (auto &meta : metas) {
auto &varname = metas[i].name; auto &ctx = recv_varname_to_ctx_.at(meta.name);
auto &dict = dicts[i]; auto recv = distributed::ParameterRecv<float>();
std::vector<int64_t> ids; auto *global_var = recv_scope_->FindVar(meta.name);
ids.reserve(dict); auto global_value = global_var->Get<framework::LoDTensor>();
auto rows = global_value.dims()[0];
auto dim1 = global_value.dims()[1];
for (auto j = 0; j < dict; ++j) { recv(ctx, *recv_scope_);
ids.push_back(j); VLOG(1) << "recv " << meta.name << " with global scope for init";
}
auto n_rows = global_var->Get<framework::LoDTensor>().dims()[0];
PADDLE_ENFORCE_EQ(
rows, n_rows,
platform::errors::InvalidArgument(
"global var: %s origin dim must equal recved rows", meta.name));
std::vector<int64_t> ids(rows);
std::iota(ids.begin(), ids.end(), 0);
auto *ins = distributed::LargeScaleKV::GetInstance(); auto *ins = distributed::LargeScaleKV::GetInstance();
ins->Get(varname)->Init(ids); std::vector<std::vector<std::vector<float> *>> values;
ins->Get(meta.name)->Init(ids);
ins->Get(meta.name)->Get(ids, {"Param"}, &values);
VLOG(3) << "GeoCommunicator init sparse " << varname << " with size " auto blas = math::GetBlas<platform::CPUDeviceContext, float>(
<< ids.size(); paddle::platform::CPUDeviceContext());
for (auto &id : ids) {
blas.VCOPY(dim1, global_value.data<float>() + id * dim1,
values[id][0]->data());
}
} }
VLOG(3) << "init sparse variable done"; VLOG(3) << "init sparse variable done";
......
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include <deque> #include <deque>
#include <map> #include <map>
#include <memory> #include <memory>
#include <numeric>
#include <set> #include <set>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
...@@ -29,6 +30,7 @@ limitations under the License. */ ...@@ -29,6 +30,7 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/distributed/communicator_common.h" #include "paddle/fluid/operators/distributed/communicator_common.h"
#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/large_scale_kv.h" #include "paddle/fluid/operators/distributed/large_scale_kv.h"
...@@ -279,6 +281,8 @@ class AsyncCommunicator : public Communicator { ...@@ -279,6 +281,8 @@ class AsyncCommunicator : public Communicator {
const RpcCtxMap &recv_varname_to_ctx, const RpcCtxMap &recv_varname_to_ctx,
Scope *recv_scope) override; Scope *recv_scope) override;
void InitParams();
void MainThread(); void MainThread();
void Send(const std::vector<std::string> &var_names, void Send(const std::vector<std::string> &var_names,
...@@ -293,7 +297,7 @@ class AsyncCommunicator : public Communicator { ...@@ -293,7 +297,7 @@ class AsyncCommunicator : public Communicator {
virtual void RecvNoBarrier(); virtual void RecvNoBarrier();
virtual int Meet(); virtual int BatchesCounter();
virtual void BarrierSend() {} virtual void BarrierSend() {}
...@@ -350,7 +354,7 @@ class HalfAsyncCommunicator : public AsyncCommunicator { ...@@ -350,7 +354,7 @@ class HalfAsyncCommunicator : public AsyncCommunicator {
void BarrierTriggerReset(int initial_val) override; void BarrierTriggerReset(int initial_val) override;
int Meet(); int BatchesCounter();
void BarrierWeakUp(); void BarrierWeakUp();
...@@ -435,7 +439,7 @@ class GeoCommunicator : public AsyncCommunicator { ...@@ -435,7 +439,7 @@ class GeoCommunicator : public AsyncCommunicator {
void RecvDense(const std::string &varname); void RecvDense(const std::string &varname);
void Init(); void InitParams();
void InitSparse(); void InitSparse();
......
...@@ -41,8 +41,67 @@ using SelectedRows = framework::SelectedRows; ...@@ -41,8 +41,67 @@ using SelectedRows = framework::SelectedRows;
using DDim = framework::DDim; using DDim = framework::DDim;
template <typename T> template <typename T>
void RecvSelectedRows(const CommContext &rpc_ctx, void RecvSparseLodTensor(const CommContext &rpc_ctx,
const framework::Scope &scope) { const framework::Scope &scope) {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto cpu_place = platform::CPUPlace();
auto &cpu_ctx = *pool.Get(cpu_place);
distributed::RPCClient *rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);
std::unique_ptr<framework::Scope> local_scope = scope.NewTmpScope();
std::vector<const float *> tensors;
std::vector<distributed::VarHandlePtr> rets;
for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) {
auto &recv_var_name = rpc_ctx.splited_varnames[i];
auto *local_var = local_scope->Var(recv_var_name);
VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i];
// sparse param in recv_scope is LoDTensor
rets.push_back(rpc_client->AsyncGetVarNoBarrier(
rpc_ctx.epmap[i], cpu_ctx, *local_scope.get(), recv_var_name,
recv_var_name));
const auto *value = local_var->Get<framework::LoDTensor>().data<float>();
tensors.push_back(value);
}
for (size_t i = 0; i < rets.size(); i++) {
PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout(
"internal error in RPCClient"));
}
auto *merged_var = scope.FindVar(rpc_ctx.var_name);
if (merged_var == nullptr || !merged_var->IsInitialized()) {
PADDLE_THROW(
platform::errors::InvalidArgument("%s must initialized at first."));
}
auto dims1 = merged_var->Get<framework::LoDTensor>().dims()[1];
int64_t height = 0;
for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) {
auto *splited_var = local_scope->FindVar(rpc_ctx.splited_varnames[i]);
height += splited_var->Get<framework::LoDTensor>().dims()[0];
}
PADDLE_ENFORCE_EQ(merged_var->Get<framework::LoDTensor>().dims()[0], height,
"recved var must has same dims with local var");
auto *merged_t = merged_var->GetMutable<framework::LoDTensor>();
auto *merged_d = merged_t->mutable_data<float>(cpu_place);
auto pserver_num = rpc_ctx.splited_varnames.size();
for (int x = 0; x < height; ++x) {
auto id = x % pserver_num;
auto idx = x / pserver_num;
std::memcpy(merged_d + x * dims1, tensors[id] + idx * dims1,
sizeof(float) * dims1);
}
}
template <typename T>
void RecvGeoSparseRecords(const CommContext &rpc_ctx,
const framework::Scope &scope) {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto cpu_place = platform::CPUPlace(); auto cpu_place = platform::CPUPlace();
auto &cpu_ctx = *pool.Get(cpu_place); auto &cpu_ctx = *pool.Get(cpu_place);
...@@ -84,9 +143,14 @@ void RecvSelectedRows(const CommContext &rpc_ctx, ...@@ -84,9 +143,14 @@ void RecvSelectedRows(const CommContext &rpc_ctx,
ids_num += recv_t.rows().size(); ids_num += recv_t.rows().size();
width = recv_t.value().dims()[1]; width = recv_t.value().dims()[1];
std::transform(recv_t.rows().begin(), recv_t.rows().end(), if (rpc_ctx.is_distributed) {
std::back_inserter(all_ids), std::copy(recv_t.rows().begin(), recv_t.rows().end(),
[&](int64_t id) { return id * pserver_num + i; }); std::back_inserter(all_ids));
} else {
std::transform(recv_t.rows().begin(), recv_t.rows().end(),
std::back_inserter(all_ids),
[&](int64_t id) { return id * pserver_num + i; });
}
} }
auto *var = scope.FindVar(rpc_ctx.var_name); auto *var = scope.FindVar(rpc_ctx.var_name);
...@@ -146,7 +210,8 @@ void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) { ...@@ -146,7 +210,8 @@ void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) {
template <typename T> template <typename T>
void ParameterRecv<T>::operator()(const CommContext &rpc_ctx, void ParameterRecv<T>::operator()(const CommContext &rpc_ctx,
const framework::Scope &scope, bool barrier) { const framework::Scope &scope,
bool geo_records) {
VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name; VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name;
PADDLE_ENFORCE_GE(rpc_ctx.origin_varnames.size(), 1, PADDLE_ENFORCE_GE(rpc_ctx.origin_varnames.size(), 1,
...@@ -154,18 +219,21 @@ void ParameterRecv<T>::operator()(const CommContext &rpc_ctx, ...@@ -154,18 +219,21 @@ void ParameterRecv<T>::operator()(const CommContext &rpc_ctx,
"origin_varnames.size() >= 1 is permitted")); "origin_varnames.size() >= 1 is permitted"));
if (rpc_ctx.is_sparse) { if (rpc_ctx.is_sparse) {
RecvSelectedRows<T>(rpc_ctx, scope); if (geo_records) {
RecvGeoSparseRecords<T>(rpc_ctx, scope);
} else {
RecvSparseLodTensor<T>(rpc_ctx, scope);
}
} else { } else {
RecvLodTensor<T>(rpc_ctx, scope); RecvLodTensor<T>(rpc_ctx, scope);
} }
VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name;
} }
template <typename T> template <typename T>
void ParameterRecv<T>::operator()(const CommContext &rpc_ctx, void ParameterRecv<T>::operator()(const CommContext &rpc_ctx,
const framework::Scope &scope) { const framework::Scope &scope) {
this->operator()(rpc_ctx, scope, true); this->operator()(rpc_ctx, scope, false);
} }
template struct ParameterRecv<float>; template struct ParameterRecv<float>;
......
...@@ -48,7 +48,9 @@ class FetchBarrierOp : public framework::OperatorBase { ...@@ -48,7 +48,9 @@ class FetchBarrierOp : public framework::OperatorBase {
} }
for (size_t i = 0; i < rets.size(); i++) { for (size_t i = 0; i < rets.size(); i++) {
PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient"); PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U,
platform::errors::Unavailable(
"Internal error occurred in RPCClient."));
} }
} }
}; };
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h"
#include <string>
namespace paddle {
namespace operators {
// Operator class for LargeScaleFuseAdamOp.
//
// InferShape only validates the inputs: "Grad" and "LearningRate" must be
// present, and "LearningRate" must hold exactly one element. No output shape
// is set here.
class LargeScaleFuseAdamOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE_EQ(
        ctx->HasInput("Grad"), true,
        platform::errors::NotFound(
            "Input(Grad) of LargeScaleFuseAdamOp should not be null."));
    PADDLE_ENFORCE_EQ(
        ctx->HasInput("LearningRate"), true,
        platform::errors::NotFound(
            "Input(LearningRate) of LargeScaleFuseAdamOp should not be "
            "null."));

    auto lr_dims = ctx->GetInputDim("LearningRate");
    const auto lr_numel = framework::product(lr_dims);

    // A zero-sized learning rate usually means the startup program has not
    // been run yet, so report that hint separately from the size check.
    PADDLE_ENFORCE_NE(lr_numel, 0,
                      platform::errors::InvalidArgument(
                          "Maybe the Input variable LearningRate has not "
                          "been initialized. You may need to confirm "
                          "if you put exe.run(startup_program) "
                          "after optimizer.minimize function."));
    PADDLE_ENFORCE_EQ(lr_numel, 1,
                      platform::errors::InvalidArgument(
                          "Learning rate should have 1 element, but received "
                          "%d element(s) with dims [%s].",
                          lr_numel, lr_dims));
  }

 protected:
  // The kernel is selected by the data type of "Grad" and the current device
  // context.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Grad");
    return framework::OpKernelType(data_type, ctx.device_context());
  }

  // "LearningRate" keeps its own type, place and layout; every other variable
  // takes the expected kernel data type while keeping its place and layout.
  framework::OpKernelType GetKernelTypeForVar(
      const std::string &var_name, const framework::Tensor &tensor,
      const framework::OpKernelType &expected_kernel_type) const override {
    if (var_name == "LearningRate") {
      return framework::OpKernelType(tensor.type(), tensor.place(),
                                     tensor.layout());
    }
    return framework::OpKernelType(expected_kernel_type.data_type_,
                                   tensor.place(), tensor.layout());
  }
};
// Var-type inference for `lookup_sparse_table_fuse_adam`.
// It only validates the var type of input "Grad"; no output var type is set.
class LargeScaleFuseAdamOpInferVarType : public framework::VarTypeInference {
public:
// Raises InvalidArgument unless "Grad" is SELECTED_ROWS or LOD_TENSOR.
void operator()(framework::InferVarTypeContext *ctx) const override {
auto in_var_type = ctx->GetInputType("Grad");
// The condition is compared against `true` because the enforce macro takes
// a pair of values plus a typed error.
PADDLE_ENFORCE_EQ(in_var_type == framework::proto::VarType::SELECTED_ROWS ||
in_var_type == framework::proto::VarType::LOD_TENSOR,
true, platform::errors::InvalidArgument(
"The input Var's type should be LoDtensor or "
"SelectedRows, but the received type is %s",
in_var_type));
}
};
// Declares the inputs, outputs, attributes and documentation of
// `lookup_sparse_table_fuse_adam`.
class LargeScaleFuseAdamOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Grad",
             "(SelectedRows) The sparse gradient. Its rows are the ids "
             "whose values are updated in the sparse table.");
    AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
    AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
    AddInput("LearningRate",
             "(Tensor) Learning rate, should have exactly 1 element");

    AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator");
    AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator");

    AddAttr<float>("beta1",
                   "(float, default 0.9) "
                   "Exponential decay rate for the "
                   "first moment estimates.")
        .SetDefault(0.9f);
    AddAttr<float>("beta2",
                   "(float, default 0.999) "
                   "exponential decay rate for the "
                   "second moment estimates.")
        .SetDefault(0.999f);
    AddAttr<float>("epsilon",
                   "(float, default 1.0e-8) "
                   "Constant for numerical stability")
        .SetDefault(1.0e-8f);

    // The three attributes below have no default and must be set by the
    // program that creates the op.
    AddAttr<bool>("is_entry",
                  "(bool) "
                  "sparse table need entry");
    AddAttr<std::string>("tablename",
                         "(string) "
                         "sparse table name");
    AddAttr<std::vector<std::string>>(
        "value_names",
        "(strings) "
        "names of the values fetched from the sparse table, in the order: "
        "parameter, first moment, second moment");

    AddComment(R"DOC(
Adam Optimizer.
This implements the Adam optimizer from Section 2 of the Adam
paper : https://arxiv.org/abs/1412.6980.
Adam is a first-order gradient-based optimization method based on
adaptive estimates of lower-order moments.
Adam updates:
$$
moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\
moment\_2\_out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\
learning\_rate = learning\_rate *
\frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\
param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
$$
)DOC");
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Registers the `lookup_sparse_table_fuse_adam` operator together with its
// proto maker, EmptyGradOpMaker for both the static-graph (OpDesc) and the
// imperative (OpBase) modes, and its var-type inference.
// NOTE(review): EmptyGradOpMaker presumably means the op has no gradient
// op - confirm against the framework definition.
REGISTER_OPERATOR(
lookup_sparse_table_fuse_adam, ops::LargeScaleFuseAdamOp,
ops::LargeScaleFuseAdamOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
ops::LargeScaleFuseAdamOpInferVarType);
// Only a CPU kernel for float is registered.
REGISTER_OP_CPU_KERNEL(
lookup_sparse_table_fuse_adam,
ops::LargeScaleFuseAdamOpKernel<paddle::platform::CPUDeviceContext, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <math.h> // for sqrt in CPU and CUDA
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/distributed/large_scale_kv.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace paddle {
namespace operators {
// Primary template of the fused sparse-table Adam kernel.
// Compute() is only declared here; this header defines it solely through the
// CPUDeviceContext partial specialization that follows.
template <typename DeviceContext, typename T>
class LargeScaleFuseAdamOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override;
};
// CPU implementation of the fused sparse-table Adam update.
//
// Steps performed by Compute():
//   1. Merge duplicated rows of the SelectedRows gradient.
//   2. Advance the beta1/beta2 power accumulators.
//   3. Fetch, for every merged row, the value buffers named by the
//      `value_names` attribute from the LargeScaleKV table `tablename`.
//   4. Apply the Adam update in place on those buffers.
//
// The table stores values as std::vector<float>; only T = float is
// registered as a kernel for this op.
template <typename T>
class LargeScaleFuseAdamOpKernel<platform::CPUDeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    using paddle::framework::LoDTensor;

    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
    const auto *grad_var = ctx.InputVar("Grad");
    PADDLE_ENFORCE_EQ(
        grad_var->IsType<framework::SelectedRows>(), true,
        platform::errors::InvalidArgument(
            "in large scale optimize, gradient should only be SelectedRows"));
    const auto &grad = grad_var->Get<framework::SelectedRows>();

    // for distributed training, a sparse var may be empty,
    // just skip updating.
    if (grad.rows().size() == 0) {
      return;
    }

    // Merge duplicated rows so every id is updated exactly once.
    framework::SelectedRows tmp_grad_merge;
    math::scatter::MergeAdd<platform::CPUDeviceContext, T> merge_func;
    merge_func(ctx.template device_context<platform::CPUDeviceContext>(), grad,
               &tmp_grad_merge, true);

    std::vector<int64_t> in_rows;
    in_rows.reserve(tmp_grad_merge.rows().size());
    std::copy(tmp_grad_merge.rows().begin(), tmp_grad_merge.rows().end(),
              std::back_inserter(in_rows));

    const auto *lr = learning_rate->data<T>();
    const auto &grad_v = tmp_grad_merge.value();
    const auto grad_width = grad_v.dims()[1];
    // Hoisted out of the update loop: the buffer does not change.
    const T *grad_data = grad_v.data<T>();

    auto tablename = ctx.Attr<std::string>("tablename");
    auto value_names = ctx.Attr<std::vector<std::string>>("value_names");

    auto *beta1_pow = ctx.Input<LoDTensor>("Beta1Pow");
    auto *beta2_pow = ctx.Input<LoDTensor>("Beta2Pow");
    auto *beta1_pow_out = ctx.Output<LoDTensor>("Beta1PowOut");
    auto *beta2_pow_out = ctx.Output<LoDTensor>("Beta2PowOut");

    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));

    PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1,
                      platform::errors::InvalidArgument(
                          "beta1 pow output size should be 1, but received "
                          "value is:%d.",
                          beta1_pow_out->numel()));
    PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1,
                      platform::errors::InvalidArgument(
                          "beta2 pow output size should be 1, but received "
                          "value is:%d.",
                          beta2_pow_out->numel()));

    // Snapshot the current powers BEFORE writing the outputs. If the output
    // accumulators share storage with the inputs (in-place update), reading
    // them afterwards would make the bias correction below use the powers of
    // the next step instead of the current one.
    const T beta1_pow_ = beta1_pow->data<T>()[0];
    const T beta2_pow_ = beta2_pow->data<T>()[0];

    // update beta1 and beta2
    beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] = beta1 * beta1_pow_;
    beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] = beta2 * beta2_pow_;

    // values[i][k] is the k-th requested value buffer of row in_rows[i].
    std::vector<std::vector<std::vector<float> *>> values;
    std::vector<int64_t> dims;

    auto *ins = distributed::LargeScaleKV::GetInstance();
    auto *table = ins->Get(tablename);
    table->Get(in_rows, value_names, &values);
    table->Dims({"Param"}, &dims);

    PADDLE_ENFORCE_EQ(dims[0], grad_width,
                      platform::errors::InvalidArgument(
                          "param_row should have the same size with grad_row"));

    // Bias-corrected learning rate shared by all rows.
    T lr_ = lr[0];
    lr_ *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_);

    for (size_t i = 0; i < in_rows.size(); i++) {
      // Buffer order follows `value_names`: parameter, moment1, moment2.
      auto *p_data = values[i][0]->data();
      auto *m1_data = values[i][1]->data();
      auto *m2_data = values[i][2]->data();
      const T *g_row = grad_data + grad_width * i;

      for (int64_t x = 0; x < grad_width; ++x) {
        const T g = g_row[x];
        m1_data[x] = beta1 * m1_data[x] + (1 - beta1) * g;
        m2_data[x] = beta2 * m2_data[x] + (1 - beta2) * g * g;
        p_data[x] -= lr_ * (m1_data[x] / (sqrt(m2_data[x]) + epsilon));
      }
    }
  }
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.h"
#include <string>
namespace paddle {
namespace operators {
// Operator definition for `lookup_sparse_table_fuse_sgd`.
//
// Only the "Grad" and "LearningRate" inputs are validated at shape-inference
// time; the op declares no outputs and sets no output shapes.
class LargeScaleFuseSGDOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks that Grad and LearningRate are present and that LearningRate
  // holds exactly one element. Raises a typed EnforceNotMet otherwise.
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE_EQ(
        ctx->HasInput("Grad"), true,
        platform::errors::NotFound(
            "Input(Grad) of LargeScaleFuseSGDOp should not be null."));
    PADDLE_ENFORCE_EQ(
        ctx->HasInput("LearningRate"), true,
        platform::errors::NotFound(
            "Input(LearningRate) of LargeScaleFuseSGDOp should not be null."));

    auto lr_dims = ctx->GetInputDim("LearningRate");
    // A zero-sized learning rate is reported separately because it usually
    // means the startup program has not been run yet.
    PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
                      platform::errors::InvalidArgument(
                          "Maybe the Input variable LearningRate has not "
                          "been initialized. You may need to confirm "
                          "if you put exe.run(startup_program) "
                          "after optimizer.minimize function."));
    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
                      platform::errors::InvalidArgument(
                          "Learning rate should have 1 element"));
  }

 protected:
  // The kernel data type follows the data type of "Grad".
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Grad");
    return framework::OpKernelType(data_type, ctx.device_context());
  }

  // LearningRate keeps its own type/place/layout; every other variable keeps
  // its place and layout but takes the expected kernel data type.
  framework::OpKernelType GetKernelTypeForVar(
      const std::string &var_name, const framework::Tensor &tensor,
      const framework::OpKernelType &expected_kernel_type) const override {
    if (var_name == "LearningRate") {
      return framework::OpKernelType(tensor.type(), tensor.place(),
                                     tensor.layout());
    }
    return framework::OpKernelType(expected_kernel_type.data_type_,
                                   tensor.place(), tensor.layout());
  }
};
// Var-type inference for `lookup_sparse_table_fuse_sgd`.
// It only validates the var type of input "Grad"; no output var type is set.
class LargeScaleFuseSGDOpInferVarType : public framework::VarTypeInference {
public:
// Raises InvalidArgument unless "Grad" is SELECTED_ROWS or LOD_TENSOR.
void operator()(framework::InferVarTypeContext *ctx) const override {
auto in_var_type = ctx->GetInputType("Grad");
// The condition is compared against `true` because the enforce macro takes
// a pair of values plus a typed error.
PADDLE_ENFORCE_EQ(in_var_type == framework::proto::VarType::SELECTED_ROWS ||
in_var_type == framework::proto::VarType::LOD_TENSOR,
true, platform::errors::InvalidArgument(
"The input Var's type should be LoDtensor or "
"SelectedRows, but the received type is %s",
in_var_type));
}
};
// Declares the inputs, attributes and documentation of
// `lookup_sparse_table_fuse_sgd`. The op has no outputs: the update is
// applied to the values stored in the sparse table.
class LargeScaleFuseSGDOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Grad",
             "(SelectedRows) The sparse gradient. Its rows are the ids "
             "whose values are updated in the sparse table.");
    AddInput("LearningRate", "(Tensor) Learning rate of SGD");

    // The three attributes below have no default and must be set by the
    // program that creates the op.
    AddAttr<bool>("is_entry",
                  "(bool) "
                  "sparse table need entry");
    AddAttr<std::string>("tablename",
                         "(string) "
                         "sparse table name");
    AddAttr<std::vector<std::string>>(
        "value_names",
        "(strings) "
        "names of the values fetched from the sparse table; the first one "
        "is the parameter");

    AddComment(R"DOC(
LargeScaleFuseSGD operator
This operator implements one step of the stochastic gradient descent algorithm.
$$param\_out = param - learning\_rate * grad$$
)DOC");
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Registers the `lookup_sparse_table_fuse_sgd` operator together with its
// proto maker, EmptyGradOpMaker for both the static-graph (OpDesc) and the
// imperative (OpBase) modes, and its var-type inference.
// NOTE(review): EmptyGradOpMaker presumably means the op has no gradient
// op - confirm against the framework definition.
REGISTER_OPERATOR(
lookup_sparse_table_fuse_sgd, ops::LargeScaleFuseSGDOp,
ops::LargeScaleFuseSGDOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
ops::LargeScaleFuseSGDOpInferVarType);
// Only a CPU kernel for float is registered.
REGISTER_OP_CPU_KERNEL(
lookup_sparse_table_fuse_sgd,
ops::LargeScaleFuseSGDOpKernel<paddle::platform::CPUDeviceContext, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/distributed/large_scale_kv.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace paddle {
namespace operators {
// Primary template of the fused sparse-table SGD kernel.
// Compute() is only declared here; this header defines it solely through the
// CPUDeviceContext partial specialization that follows.
template <typename DeviceContext, typename T>
class LargeScaleFuseSGDOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override;
};
// CPU implementation of the fused sparse-table SGD update.
//
// Steps performed by Compute():
//   1. Merge duplicated rows of the SelectedRows gradient.
//   2. Fetch, for every merged row, the value buffers named by the
//      `value_names` attribute from the LargeScaleKV table `tablename`.
//   3. Scale the merged gradient by the learning rate and subtract it from
//      the first value buffer (the parameter) of each row, in place.
//
// The table stores values as std::vector<float>; only T = float is
// registered as a kernel for this op.
template <typename T>
class LargeScaleFuseSGDOpKernel<platform::CPUDeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
    const auto *grad_var = ctx.InputVar("Grad");
    PADDLE_ENFORCE_EQ(
        grad_var->IsType<framework::SelectedRows>(), true,
        platform::errors::InvalidArgument(
            "in large scale optimize, gradient should only be SelectedRows"));
    const auto &grad = grad_var->Get<framework::SelectedRows>();

    // for distributed training, a sparse var may be empty,
    // just skip updating.
    if (grad.rows().size() == 0) {
      return;
    }

    // Merge duplicated rows so every id is updated exactly once.
    framework::SelectedRows tmp_grad_merge;
    math::scatter::MergeAdd<platform::CPUDeviceContext, T> merge_func;
    merge_func(ctx.template device_context<platform::CPUDeviceContext>(), grad,
               &tmp_grad_merge, true);

    std::vector<int64_t> in_rows;
    in_rows.reserve(tmp_grad_merge.rows().size());
    std::copy(tmp_grad_merge.rows().begin(), tmp_grad_merge.rows().end(),
              std::back_inserter(in_rows));

    const auto *lr = learning_rate->data<T>();
    auto grad_v = tmp_grad_merge.value();
    auto grad_width = grad_v.dims()[1];

    auto tablename = ctx.Attr<std::string>("tablename");
    auto value_names = ctx.Attr<std::vector<std::string>>("value_names");

    // values[i][k] is the k-th requested value buffer of row in_rows[i].
    std::vector<std::vector<std::vector<float> *>> values;
    std::vector<int64_t> dims;

    auto *ins = distributed::LargeScaleKV::GetInstance();
    auto *table = ins->Get(tablename);
    table->Get(in_rows, value_names, &values);
    table->Dims({"Param"}, &dims);

    PADDLE_ENFORCE_EQ(dims[0], grad_width,
                      platform::errors::InvalidArgument(
                          "param_row should have the same size with grad_row"));

    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);

    // Work on a copy of the merged gradient: grads = lr * grad.
    std::vector<T> grads;
    framework::TensorToVector(grad_v, ctx.device_context(), &grads);
    blas.SCAL(grads.size(), lr[0], grads.data());

    // param_row -= lr * grad_row, written back into the table buffer.
    for (size_t row = 0; row < in_rows.size(); ++row) {
      auto &params = values[row][0];
      blas.VSUB(grad_width, params->data(), grads.data() + grad_width * row,
                params->data());
    }
  }
};
} // namespace operators
} // namespace paddle
...@@ -37,12 +37,6 @@ class RecvOp : public framework::OperatorBase { ...@@ -37,12 +37,6 @@ class RecvOp : public framework::OperatorBase {
void RunImpl(const framework::Scope &scope, void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override { const platform::Place &place) const override {
int do_not_run = Attr<int>("do_not_run");
if (do_not_run) {
VLOG(3) << "recv do not run!";
return;
}
std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap"); std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
std::vector<std::string> varnames = std::vector<std::string> varnames =
Attr<std::vector<std::string>>("varnames"); Attr<std::vector<std::string>>("varnames");
...@@ -63,11 +57,10 @@ class RecvOp : public framework::OperatorBase { ...@@ -63,11 +57,10 @@ class RecvOp : public framework::OperatorBase {
if (recv_varnames.size() > 0) { if (recv_varnames.size() > 0) {
auto *communicator = distributed::Communicator::GetInstance(); auto *communicator = distributed::Communicator::GetInstance();
if (communicator == nullptr) { if (communicator != nullptr) {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"need run fleet.init_worker first")); "execute startup program must before fleet.init_worker"));
} }
communicator->RecvNoBarrier();
} else { } else {
std::vector<distributed::VarHandlePtr> rets; std::vector<distributed::VarHandlePtr> rets;
if (with_barrier) { if (with_barrier) {
......
...@@ -34,16 +34,16 @@ inline bool NeedSend(const framework::Scope& scope, ...@@ -34,16 +34,16 @@ inline bool NeedSend(const framework::Scope& scope,
std::string::npos) std::string::npos)
return false; return false;
auto* var = scope.FindVar(varname); auto* var = scope.FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.", PADDLE_ENFORCE_NOT_NULL(
varname); var, platform::errors::NotFound(
"Can not find variable '%s' in the send side.", varname));
if (var->IsType<framework::LoDTensor>()) { if (var->IsType<framework::LoDTensor>()) {
return var->Get<framework::LoDTensor>().IsInitialized(); return var->Get<framework::LoDTensor>().IsInitialized();
} else if (var->IsType<framework::SelectedRows>()) { } else if (var->IsType<framework::SelectedRows>()) {
return var->Get<framework::SelectedRows>().rows().size() > 0UL; return var->Get<framework::SelectedRows>().rows().size() > 0UL;
} else { } else {
PADDLE_THROW( PADDLE_THROW(platform::errors::Unimplemented(
"Variable type in send side should be in " "Variable type in send side should be LodTensor or SelectedRows."));
"[LodTensor, SelectedRows]");
} }
return false; return false;
} }
......
...@@ -8,7 +8,8 @@ register_operators(EXCLUDES ...@@ -8,7 +8,8 @@ register_operators(EXCLUDES
multihead_matmul_op multihead_matmul_op
fused_embedding_eltwise_layernorm_op fused_embedding_eltwise_layernorm_op
fusion_group_op fusion_group_op
fusion_gru_op) fusion_gru_op
fused_bn_add_activation_op)
# fusion_gru_op does not have CUDA kernel # fusion_gru_op does not have CUDA kernel
op_library(fusion_gru_op) op_library(fusion_gru_op)
...@@ -47,4 +48,9 @@ if (WITH_GPU) ...@@ -47,4 +48,9 @@ if (WITH_GPU)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_group);\n") file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_group);\n")
cc_test(test_fusion_group_op SRCS fusion_group_op_test.cc DEPS fusion_group_op) cc_test(test_fusion_group_op SRCS fusion_group_op_test.cc DEPS fusion_group_op)
endif() endif()
# fused_bn_add_activation
if (NOT ${CUDNN_VERSION} VERSION_LESS 7401)
op_library(fused_bn_add_activation_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_bn_add_activation);\n")
endif()
endif() endif()
...@@ -204,6 +204,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -204,6 +204,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
auto x_dims = framework::vectorize(transformed_input.dims()); auto x_dims = framework::vectorize(transformed_input.dims());
auto f_dims = framework::vectorize(filter->dims()); auto f_dims = framework::vectorize(filter->dims());
if (!exhaustive_search) { if (!exhaustive_search) {
#if CUDNN_VERSION >= 8000
int perf_count; int perf_count;
int best_algo_idx = 0; int best_algo_idx = 0;
size_t tmp_size = 0; size_t tmp_size = 0;
...@@ -215,13 +216,20 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -215,13 +216,20 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
cudnn_output_desc, kNUM_CUDNN_FWD_ALGS, &perf_count, cudnn_output_desc, kNUM_CUDNN_FWD_ALGS, &perf_count,
perf_results.get())); perf_results.get()));
algo = (perf_results.get())[best_algo_idx].algo; algo = (perf_results.get())[best_algo_idx].algo;
VLOG(3) << "cuDNN forward algo " << algo;
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, algo, &workspace_size_in_bytes)); cudnn_output_desc, algo, &workspace_size_in_bytes));
if (workspace_size_in_bytes > workspace_size_limit) if (workspace_size_in_bytes > workspace_size_limit)
workspace_size_limit = workspace_size_in_bytes; workspace_size_limit = workspace_size_in_bytes;
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
VLOG(3) << "cuDNN forward algo " << algo;
#endif
} else { } else {
std::function<cudnnConvolutionFwdAlgo_t()> search_func = std::function<cudnnConvolutionFwdAlgo_t()> search_func =
[&]() -> cudnnConvolutionFwdAlgo_t { [&]() -> cudnnConvolutionFwdAlgo_t {
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
// Shape inference for `fused_bn_add_activation`.
//
// Requires inputs X, Z, Scale, Bias and outputs Y, MeanOut, VarianceOut,
// SavedMean, SavedVariance. X and Z must have identical shapes of rank 2..5;
// the channel count C is taken from the LAST dimension of X. Scale and Bias
// must be rank-1 tensors of size C. Y takes the shape (and LoD) of X and the
// four statistics outputs take shape {C}.
void FusedBatchNormAddActOp::InferShape(
    framework::InferShapeContext *ctx) const {
  // check input
  OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedBatchNormAddActOp");
  OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", "FusedBatchNormAddActOp");
  OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale",
                 "FusedBatchNormAddActOp");
  OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias",
                 "FusedBatchNormAddActOp");

  // check output
  OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "FusedBatchNormAddActOp");
  OP_INOUT_CHECK(ctx->HasOutput("MeanOut"), "Output", "MeanOut",
                 "FusedBatchNormAddActOp");
  OP_INOUT_CHECK(ctx->HasOutput("VarianceOut"), "Output", "VarianceOut",
                 "FusedBatchNormAddActOp");
  OP_INOUT_CHECK(ctx->HasOutput("SavedMean"), "Output", "SavedMean",
                 "FusedBatchNormAddActOp");
  OP_INOUT_CHECK(ctx->HasOutput("SavedVariance"), "Output", "SavedVariance",
                 "FusedBatchNormAddActOp");

  const auto x_dims = ctx->GetInputDim("X");
  const auto z_dims = ctx->GetInputDim("Z");
  // The second operand is Z; the message names it accordingly.
  PADDLE_ENFORCE_EQ(x_dims, z_dims,
                    platform::errors::InvalidArgument(
                        "ShapeError: the shapes of input "
                        "must be equal. But received: the shape "
                        "of input X = [%s], and the shape of "
                        "input Z = [%s]",
                        x_dims, z_dims));
  PADDLE_ENFORCE_GE(x_dims.size(), 2,
                    platform::errors::InvalidArgument(
                        "ShapeError: the dimensions of input "
                        "must greater than or equal to 2. "
                        "But received: the shape of input "
                        "= [%s], the dimension of input = "
                        "[%d]",
                        x_dims, x_dims.size()));
  PADDLE_ENFORCE_LE(x_dims.size(), 5,
                    platform::errors::InvalidArgument(
                        "ShapeError: the dimensions of input "
                        "must smaller than or equal to 5. "
                        "But received: the shape of input "
                        "= [%s], the dimension of input = "
                        "[%d]",
                        x_dims, x_dims.size()));

  // Channels-last: C is the size of the last dimension.
  const int64_t C = x_dims[x_dims.size() - 1];

  auto scale_dim = ctx->GetInputDim("Scale");
  auto bias_dim = ctx->GetInputDim("Bias");
  PADDLE_ENFORCE_EQ(
      scale_dim.size(), 1UL,
      platform::errors::InvalidArgument(
          "ShapeError: the dimension of scale must equal to 1. "
          "But received: the shape of scale is [%s], the dimension "
          "of scale is [%d]",
          scale_dim, scale_dim.size()));
  PADDLE_ENFORCE_EQ(
      bias_dim.size(), 1UL,
      platform::errors::InvalidArgument(
          "ShapeError: the dimension of bias must equal to 1. "
          "But received: the shape of bias is [%s], the dimension "
          "of bias is [%d]",
          bias_dim, bias_dim.size()));

  // Outside runtime a non-positive element count means the size is not yet
  // known, so the size checks against C are skipped in that case.
  bool check = true;
  if ((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 ||
                              framework::product(bias_dim) <= 0)) {
    check = false;
  }

  if (check) {
    PADDLE_ENFORCE_EQ(scale_dim[0], C,
                      platform::errors::InvalidArgument(
                          "ShapeError: the shape of scale must equal to [%d]. "
                          "But received: the shape of scale is [%d]",
                          C, scale_dim[0]));
    PADDLE_ENFORCE_EQ(bias_dim[0], C,
                      platform::errors::InvalidArgument(
                          "ShapeError: the shape of bias must equal to [%d]. "
                          "But received: the shape of bias is [%d]",
                          C, bias_dim[0]));
  }

  ctx->SetOutputDim("Y", x_dims);
  ctx->SetOutputDim("MeanOut", {C});
  ctx->SetOutputDim("VarianceOut", {C});
  ctx->SetOutputDim("SavedMean", {C});
  ctx->SetOutputDim("SavedVariance", {C});
  ctx->ShareLoD("X", "Y");
}
// Selects the kernel type from the data type of X, after verifying that the
// Scale and Bias inputs are FP32. The result always uses the "any" layout
// and the plain library.
framework::OpKernelType FusedBatchNormAddActOp::GetExpectedKernelType(
    const framework::ExecutionContext &ctx) const {
  // Resolved first so that a problem with X is reported before the
  // Scale/Bias type checks run.
  const auto x_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X");

  // By default, the type of the scale, bias, mean,
  // and var tensors should be float when input tensor's dtype is float16.
  auto bn_param_type = framework::proto::VarType::FP32;
  PADDLE_ENFORCE_EQ(
      bn_param_type, ctx.Input<Tensor>("Scale")->type(),
      platform::errors::InvalidArgument("Scale input should be of float type"));
  PADDLE_ENFORCE_EQ(
      bn_param_type, ctx.Input<Tensor>("Bias")->type(),
      platform::errors::InvalidArgument("Bias input should be of float type"));

  return framework::OpKernelType(x_dtype, ctx.GetPlace(),
                                 framework::DataLayout::kAnyLayout,
                                 framework::LibraryType::kPlain);
}
// Declares the proto of `fused_bn_add_activation`: four inputs (X, Z, Scale,
// Bias), six outputs, and the attributes momentum, epsilon and act_type.
// Declaration order and all strings below are part of the op proto.
void FusedBatchNormAddActOpMaker::Make() {
AddInput("X", "The input tensor");
AddInput("Z", "The input tensor");
AddInput("Scale",
"Scale is a 1-dimensional tensor of size C "
"that is applied to the output");
AddInput("Bias",
"Bias is a 1-dimensional tensor of size C "
"that is applied to the output");
AddOutput("Y", "result after normalization");
AddOutput("MeanOut",
"Share memory with Mean. "
"Store the global mean when training");
AddOutput("VarianceOut",
"Share memory with Variance. "
"Store the global Variance when training");
// SavedMean / SavedVariance are marked as intermediate outputs.
AddOutput("SavedMean",
"Mean of the current mini batch, "
"will apply to output when training")
.AsIntermediate();
AddOutput("SavedVariance",
"Variance of the current mini batch, "
"will apply to output when training")
.AsIntermediate();
AddOutput("ReserveSpace",
"Reserve GPU space for triggering the new semi-persistent "
"NHWC kernel");
// NOTE(review): momentum and epsilon are registered with empty
// descriptions, so the generated op documentation says nothing about them.
AddAttr<float>("momentum", "").SetDefault(0.9);
// epsilon defaults to 1e-5 and is rejected outside [0.0, 0.001].
AddAttr<float>("epsilon", "")
.SetDefault(1e-5)
.AddCustomChecker([](const float &epsilon) {
PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true,
platform::errors::InvalidArgument(
"'epsilon' should be between 0.0 and 0.001."));
});
AddAttr<std::string>("act_type", "The activation type to be fused.")
.SetDefault("relu");
AddComment(R"DOC(
Fused Batch Normalization with activation.
Batch Norm has been implemented as discussed in the paper:
https://arxiv.org/pdf/1502.03167.pdf
Batch Norm can be used as a normalizer function for conv2d and fully_connected operations.
Now, the required data format for FusedBatchNormAddActOp is NHWC `[batch, in_height, in_width, in_channels]`.
)DOC");
}
// Shape inference for `fused_bn_add_activation_grad`.
//
// Requires inputs X, Z, Scale, SavedMean, SavedVariance and Y@GRAD, and
// outputs X@GRAD, Z@GRAD, Scale@GRAD and Bias@GRAD. X@GRAD and Z@GRAD take
// the shape of X; Scale@GRAD and Bias@GRAD take shape {C}, where C is the
// size of the last dimension of X.
void FusedBatchNormAddActGradOp::InferShape(
    framework::InferShapeContext *ctx) const {
  // check input
  OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
                 "FusedBatchNormAddActGradOp");
  OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z",
                 "FusedBatchNormAddActGradOp");
  OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale",
                 "FusedBatchNormAddActGradOp");
  OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean",
                 "FusedBatchNormAddActGradOp");
  OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance",
                 "FusedBatchNormAddActGradOp");
  OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input",
                 framework::GradVarName("Y"), "FusedBatchNormAddActGradOp");

  // check output
  OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
                 framework::GradVarName("X"), "FusedBatchNormAddActGradOp");
  OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Z")), "Output",
                 framework::GradVarName("Z"), "FusedBatchNormAddActGradOp");
  OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Scale")), "Output",
                 framework::GradVarName("Scale"), "FusedBatchNormAddActGradOp");
  OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Bias")), "Output",
                 framework::GradVarName("Bias"), "FusedBatchNormAddActGradOp");

  const auto in_dims = ctx->GetInputDim("X");
  // Kept as int64_t, as in the forward op's InferShape, so the dimension is
  // not implicitly narrowed to int.
  const int64_t C = in_dims[in_dims.size() - 1];

  ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
  ctx->SetOutputDim(framework::GradVarName("Z"), in_dims);
  ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
  ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
}
// Selects the kernel type for the gradient op from the data type of X.
// Before doing so it verifies that Y@GRAD exists in the execution context
// and holds either a Tensor or a LoDTensor; otherwise NotFound is thrown.
framework::OpKernelType FusedBatchNormAddActGradOp::GetExpectedKernelType(
    const framework::ExecutionContext &ctx) const {
  const auto *dy_var = ctx.InputVar(framework::GradVarName("Y"));
  if (dy_var == nullptr) {
    PADDLE_THROW(platform::errors::NotFound(
        "Can not find Y@GRAD in the execution context."));
  }

  // The tensor itself is not needed here; only its presence is checked.
  const bool holds_tensor =
      dy_var->IsType<Tensor>() || dy_var->IsType<LoDTensor>();
  if (!holds_tensor) {
    PADDLE_THROW(
        platform::errors::NotFound("Can not get the tensor value of Y@GRAD."));
  }

  return framework::OpKernelType(
      OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(),
      framework::DataLayout::kAnyLayout, framework::LibraryType::kPlain);
}
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Registers the forward op `fused_bn_add_activation` with its proto maker,
// var-type inference and the grad-op makers for both the static-graph
// (OpDesc) and the imperative (OpBase) modes.
REGISTER_OPERATOR(
fused_bn_add_activation, ops::FusedBatchNormAddActOp,
ops::FusedBatchNormAddActOpMaker, ops::FusedBatchNormAddActOpInferVarType,
ops::FusedBatchNormAddActGradOpMaker<paddle::framework::OpDesc>,
ops::FusedBatchNormAddActGradOpMaker<paddle::imperative::OpBase>);
// Registers the backward op; no maker or grad maker is attached to it.
REGISTER_OPERATOR(fused_bn_add_activation_grad,
ops::FusedBatchNormAddActGradOp);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <cfloat>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
DECLARE_bool(cudnn_batchnorm_spatial_persistent);
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
// CUDA forward kernel of the `fused_bn_add_activation` op.
//
// Runs cuDNN's fused "batch norm + add + activation" training forward pass
// (CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION) on inputs X and Z and writes Y.
//
// Inputs : X, Z, Scale, Bias
// Outputs: Y, MeanOut, VarianceOut, SavedMean, SavedVariance, ReserveSpace
// Attrs  : epsilon (float), momentum (float), act_type (string)
//
// X is always interpreted as NHWC; the batch-norm mode is always
// CUDNN_BATCHNORM_SPATIAL_PERSISTENT.
template <typename T>
class FusedBatchNormAddActKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx.GetPlace()), true,
        platform::errors::PreconditionNotMet("It must use CUDAPlace."));
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
    float momentum = ctx.Attr<float>("momentum");
    std::string act_type = ctx.Attr<std::string>("act_type");

    // cuDNN rejects an epsilon below CUDNN_BN_MIN_EPSILON, so clamp it and
    // report that the user-provided value was overridden.
    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
      LOG(ERROR) << "Provided epsilon is smaller than "
                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
                 << "CUDNN_BN_MIN_EPSILON instead.";
    }
    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);

    // Get the size for each dimension.
    // NHWC [batch_size, in_height, in_width, in_channels]
    const auto *x = ctx.Input<Tensor>("X");
    const auto *z = ctx.Input<Tensor>("Z");
    const auto &in_dims = x->dims();

    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *bias = ctx.Input<Tensor>("Bias");

    // Allocate every output on the current place before handing the raw
    // pointers to cuDNN.
    auto *mean_out = ctx.Output<Tensor>("MeanOut");
    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
    mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
    variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());

    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());

    auto *y = ctx.Output<Tensor>("Y");
    y->mutable_data<T>(ctx.GetPlace());

    int N, C, H, W, D;
    const DataLayout data_layout = DataLayout::kNHWC;
    ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D);

    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();

    // ------------------- cudnn descriptors ---------------------
    auto handle = dev_ctx.cudnn_handle();
    cudnnTensorDescriptor_t data_desc_;
    cudnnTensorDescriptor_t bn_param_desc_;
    cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;

    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));

    // Dimensions are listed in N, C, H, W, D order while the strides describe
    // the channel-last (NHWC) memory layout, i.e. C has stride 1.
    std::vector<int> dims = {N, C, H, W, D};
    std::vector<int> strides = {H * W * D * C, 1, W * D * C, D * C, C};

    // The descriptor gets at least 4 dimensions, even for lower-rank inputs.
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
        data_desc_, CudnnDataType<T>::type,
        in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data()));

    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
                                                         data_desc_, mode_));

    // Passed to cuDNN as the update factor of the running statistics.
    double this_factor = 1. - momentum;
    cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION;
    platform::ScopedActivationDescriptor scope_act_desc;
    cudnnActivationDescriptor_t activation_desc_ =
        scope_act_desc.descriptor<T>(act_type);
    size_t workspace_size = 0;
    size_t reserve_space_size = 0;
    void *reserve_space_ptr = nullptr;
    void *workspace_ptr = nullptr;
    Tensor workspace_tensor;

    // Create reserve space and workspace for batch norm.
    // Create tensor for each batchnorm op, it will be used in the
    // backward. Thus this tensor shouldn't be temp.
    auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
    PADDLE_ENFORCE_NOT_NULL(
        reserve_space,
        platform::errors::NotFound("The argument ReserveSpace of "
                                   "fused_bn_add_activation op is not found."));

    // --------------- cudnn batchnorm workspace ---------------
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::
            cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
                /*handle=*/handle,
                /*mode=*/mode_,
                /*bnOps=*/bnOps_,
                /*xDesc=*/data_desc_,
                /*zDesc=*/data_desc_,
                /*yDesc=*/data_desc_,
                /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
                /*activationDesc=*/activation_desc_,
                /*sizeInBytes=*/&workspace_size));

    // -------------- cudnn batchnorm reserve space --------------
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
            /*handle=*/handle,
            /*mode=*/mode_,
            /*bnOps=*/bnOps_,
            /*activationDesc=*/activation_desc_,
            /*xDesc=*/data_desc_,
            /*sizeInBytes=*/&reserve_space_size));

    // ReserveSpace is an op output (kept for the backward pass); the
    // workspace lives in a local tensor and is released when Compute returns.
    reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(),
                                                    reserve_space_size);
    workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
                                                  workspace_size);

    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
            /*handle=*/handle,
            /*mode=*/mode_,
            /*bnOps=*/bnOps_,
            /*alpha=*/CudnnDataType<T>::kOne(),
            /*beta=*/CudnnDataType<T>::kZero(),
            /*xDesc=*/data_desc_,
            /*xData=*/x->template data<T>(),
            /*zDesc=*/data_desc_,
            /*zData=*/z->template data<T>(),
            /*yDesc=*/data_desc_,
            /*yData=*/y->template data<T>(),
            /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
            /*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
            /*bnBiasData=*/bias->template data<BatchNormParamType<T>>(),
            /*exponentialAverageFactor=*/this_factor,
            /*resultRunningMean=*/
            mean_out->template mutable_data<BatchNormParamType<T>>(
                ctx.GetPlace()),
            /*resultRunningVariance=*/
            variance_out->template mutable_data<BatchNormParamType<T>>(
                ctx.GetPlace()),
            /*epsilon=*/epsilon,
            /*resultSaveMean=*/
            saved_mean->template mutable_data<BatchNormParamType<T>>(
                ctx.GetPlace()),
            /*resultSaveInvVariance=*/
            saved_variance->template mutable_data<BatchNormParamType<T>>(
                ctx.GetPlace()),
            /*activationDesc=*/activation_desc_,
            /*workspace=*/workspace_ptr,
            /*workSpaceSizeInBytes=*/workspace_size,
            /*reserveSpace=*/reserve_space_ptr,
            /*reserveSpaceSizeInBytes=*/reserve_space_size));

    // clean when exit.
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
  }
};
// CUDA backward kernel of the `fused_bn_add_activation` op.
//
// Runs cuDNN's fused "batch norm + add + activation" backward pass
// (CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION).
//
// Inputs : X, Z, Y, Y@GRAD, Scale, Bias, SavedMean, SavedVariance,
//          ReserveSpace (produced by the forward kernel)
// Outputs: X@GRAD, Z@GRAD, Scale@GRAD, Bias@GRAD
// Attrs  : epsilon (float), act_type (string)
//
// X is always interpreted as NHWC; the batch-norm mode is always
// CUDNN_BATCHNORM_SPATIAL_PERSISTENT.
template <typename T>
class FusedBatchNormAddActGradKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx.GetPlace()), true,
        platform::errors::PreconditionNotMet("It must use CUDAPlace."));
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
    std::string act_type = ctx.Attr<std::string>("act_type");

    const auto *x = ctx.Input<Tensor>("X");
    const auto *z = ctx.Input<Tensor>("Z");
    const auto *y = ctx.Input<Tensor>("Y");
    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *bias = ctx.Input<Tensor>("Bias");
    const auto *reserve_space = ctx.Input<Tensor>("ReserveSpace");

    const auto &in_dims = x->dims();

    int N, C, H, W, D;
    const DataLayout data_layout = DataLayout::kNHWC;
    ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D);

    // init output
    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *d_z = ctx.Output<Tensor>(framework::GradVarName("Z"));
    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));

    // Validate the data gradients before dereferencing them, in the same way
    // the parameter gradients are validated below; a missing output would
    // otherwise be a null-pointer dereference instead of a readable error.
    PADDLE_ENFORCE_NOT_NULL(
        d_x, platform::errors::PreconditionNotMet(
                 "The grad of Input(X) of fused_bn_add_activation_grad op "
                 "must not be null."));
    PADDLE_ENFORCE_NOT_NULL(
        d_z, platform::errors::PreconditionNotMet(
                 "The grad of Input(Z) of fused_bn_add_activation_grad op "
                 "must not be null."));
    d_x->mutable_data<T>(ctx.GetPlace());
    d_z->mutable_data<T>(ctx.GetPlace());

    PADDLE_ENFORCE_EQ(
        d_scale && d_bias, true,
        platform::errors::PreconditionNotMet(
            "Both the scale grad and the bias grad must not be null."));
    d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
    d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());

    // Scale must be a 1-D tensor with one entry per channel of X.
    PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL,
                      platform::errors::PreconditionNotMet(
                          "The scale only has one dimension."));
    PADDLE_ENFORCE_EQ(
        scale->dims()[0], C,
        platform::errors::PreconditionNotMet(
            "The size of scale is equal to the channel of Input(X)."));

    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();

    // Dimensions are listed in N, C, H, W, D order while the strides describe
    // the channel-last (NHWC) memory layout, i.e. C has stride 1.
    std::vector<int> dims = {N, C, H, W, D};
    std::vector<int> strides = {H * W * C * D, 1, W * D * C, D * C, C};

    // ------------------- cudnn descriptors ---------------------
    cudnnTensorDescriptor_t data_desc_;
    cudnnTensorDescriptor_t bn_param_desc_;
    cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;

    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));

    // cuDNN rejects an epsilon below CUDNN_BN_MIN_EPSILON, so clamp it and
    // report that the user-provided value was overridden.
    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
      LOG(ERROR) << "Provided epsilon is smaller than "
                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
                 << "CUDNN_BN_MIN_EPSILON instead.";
    }
    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);

    // The descriptor gets at least 4 dimensions, even for lower-rank inputs.
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
        data_desc_, CudnnDataType<T>::type,
        in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data()));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
                                                         data_desc_, mode_));

    // Statistics saved by the forward pass.
    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
    const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
    const auto *saved_mean_data =
        saved_mean->template data<BatchNormParamType<T>>();
    const auto *saved_var_data =
        saved_var->template data<BatchNormParamType<T>>();

    size_t workspace_size = 0;
    void *workspace_ptr = nullptr;
    Tensor workspace_tensor;
    // The reserve space was sized and filled by the forward kernel; reuse its
    // byte size as-is.
    auto reserve_space_size = reserve_space->memory_size();
    cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION;
    platform::ScopedActivationDescriptor scope_act_desc;
    cudnnActivationDescriptor_t activation_desc_ =
        scope_act_desc.descriptor<T>(act_type);

    // --------------- cudnn batchnorm workspace ---------------
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize(
            /*handle=*/dev_ctx.cudnn_handle(),
            /*mode=*/mode_,
            /*bnOps=*/bnOps_,
            /*xDesc=*/data_desc_,
            /*yDesc=*/data_desc_,
            /*dyDesc=*/data_desc_,
            /*dzDesc=*/data_desc_,
            /*dxDesc=*/data_desc_,
            /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
            /*activationDesc=*/activation_desc_,
            /*sizeInBytes=*/&workspace_size));

    // The workspace lives in a local tensor and is released when Compute
    // returns.
    workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
                                                  workspace_size);

    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnBatchNormalizationBackwardEx(
            /*handle=*/dev_ctx.cudnn_handle(),
            /*mode=*/mode_,
            /*bnOps=*/bnOps_,
            /*alphaDataDiff=*/CudnnDataType<T>::kOne(),
            /*betaDataDiff=*/CudnnDataType<T>::kZero(),
            /*alphaParamDiff=*/CudnnDataType<T>::kOne(),
            /*betaParamDiff=*/CudnnDataType<T>::kZero(),
            /*xDesc=*/data_desc_,
            /*xData=*/x->template data<T>(),
            /*yDesc=*/data_desc_,
            /*yData=*/y->template data<T>(),
            /*dyDesc=*/data_desc_,
            /*dyData=*/d_y->template data<T>(),
            /*dzDesc=*/data_desc_,
            /*dzData=*/d_z->template data<T>(),
            /*dxDesc=*/data_desc_,
            /*dxData=*/d_x->template data<T>(),
            /*dBnScaleBiasDesc=*/bn_param_desc_,
            /*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
            /*bnBiasData=*/bias->template data<BatchNormParamType<T>>(),
            /*dBnScaleData=*/d_scale->template data<BatchNormParamType<T>>(),
            /*dBnBiasData=*/d_bias->template data<BatchNormParamType<T>>(),
            /*epsilon=*/epsilon,
            /*savedMean=*/saved_mean_data,
            /*savedInvVariance=*/saved_var_data,
            /*activationDesc=*/activation_desc_,
            /*workspace=*/workspace_ptr,
            /*workSpaceSizeInBytes=*/workspace_size,
            // cuDNN takes a non-const pointer here, hence the const_cast on
            // the input tensor's data.
            /*reserveSpace=*/const_cast<T *>(reserve_space->template data<T>()),
            /*reserveSpaceSizeInBytes=*/reserve_space_size));

    // clean when exit.
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
  }
};
} // namespace operators
} // namespace paddle
// Register the CUDA kernels only when building against cuDNN >= 7.4.1.
// NOTE(review): presumably this is the first cuDNN release that provides the
// cudnnBatchNormalization*Ex API used by the kernels above - confirm.
#if CUDNN_VERSION >= 7401
namespace ops = paddle::operators;
namespace plat = paddle::platform;
// Only the float16 instantiation of the forward kernel is registered.
REGISTER_OP_CUDA_KERNEL(
fused_bn_add_activation,
ops::FusedBatchNormAddActKernel<plat::CUDADeviceContext, plat::float16>);
// Only the float16 instantiation of the backward kernel is registered.
REGISTER_OP_CUDA_KERNEL(fused_bn_add_activation_grad,
ops::FusedBatchNormAddActGradKernel<
plat::CUDADeviceContext, plat::float16>);
#endif
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/var_type_inference.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
// Operator class registered as the forward op `fused_bn_add_activation`.
// Only declarations live here; both overrides are defined out of line.
class FusedBatchNormAddActOp : public framework::OperatorWithKernel {
public:
// Reuse the base-class constructors.
using framework::OperatorWithKernel::OperatorWithKernel;
// Shape inference for the op's outputs (defined out of line).
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
// Selects the kernel type (data type, place, layout, library) to run with
// (defined out of line).
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
// Operator class registered as the backward op
// `fused_bn_add_activation_grad`. Only declarations live here; both
// overrides are defined out of line.
class FusedBatchNormAddActGradOp : public framework::OperatorWithKernel {
public:
// Reuse the base-class constructors.
using framework::OperatorWithKernel::OperatorWithKernel;
// Shape inference for the gradient outputs (defined out of line).
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
// Selects the kernel type to run the backward op with (defined out of line).
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
// Proto and checker maker for the forward op; Make() is defined out of line.
class FusedBatchNormAddActOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
// Builds the gradient op description for the forward op.
//
// The generated op is named "<forward op type>_grad". It receives the forward
// inputs X, Z, Scale and Bias, the forward outputs Y, SavedMean,
// SavedVariance and ReserveSpace, and the gradient of Y. It produces the
// gradients of X, Z, Scale and Bias, and carries the forward op's attributes.
template <typename T>
class FusedBatchNormAddActGradOpMaker : public framework::SingleGradOpMaker<T> {
 public:
  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;

 protected:
  void Apply(GradOpPtr<T> op) const override {
    // Hand a forward-op input slot to the grad op under the same name.
    auto pass_fwd_input = [this, &op](const std::string& slot) {
      op->SetInput(slot, this->Input(slot));
    };
    // Hand a forward-op output slot to the grad op as an input.
    auto pass_fwd_output = [this, &op](const std::string& slot) {
      op->SetInput(slot, this->Output(slot));
    };
    // Declare the gradient of a forward-op input as a grad-op output.
    auto emit_input_grad = [this, &op](const std::string& slot) {
      op->SetOutput(framework::GradVarName(slot), this->InputGrad(slot));
    };

    op->SetType(this->ForwardOpType() + "_grad");

    pass_fwd_input("X");
    pass_fwd_input("Z");
    pass_fwd_output("Y");
    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
    pass_fwd_input("Scale");
    pass_fwd_input("Bias");
    pass_fwd_output("SavedMean");
    pass_fwd_output("SavedVariance");
    pass_fwd_output("ReserveSpace");

    op->SetAttrMap(this->Attrs());

    emit_input_grad("X");
    emit_input_grad("Z");
    emit_input_grad("Scale");
    emit_input_grad("Bias");
  }
};
// Variable-type inference rule for the forward op: reports to the
// PassInDtypeAndVarTypeToOutput base class that input slot "X" and output
// slot "Y" share their type.
class FusedBatchNormAddActOpInferVarType
    : public framework::PassInDtypeAndVarTypeToOutput {
 protected:
  std::unordered_map<std::string, std::string>& GetInputOutputWithSameType()
      const override {
    // Function-local static: constructed on the first call, and the same
    // object is returned on every call.
    static std::unordered_map<std::string, std::string> same_type_slots = {
        {"X", /*->*/ "Y"}};
    return same_type_slots;
  }
};
// Primary template of the forward kernel. Compute() is declared but not
// defined here; the implementation is supplied by a per-device
// specialization (e.g. the one for platform::CUDADeviceContext).
template <typename DeviceContext, typename T>
class FusedBatchNormAddActKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
// Primary template of the backward kernel. Compute() is declared but not
// defined here; the implementation is supplied by a per-device
// specialization (e.g. the one for platform::CUDADeviceContext).
template <typename DeviceContext, typename T>
class FusedBatchNormAddActGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
} // namespace operators
} // namespace paddle
...@@ -47,7 +47,9 @@ class GRUUnitKernel : public framework::OpKernel<T> { ...@@ -47,7 +47,9 @@ class GRUUnitKernel : public framework::OpKernel<T> {
else if (act_type == relu) else if (act_type == relu)
ReluFunctor<T>()(d, x, y); ReluFunctor<T>()(d, x, y);
else else
PADDLE_THROW("unsupported activation type"); PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported activation type, only supports identity, sigmoid, tanh "
"and relu."));
} }
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
...@@ -137,7 +139,9 @@ class GRUUnitGradKernel : public framework::OpKernel<T> { ...@@ -137,7 +139,9 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
else if (act_type == relu) else if (act_type == relu)
ReluGradFunctor<T>()(d, x, y, dy, dx); ReluGradFunctor<T>()(d, x, y, dy, dx);
else else
PADDLE_THROW("unsupported activation type"); PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported activation type, only supports identity, sigmoid, tanh "
"and relu."));
} }
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
......
...@@ -104,12 +104,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { ...@@ -104,12 +104,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
auto dim_x = ctx->GetInputDim("X"); auto dim_x = ctx->GetInputDim("X");
auto interp_method = ctx->Attrs().Get<std::string>("interp_method"); auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ("bilinear" == interp_method || "nearest" == interp_method ||
"bilinear" == interp_method || "nearest" == interp_method || "bicubic" == interp_method,
"bicubic" == interp_method, true, platform::errors::InvalidArgument(
"Interpolation method can only be \"bilinear\" or \"nearest\" when " "Interpolation method can only be \"bilinear\" "
"Input(X) dimension is 4, but got method = %s .", "or \"nearest\" or \"bicubic\" when "
interp_method); "Input(X) dimension is 4, but got method is %s.",
interp_method));
const DataLayout data_layout = framework::StringToDataLayout( const DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout")); ctx->Attrs().Get<std::string>("data_layout"));
...@@ -169,13 +170,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { ...@@ -169,13 +170,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
auto out_size_dim = ctx->GetInputDim("OutSize"); auto out_size_dim = ctx->GetInputDim("OutSize");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
out_size_dim.size(), 1, out_size_dim.size(), 1,
platform::errors::InvalidArgument( platform::errors::InvalidArgument("OutSize's dimension size must be 1, "
"OutSize's dimension size must be 1, but got dimension = %d .", "but got dimension size is %d .",
out_size_dim.size())); out_size_dim.size()));
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
out_size_dim[0], 2, out_size_dim[0], 2,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"OutSize's dim[0] must be 2, but got dimention = %d .", "OutSize's dimension[0] must be 2, but got dimension[0] is %d .",
out_size_dim[0])); out_size_dim[0]));
ctx->ShareLoD("X", "Out"); ctx->ShareLoD("X", "Out");
return; return;
...@@ -264,12 +265,15 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { ...@@ -264,12 +265,15 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) {
if (ctx->HasInput("OutSize") && ctx->IsRuntime()) { if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
auto out_size_dim = ctx->GetInputDim("OutSize"); auto out_size_dim = ctx->GetInputDim("OutSize");
PADDLE_ENFORCE_EQ(out_size_dim.size(), 1, PADDLE_ENFORCE_EQ(
"OutSize's dimension size must be 1, but got size =%d .", out_size_dim.size(), 1,
out_size_dim.size()); platform::errors::InvalidArgument(
"OutSize's dimension size must be 1, but got size is %d.",
out_size_dim.size()));
PADDLE_ENFORCE_EQ(out_size_dim[0], 3, PADDLE_ENFORCE_EQ(out_size_dim[0], 3,
"OutSize's dim[0] must be 3, but got size = %d .", platform::errors::InvalidArgument(
out_size_dim[0]); "OutSize's dim[0] must be 3, but got size is %d.",
out_size_dim[0]));
ctx->ShareLoD("X", "Out"); ctx->ShareLoD("X", "Out");
return; return;
} }
...@@ -289,10 +293,8 @@ class InterpolateOp : public framework::OperatorWithKernel { ...@@ -289,10 +293,8 @@ class InterpolateOp : public framework::OperatorWithKernel {
protected: protected:
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Interpolate");
"Input(X) of InterpolateOp should not be null."); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Interpolate");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of InterpolationOp should not be null.");
auto dim_x = ctx->GetInputDim("X"); // NCHW format auto dim_x = ctx->GetInputDim("X"); // NCHW format
PADDLE_ENFORCE( PADDLE_ENFORCE(
...@@ -534,9 +536,10 @@ class InterpolateOpGrad : public framework::OperatorWithKernel { ...@@ -534,9 +536,10 @@ class InterpolateOpGrad : public framework::OperatorWithKernel {
protected: protected:
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InterpolateGrad");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
"Input(Out@GRAD) should not be null"); "Out@GRAD", "InterpolateGrad");
auto dim_x = ctx->GetInputDim("X"); auto dim_x = ctx->GetInputDim("X");
if (ctx->HasOutput(framework::GradVarName("X"))) { if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), dim_x); ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/linspace_op.h" #include "paddle/fluid/operators/linspace_op.h"
#include <string>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -21,7 +22,7 @@ class LinspaceOp : public framework::OperatorWithKernel { ...@@ -21,7 +22,7 @@ class LinspaceOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace"); OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace");
OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace"); OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace");
OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace"); OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace");
...@@ -50,11 +51,17 @@ class LinspaceOp : public framework::OperatorWithKernel { ...@@ -50,11 +51,17 @@ class LinspaceOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::proto::VarType::Type(ctx.Attr<int>("dtype")), framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
ctx.GetPlace()); ctx.GetPlace());
} }
framework::OpKernelType GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const override {
return expected_kernel_type;
}
}; };
class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker { class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker {
......
...@@ -44,8 +44,10 @@ class MergeLoDTensorOp : public framework::OperatorBase { ...@@ -44,8 +44,10 @@ class MergeLoDTensorOp : public framework::OperatorBase {
scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>(); scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
auto level = static_cast<size_t>(Attr<int>("level")); auto level = static_cast<size_t>(Attr<int>("level"));
PADDLE_ENFORCE(in_true.numel() || in_false.numel(), PADDLE_ENFORCE_EQ(
"Input(InTrue) or Input(InFalse) should be initialized."); in_true.numel() || in_false.numel(), true,
platform::errors::InvalidArgument(
"Input(InTrue) or Input(InFalse) should be initialized."));
auto &mask_dim = mask.dims(); auto &mask_dim = mask.dims();
std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()}; std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
...@@ -56,7 +58,9 @@ class MergeLoDTensorOp : public framework::OperatorBase { ...@@ -56,7 +58,9 @@ class MergeLoDTensorOp : public framework::OperatorBase {
framework::TensorCopy(mask, platform::CPUPlace(), dev_ctx, framework::TensorCopy(mask, platform::CPUPlace(), dev_ctx,
cpu_mask.get()); cpu_mask.get());
#else #else
PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option"); PADDLE_THROW(platform::errors::PreconditionNotMet(
"Not supported GPU, Please recompile or reinstall paddle with CUDA "
"support."));
#endif #endif
} }
auto *mask_data = cpu_mask->data<bool>(); auto *mask_data = cpu_mask->data<bool>();
...@@ -109,7 +113,11 @@ class MergeLoDTensorOp : public framework::OperatorBase { ...@@ -109,7 +113,11 @@ class MergeLoDTensorOp : public framework::OperatorBase {
size_t start_offset = lod_and_offset.second.first; size_t start_offset = lod_and_offset.second.first;
size_t end_offset = lod_and_offset.second.second; size_t end_offset = lod_and_offset.second.second;
PADDLE_ENFORCE_GE(end_offset, start_offset); PADDLE_ENFORCE_GE(end_offset, start_offset,
platform::errors::InvalidArgument(
"The end offset less than start offset, end offset "
"is %d, start offset is %d.",
end_offset, start_offset));
size_t len = end_offset - start_offset; size_t len = end_offset - start_offset;
if (len == 0) { if (len == 0) {
continue; continue;
...@@ -189,22 +197,24 @@ class MergeLoDTensorInferShape : public framework::InferShapeBase { ...@@ -189,22 +197,24 @@ class MergeLoDTensorInferShape : public framework::InferShapeBase {
"merge_lod_tensor"); "merge_lod_tensor");
auto mask_dim = context->GetInputDim("Mask"); auto mask_dim = context->GetInputDim("Mask");
PADDLE_ENFORCE_EQ(mask_dim.size(), 2, PADDLE_ENFORCE_EQ(mask_dim.size(), 2,
"If you are using IfElse OP:" platform::errors::InvalidArgument(
"\n\nie = fluid.layers.IfElse(cond=cond)\nwith " "If you are using IfElse OP:"
"ie.true_block():\n out_1 = ie.input(x)\n\n" "\n\nie = fluid.layers.IfElse(cond=cond)\nwith "
"Please ensure that the cond should be a 2-D tensor and " "ie.true_block():\n out_1 = ie.input(x)\n\n"
"the second dim size of cond should be 1. " "Please ensure that the cond is a 2-D tensor and "
"But now the cond's shape is [", "the second dim size of cond is 1. "
*mask_dim.Get(), "].\n"); "But now the cond's shape is [%s].\n",
mask_dim));
if (context->IsRuntime() || mask_dim[1] > 0) { if (context->IsRuntime() || mask_dim[1] > 0) {
PADDLE_ENFORCE_EQ(mask_dim[1], 1, PADDLE_ENFORCE_EQ(mask_dim[1], 1,
"If you are using IfElse OP:" platform::errors::InvalidArgument(
"\n\nie = fluid.layers.IfElse(cond=cond)\nwith " "If you are using IfElse OP:"
"ie.true_block():\n out_1 = ie.input(x)\n\n" "\n\nie = fluid.layers.IfElse(cond=cond)\nwith "
"Please ensure that the cond should be a 2-D tensor " "ie.true_block():\n out_1 = ie.input(x)\n\n"
"and the second dim size of cond should be 1. " "Please ensure that the cond is a 2-D tensor "
"But now the cond's shape is [", "and the second dim size of cond is 1. "
*mask_dim.Get(), "].\n"); "But now the cond's shape is [%s].\n",
mask_dim));
} }
context->SetOutputDim("Out", context->GetInputDim("InTrue")); context->SetOutputDim("Out", context->GetInputDim("InTrue"));
......
...@@ -23,46 +23,54 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { ...@@ -23,46 +23,54 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"), OP_INOUT_CHECK(ctx->HasInput("Param"), "Input", "Param",
"Input(Param) of DecayedAdagradOp should not be null."); "DecayedAdagradOp");
PADDLE_ENFORCE(ctx->HasInput("Grad"), OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "DecayedAdagradOp");
"Input(Grad) of DecayedAdagradOp should not be null."); OP_INOUT_CHECK(ctx->HasInput("Moment"), "Input", "Moment",
PADDLE_ENFORCE(ctx->HasInput("Moment"), "DecayedAdagradOp");
"Input(Moment) of DecayedAdagradOp should not be null."); OP_INOUT_CHECK(ctx->HasInput("LearningRate"), "Input", "LearningRate",
PADDLE_ENFORCE( "DecayedAdagradOp");
ctx->HasInput("LearningRate"), PADDLE_ENFORCE_EQ(
"Input(LearningRate) of DecayedAdagradOp should not be null."); ctx->GetInputsVarType("Param").front(),
PADDLE_ENFORCE( framework::proto::VarType::LOD_TENSOR,
ctx->GetInputsVarType("Param").front() == platform::errors::InvalidArgument(
framework::proto::VarType::LOD_TENSOR, "The input var's type should be LoDTensor, but the received is %s",
"The input var's type should be LoDTensor, but the received is %s", ctx->Inputs("Param").front(),
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); ctx->GetInputsVarType("Param").front()));
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
ctx->GetInputsVarType("Grad").front() == ctx->GetInputsVarType("Grad").front(),
framework::proto::VarType::LOD_TENSOR, framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s", platform::errors::InvalidArgument(
ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); "The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Grad").front(),
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), ctx->GetInputsVarType("Grad").front()));
"Output(ParamOut) of DecayedAdagradOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), OP_INOUT_CHECK(ctx->HasOutput("ParamOut"), "Output", "ParamOut",
"Output(MomentOut) of DecayedAdagradOp should not be null."); "DecayedAdagradOp");
OP_INOUT_CHECK(ctx->HasOutput("MomentOut"), "Output", "MomentOut",
"DecayedAdagradOp");
auto lr_dims = ctx->GetInputDim("LearningRate"); auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
"Maybe the Input variable LearningRate has not " platform::errors::InvalidArgument(
"been initialized. You may need to confirm " "Maybe the Input variable LearningRate has not "
"if you put exe.run(startup_program) " "been initialized. You may need to confirm "
"after optimizer.minimize function."); "if you put exe.run(startup_program) "
"after optimizer.minimize function."));
PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
"LearningRate should have one element"); platform::errors::InvalidArgument(
"LearningRate should have one element"));
auto param_dims = ctx->GetInputDim("Param"); auto param_dims = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Grad"), PADDLE_ENFORCE_EQ(
"Param and Grad input of DecayedAdagradOp should have " param_dims, ctx->GetInputDim("Grad"),
"the same dimension."); platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Moment"), "Param and Grad input of DecayedAdagradOp should have "
"Param and Moment input of DecayedAdagradOp should have " "the same dimension."));
"the same dimension."); PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment"),
platform::errors::InvalidArgument(
"Param and Moment input of DecayedAdagradOp should have "
"the same dimension."));
ctx->SetOutputDim("ParamOut", param_dims); ctx->SetOutputDim("ParamOut", param_dims);
ctx->SetOutputDim("MomentOut", param_dims); ctx->SetOutputDim("MomentOut", param_dims);
......
...@@ -24,17 +24,19 @@ class DecayedAdagradOpKernel : public framework::OpKernel<T> { ...@@ -24,17 +24,19 @@ class DecayedAdagradOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param"); const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(), PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
"The Var(%s)'s type should be LoDTensor, " platform::errors::InvalidArgument(
"but the received is %s", "The Var(%s)'s type should be LoDTensor, "
ctx.InputNames("Param").front(), "but the received is %s",
framework::ToTypeName(param_var->Type())); ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
const auto* grad_var = ctx.InputVar("Grad"); const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(), PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
"The Var(%s)'s type should be LoDTensor, " platform::errors::InvalidArgument(
"but the received is %s", "The Var(%s)'s type should be LoDTensor, "
ctx.InputNames("Grad").front(), "but the received is %s",
framework::ToTypeName(grad_var->Type())); ctx.InputNames("Grad").front(),
framework::ToTypeName(grad_var->Type())));
auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut"); auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut"); auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
......
...@@ -30,7 +30,12 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> { ...@@ -30,7 +30,12 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate"); auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
auto* grad_var = ctx.InputVar("Grad"); auto* grad_var = ctx.InputVar("Grad");
// only support dense for now. // only support dense for now.
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true); PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Grad").front(),
framework::ToTypeName(grad_var->Type())));
auto grad = ctx.Input<framework::LoDTensor>("Grad"); auto grad = ctx.Input<framework::LoDTensor>("Grad");
param_out->mutable_data<T>(ctx.GetPlace()); param_out->mutable_data<T>(ctx.GetPlace());
......
...@@ -60,20 +60,33 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, ...@@ -60,20 +60,33 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(src_stride_numel.size(), dst_stride_numel.size(), PADDLE_ENFORCE_EQ(src_stride_numel.size(), dst_stride_numel.size(),
"src and dst tensor should have the same dims size."); platform::errors::InvalidArgument(
"Source and destination tensor should have the same "
"dimension size, but source tensor dimension size is "
"%u, destination tensor size is %u.",
src_stride_numel.size(), dst_stride_numel.size()));
for (int64_t i = 0; i < axis; ++i) { for (int64_t i = 0; i < axis; ++i) {
if (i < axis) { if (i < axis) {
PADDLE_ENFORCE_EQ(src_stride_numel[i] / src_stride_numel[axis], PADDLE_ENFORCE_EQ(
dst_stride_numel[i] / dst_stride_numel[axis], src_stride_numel[i] / src_stride_numel[axis],
"src and dst should have the same elements " dst_stride_numel[i] / dst_stride_numel[axis],
"except the specified axis."); platform::errors::InvalidArgument(
"Source and destination tensor should have the same number of "
"elements except the specified axis, but the source elements "
"number is %d, destination elements number is %d.",
src_stride_numel[i] / src_stride_numel[axis],
dst_stride_numel[i] / dst_stride_numel[axis]));
} else if (i == axis) { } else if (i == axis) {
continue; continue;
} else { } else {
PADDLE_ENFORCE_EQ(src_stride_numel[i], dst_stride_numel[i], PADDLE_ENFORCE_EQ(
"src and dst should have the same elements " src_stride_numel[i], dst_stride_numel[i],
"except the specified axis."); platform::errors::InvalidArgument(
"Source and destination tensor should have the same number of "
"elements except the specified axis, but the source elements "
"number is %d, destination elements number is %d.",
src_stride_numel[i], dst_stride_numel[i]));
} }
} }
...@@ -90,7 +103,8 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, ...@@ -90,7 +103,8 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
memory::Copy(gpu_place, dst + i * dst_after, gpu_place, memory::Copy(gpu_place, dst + i * dst_after, gpu_place,
src + i * src_after, sizeof(T) * size, cuda_ctx.stream()); src + i * src_after, sizeof(T) * size, cuda_ctx.stream());
#else #else
PADDLE_THROW("Paddle is not compiled with GPU"); PADDLE_THROW(platform::errors::PreconditionNotMet(
"Paddle is not compiled with GPU."));
#endif #endif
} }
} }
......
...@@ -78,21 +78,35 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { ...@@ -78,21 +78,35 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const {
platform::errors::NotFound("Col(Output) of VarConv2dOP is not found.")); platform::errors::NotFound("Col(Output) of VarConv2dOP is not found."));
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, PADDLE_ENFORCE_EQ(
"The rank of X(Input) can't be less than 2."); x_dims.size(), 2,
platform::errors::InvalidArgument(
"The rank of X(Input) can't be less than 2, but received rank is %u.",
x_dims.size()));
auto w_dims = ctx->GetInputDim("W"); auto w_dims = ctx->GetInputDim("W");
PADDLE_ENFORCE_EQ(w_dims.size(), 2, "W should be 2-D tensor"); PADDLE_ENFORCE_EQ(
w_dims.size(), 2,
platform::errors::InvalidArgument(
"Input W should be a 2-D tensor, but its actual dimension is %u.",
w_dims.size()));
int output_channel = ctx->Attrs().Get<int>("OutputChannel"); int output_channel = ctx->Attrs().Get<int>("OutputChannel");
int input_channel = ctx->Attrs().Get<int>("InputChannel"); int input_channel = ctx->Attrs().Get<int>("InputChannel");
int kernel_h = ctx->Attrs().Get<int>("KernelH"); int kernel_h = ctx->Attrs().Get<int>("KernelH");
int kernel_w = ctx->Attrs().Get<int>("KernelW"); int kernel_w = ctx->Attrs().Get<int>("KernelW");
PADDLE_ENFORCE_EQ(w_dims[0], output_channel, PADDLE_ENFORCE_EQ(
"W dim[0] should be equal to OutputChannel"); w_dims[0], output_channel,
platform::errors::InvalidArgument(
"Input W's dimension[0] should be equal to OutputChannel, the "
"dimension[0] is %d, OutputChannel is %d.",
w_dims[0], output_channel));
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
w_dims[1], input_channel * kernel_h * kernel_w, w_dims[1], input_channel * kernel_h * kernel_w,
"W dim[1] should be equal to InputChannel * StrideH * StrideW"); platform::errors::InvalidArgument(
"Input W's dimension[1] should be equal to InputChannel * StrideH * "
"StrideW, the dimension[1] is %d, expected value is %d.",
w_dims[1], input_channel * kernel_h * kernel_w));
if (ctx->IsRuntime()) { if (ctx->IsRuntime()) {
framework::Variable* x_var = framework::Variable* x_var =
...@@ -103,10 +117,14 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { ...@@ -103,10 +117,14 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const {
platform::errors::InvalidArgument("The Input(X) Tensor of VarConv2dOP " platform::errors::InvalidArgument("The Input(X) Tensor of VarConv2dOP "
"does not contain LoD information.")); "does not contain LoD information."));
PADDLE_ENFORCE_GE(x_lod.size(), 1, "The Input(X)'s lod info is corrupted."); PADDLE_ENFORCE_GE(x_lod.size(), 1,
PADDLE_ENFORCE_EQ( platform::errors::InvalidArgument(
x_dims[0], static_cast<int64_t>(x_lod[0].back()), "The Input(X)'s lod info is corrupted."));
"The Input(X)'s lod info mismatches the actual tensor shape."); PADDLE_ENFORCE_EQ(x_dims[0], static_cast<int64_t>(x_lod[0].back()),
platform::errors::InvalidArgument(
"The Input(X)'s lod info mismatches the actual "
"tensor shape, input lod is %s, tensor shape is %s.",
x_lod, x_dims));
framework::Variable* row_var = framework::Variable* row_var =
BOOST_GET(framework::Variable*, ctx->GetInputVarPtrs("ROW")[0]); BOOST_GET(framework::Variable*, ctx->GetInputVarPtrs("ROW")[0]);
......
...@@ -24,7 +24,11 @@ namespace platform { ...@@ -24,7 +24,11 @@ namespace platform {
void CudaProfilerInit(std::string output_file, std::string output_mode, void CudaProfilerInit(std::string output_file, std::string output_mode,
std::string config_file) { std::string config_file) {
PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv"); PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv",
platform::errors::InvalidArgument(
"Unsupported cuda profiler output mode, expect `kvp` or "
"`csv`, but received `%s`.",
output_mode));
cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair; cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode)); cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode));
......
...@@ -30,6 +30,10 @@ CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP); ...@@ -30,6 +30,10 @@ CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP);
#endif #endif
#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8
CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4 #ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4
CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
#endif #endif
...@@ -54,6 +58,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP); ...@@ -54,6 +58,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
#endif #endif
#ifdef CUDNN_DNN_ROUTINE_EACH_R8
CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP);
#endif
bool HasCUDNN() { bool HasCUDNN() {
std::call_once(cudnn_dso_flag, std::call_once(cudnn_dso_flag,
[]() { cudnn_dso_handle = GetCUDNNDsoHandle(); }); []() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
......
...@@ -134,6 +134,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -134,6 +134,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(__macro) \ #define CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(__macro) \
__macro(cudnnGetConvolutionBackwardFilterAlgorithm); \ __macro(cudnnGetConvolutionBackwardFilterAlgorithm); \
__macro(cudnnGetConvolutionForwardAlgorithm); \ __macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnGetConvolutionBackwardDataAlgorithm); \
__macro(cudnnSetRNNDescriptor); __macro(cudnnSetRNNDescriptor);
CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
......
...@@ -649,61 +649,47 @@ void BindImperative(py::module *m_ptr) { ...@@ -649,61 +649,47 @@ void BindImperative(py::module *m_ptr) {
return self.NewVarBase(tensor.place(), false); return self.NewVarBase(tensor.place(), false);
}, },
py::return_value_policy::copy, R"DOC( py::return_value_policy::copy, R"DOC(
**Notes**:
**This API is ONLY available in Dygraph mode**
Returns a new Variable, detached from the current graph. Returns a new Tensor, detached from the current graph.
Returns:
( :ref:`api_guide_Variable_en` | dtype is same as current Variable): The detached Variable.
Returns: The detached Tensor.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid import paddle
from paddle.fluid.dygraph.base import to_variable paddle.disable_static()
from paddle.fluid.dygraph import Linear
import numpy as np
data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32')
with fluid.dygraph.guard():
linear = Linear(32, 64)
data = to_variable(data)
x = linear(data)
y = x.detach()
linear = paddle.nn.Linear(32, 64)
data = paddle.uniform(shape=[30, 10, 32], min=-1, max=1)
x = linear(data)
y = x.detach()
)DOC") )DOC")
.def("clear_gradient", &imperative::VarBase::ClearGradient, R"DOC( .def("clear_gradient", &imperative::VarBase::ClearGradient, R"DOC(
**Notes**: Only for Tensor that has gradient, normally we use this for Parameters since other temporary Tensor doesn't have gradient.
**1. This API is ONLY available in Dygraph mode**
**2. Use it only Variable has gradient, normally we use this for Parameters since other temporal Variable will be deleted by Python's GC**
Clear (set to ``0`` ) the Gradient of Current Variable The Gradient of current Tensor will be set to ``0`` .
Returns: None Returns: None
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid import paddle
import numpy as np paddle.disable_static()
x = np.ones([2, 2], np.float32) inputs = []
with fluid.dygraph.guard(): for _ in range(10):
inputs2 = [] tmp = paddle.ones([2, 2])
for _ in range(10): tmp.stop_gradient=False
tmp = fluid.dygraph.base.to_variable(x) inputs.append(tmp)
tmp.stop_gradient=False ret = paddle.sums(inputs)
inputs2.append(tmp) loss = paddle.reduce_sum(ret)
ret2 = fluid.layers.sums(inputs2) loss.backward()
loss2 = fluid.layers.reduce_sum(ret2) print("Before clear_gradient {}".format(loss.grad))
loss2.backward() loss.clear_gradient()
print(loss2.gradient()) print("After clear_gradient {}".format(loss.grad))
loss2.clear_gradient()
print("After clear {}".format(loss2.gradient()))
)DOC") )DOC")
.def("_run_backward", .def("_run_backward",
[](imperative::VarBase &self, const imperative::Tracer &tracer, [](imperative::VarBase &self, const imperative::Tracer &tracer,
......
...@@ -26,7 +26,7 @@ function(train_test TARGET_NAME) ...@@ -26,7 +26,7 @@ function(train_test TARGET_NAME)
ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/) ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/)
endif() endif()
set_tests_properties(test_train_${TARGET_NAME}${arg} set_tests_properties(test_train_${TARGET_NAME}${arg}
PROPERTIES DEPENDS test_${TARGET_NAME}) PROPERTIES FIXTURES_REQUIRED test_${TARGET_NAME}_infer_model)
if(NOT WIN32 AND NOT APPLE) if(NOT WIN32 AND NOT APPLE)
set_tests_properties(test_train_${TARGET_NAME}${arg} set_tests_properties(test_train_${TARGET_NAME}${arg}
PROPERTIES TIMEOUT 150) PROPERTIES TIMEOUT 150)
......
...@@ -29,7 +29,9 @@ namespace train { ...@@ -29,7 +29,9 @@ namespace train {
void ReadBinaryFile(const std::string& filename, std::string* contents) { void ReadBinaryFile(const std::string& filename, std::string* contents) {
std::ifstream fin(filename, std::ios::in | std::ios::binary); std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename); PADDLE_ENFORCE_EQ(
fin.is_open(), true,
platform::errors::Unavailable("Failed to open file %s.", filename));
fin.seekg(0, std::ios::end); fin.seekg(0, std::ios::end);
contents->clear(); contents->clear();
contents->resize(fin.tellg()); contents->resize(fin.tellg());
...@@ -70,7 +72,8 @@ int main() { ...@@ -70,7 +72,8 @@ int main() {
} }
} }
PADDLE_ENFORCE_NE(loss_name, "", "loss not found"); PADDLE_ENFORCE_NE(loss_name, "",
platform::errors::NotFound("Loss name is not found."));
// init all parameters // init all parameters
executor.Run(*startup_program, &scope, 0); executor.Run(*startup_program, &scope, 0);
......
...@@ -45,7 +45,9 @@ namespace train { ...@@ -45,7 +45,9 @@ namespace train {
void ReadBinaryFile(const std::string& filename, std::string* contents) { void ReadBinaryFile(const std::string& filename, std::string* contents) {
std::ifstream fin(filename, std::ios::in | std::ios::binary); std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename); PADDLE_ENFORCE_EQ(
fin.is_open(), true,
platform::errors::Unavailable("Failed to open file %s.", filename));
fin.seekg(0, std::ios::end); fin.seekg(0, std::ios::end);
contents->clear(); contents->clear();
contents->resize(fin.tellg()); contents->resize(fin.tellg());
...@@ -98,7 +100,11 @@ int main(int argc, char* argv[]) { ...@@ -98,7 +100,11 @@ int main(int argc, char* argv[]) {
file_vec.push_back(filename); file_vec.push_back(filename);
} }
} }
PADDLE_ENFORCE_GE(file_vec.size(), 1, "At least one file to train"); PADDLE_ENFORCE_GE(
file_vec.size(), 1,
platform::errors::InvalidArgument(
"At least one file to train, but received number of file is %d.",
file_vec.size()));
paddle::framework::InitDevices(false); paddle::framework::InitDevices(false);
const auto cpu_place = paddle::platform::CPUPlace(); const auto cpu_place = paddle::platform::CPUPlace();
paddle::framework::Executor executor(cpu_place); paddle::framework::Executor executor(cpu_place);
...@@ -148,7 +154,9 @@ int main(int argc, char* argv[]) { ...@@ -148,7 +154,9 @@ int main(int argc, char* argv[]) {
const std::vector<paddle::framework::DataFeed*> readers = const std::vector<paddle::framework::DataFeed*> readers =
dataset_ptr->GetReaders(); dataset_ptr->GetReaders();
PADDLE_ENFORCE_EQ(readers.size(), 1, PADDLE_ENFORCE_EQ(readers.size(), 1,
"readers num should be equal to thread num"); platform::errors::InvalidArgument(
"Readers num(%d) should be equal to thread num(1).",
readers.size()));
readers[0]->SetPlace(paddle::platform::CPUPlace()); readers[0]->SetPlace(paddle::platform::CPUPlace());
const std::vector<std::string>& input_feed_names = const std::vector<std::string>& input_feed_names =
readers[0]->GetUseSlotAlias(); readers[0]->GetUseSlotAlias();
......
...@@ -51,7 +51,8 @@ void Train() { ...@@ -51,7 +51,8 @@ void Train() {
} }
} }
PADDLE_ENFORCE_NE(loss_name, "", "loss not found"); PADDLE_ENFORCE_NE(loss_name, "",
platform::errors::NotFound("Loss name is not found."));
// prepare data // prepare data
auto x_var = scope.Var("img"); auto x_var = scope.Var("img");
......
...@@ -26,6 +26,7 @@ wmic process where name="op_function_generator.exe" call terminate 2>NUL ...@@ -26,6 +26,7 @@ wmic process where name="op_function_generator.exe" call terminate 2>NUL
rem ------initialize common variable------ rem ------initialize common variable------
if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0" if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0"
if not defined BRANCH set BRANCH=develop if not defined BRANCH set BRANCH=develop
if not defined TENSORRT_ROOT set TENSORRT_ROOT="C:/TensorRT-5.1.5.0"
if not defined WITH_MKL set WITH_MKL=ON if not defined WITH_MKL set WITH_MKL=ON
if not defined WITH_GPU set WITH_GPU=OFF if not defined WITH_GPU set WITH_GPU=OFF
if not defined WITH_AVX set WITH_AVX=ON if not defined WITH_AVX set WITH_AVX=ON
...@@ -33,9 +34,11 @@ if not defined WITH_TESTING set WITH_TESTING=ON ...@@ -33,9 +34,11 @@ if not defined WITH_TESTING set WITH_TESTING=ON
if not defined WITH_PYTHON set WITH_PYTHON=ON if not defined WITH_PYTHON set WITH_PYTHON=ON
if not defined ON_INFER set ON_INFER=ON if not defined ON_INFER set ON_INFER=ON
if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON
if not defined WITH_STATIC_LIB set WITH_STATIC_LIB=ON
if not defined WITH_CACHE set WITH_CACHE=ON if not defined WITH_CACHE set WITH_CACHE=ON
if not defined WITH_TPCACHE set WITH_TPCACHE=ON if not defined WITH_TPCACHE set WITH_TPCACHE=ON
rem -------set cache build work directory----------- rem -------set cache build work directory-----------
if "%WITH_CACHE%"=="OFF" ( if "%WITH_CACHE%"=="OFF" (
rmdir build /s/q rmdir build /s/q
...@@ -99,6 +102,7 @@ set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 ...@@ -99,6 +102,7 @@ set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
:: set maximum cache size to 20G :: set maximum cache size to 20G
clcache.exe -M 21474836480 clcache.exe -M 21474836480
rem ------set cache third_party------ rem ------set cache third_party------
set cache_dir=%work_dir:Paddle=cache% set cache_dir=%work_dir:Paddle=cache%
dir %cache_dir% dir %cache_dir%
...@@ -138,6 +142,7 @@ exit /b 1 ...@@ -138,6 +142,7 @@ exit /b 1
:CASE_wincheck_mkl :CASE_wincheck_mkl
set WITH_MKL=ON set WITH_MKL=ON
set WITH_GPU=OFF set WITH_GPU=OFF
set MSVC_STATIC_CRT=ON
call :cmake || goto cmake_error call :cmake || goto cmake_error
call :build || goto build_error call :build || goto build_error
call :test_whl_pacakage || goto test_whl_pacakage_error call :test_whl_pacakage || goto test_whl_pacakage_error
...@@ -149,11 +154,13 @@ goto:success ...@@ -149,11 +154,13 @@ goto:success
:CASE_wincheck_openblas :CASE_wincheck_openblas
set WITH_MKL=OFF set WITH_MKL=OFF
set WITH_GPU=ON set WITH_GPU=ON
set MSVC_STATIC_CRT=OFF
rem Temporarily turn off WITH_INFERENCE_API_TEST on GPU due to compile hang rem Temporarily turn off WITH_INFERENCE_API_TEST on GPU due to compile hang
set WITH_INFERENCE_API_TEST=OFF set WITH_INFERENCE_API_TEST=OFF
call :cmake || goto cmake_error call :cmake || goto cmake_error
call :build || goto build_error call :build || goto build_error
call :test_whl_pacakage || goto test_whl_pacakage_error call :test_whl_pacakage || goto test_whl_pacakage_error
:: call :test_inference || goto test_inference_error
goto:success goto:success
rem "Other configurations are added here" rem "Other configurations are added here"
...@@ -172,12 +179,14 @@ set start=%start:~4,10% ...@@ -172,12 +179,14 @@ set start=%start:~4,10%
echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^
-DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^
-DTENSORRT_ROOT=%TENSORRT_ROOT% -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT%
cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^
-DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^
-DTENSORRT_ROOT=%TENSORRT_ROOT% -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT%
goto:eof goto:eof
:cmake_error :cmake_error
...@@ -282,7 +291,9 @@ dir %THIRD_PARTY_PATH:/=\%\install\mklml\lib ...@@ -282,7 +291,9 @@ dir %THIRD_PARTY_PATH:/=\%\install\mklml\lib
dir %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin dir %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin
dir %THIRD_PARTY_PATH:/=\%\install\warpctc\bin dir %THIRD_PARTY_PATH:/=\%\install\warpctc\bin
set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;%THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH% set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;^
%THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;^
%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH%
ctest.exe --output-on-failure -C Release -j 8 --repeat until-pass:4 after-timeout:4 ctest.exe --output-on-failure -C Release -j 8 --repeat until-pass:4 after-timeout:4
goto:eof goto:eof
...@@ -305,7 +316,7 @@ set end=%end:~4,10% ...@@ -305,7 +316,7 @@ set end=%end:~4,10%
call :timestamp "%start%" "%end%" "TestCases Total" call :timestamp "%start%" "%end%" "TestCases Total"
cd %work_dir%\paddle\fluid\inference\api\demo_ci cd %work_dir%\paddle\fluid\inference\api\demo_ci
%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT%/include %TENSORRT_ROOT%/lib %MSVC_STATIC_CRT%
goto:eof goto:eof
:test_inference_error :test_inference_error
......
...@@ -605,7 +605,8 @@ class PaddleCloudRoleMaker(RoleMakerBase): ...@@ -605,7 +605,8 @@ class PaddleCloudRoleMaker(RoleMakerBase):
""" """
if not self._role_is_generated: if not self._role_is_generated:
self._generate_role() self._generate_role()
return len(self._get_pserver_endpoints()) return len(self._get_pserver_endpoints(
)) if self._get_pserver_endpoints() is not None else 0
def _node_num(self): def _node_num(self):
""" """
......
...@@ -220,12 +220,12 @@ class ParameterServerRuntime(RuntimeBase): ...@@ -220,12 +220,12 @@ class ParameterServerRuntime(RuntimeBase):
else: else:
model_dirname = None model_dirname = None
if self.role_maker._is_heter_worker():
self._init_worker()
executor = self._get_executor() executor = self._get_executor()
executor.run(fluid.default_startup_program()) executor.run(fluid.default_startup_program())
if self.role_maker._is_heter_worker():
self._init_worker()
if self.role_maker._is_heter_worker(): if self.role_maker._is_heter_worker():
return return
......
...@@ -45,6 +45,7 @@ from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer ...@@ -45,6 +45,7 @@ from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.entry_attr import ProbabilityEntry, CountFilterEntry from paddle.fluid.entry_attr import ProbabilityEntry, CountFilterEntry
from paddle.fluid.framework import Variable, convert_np_dtype_to_dtype_ from paddle.fluid.framework import Variable, convert_np_dtype_to_dtype_
...@@ -57,7 +58,7 @@ __all__ = [ ...@@ -57,7 +58,7 @@ __all__ = [
'multiclass_nms2', 'search_pyramid_hash', 'shuffle_batch', 'partial_concat', 'multiclass_nms2', 'search_pyramid_hash', 'shuffle_batch', 'partial_concat',
'sparse_embedding', 'partial_sum', 'tdm_child', 'rank_attention', 'sparse_embedding', 'partial_sum', 'tdm_child', 'rank_attention',
'tdm_sampler', 'batch_fc', '_pull_box_extended_sparse', 'bilateral_slice', 'tdm_sampler', 'batch_fc', '_pull_box_extended_sparse', 'bilateral_slice',
'correlation' 'correlation', 'fused_bn_add_act'
] ]
...@@ -1625,3 +1626,191 @@ def correlation(x, ...@@ -1625,3 +1626,191 @@ def correlation(x,
}, },
outputs={"Output": output}) outputs={"Output": output})
return output return output
def fused_bn_add_act(x,
y,
momentum=0.9,
epsilon=1e-05,
param_attr=None,
bias_attr=None,
moving_mean_name=None,
moving_variance_name=None,
act=None,
name=None):
"""
This Op performs batch norm on input x, and adds the result to input y. Then
it performs activation on the sum. The data format of inputs must be NHWC
`[batch, in_height, in_width, in_channels]`.
Args:
x(Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type
is float16.
y(Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type
is float16.
momentum(float|Tensor, optional): The value used for the moving_mean and
moving_var computation. This should be a float number or a tensor with
shape [1] and data type as float32. The updated formula is:
:math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
:math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
Default is 0.9.
epsilon(float, optional): A value added to the denominator for
numerical stability. Default is 1e-5.
param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale`
of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as param_attr, the name of scale can be set in ParamAttr.
If the Initializer of the param_attr is not set, the parameter is initialized
with Xavier. Default: None.
bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm.
If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
If the Initializer of the bias_attr is not set, the bias is initialized zero.
Default: None.
moving_mean_name(str, optional): The name of moving_mean which store the global Mean. If it
is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm
will save global mean with the string.
moving_variance_name(str, optional): The name of the moving_variance which store the global Variance.
If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm
will save global variance with the string.
act(string, optional): Activation type, linear|relu|prelu|...
name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
Usually there is no need to set name; it is None by default.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
def build_program(main_program, startup_program):
with fluid.program_guard(main_program, startup_program):
x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
y = fluid.layers.data(name="y", shape=[1], dtype='int64')
conv1_1 = fluid.layers.conv2d(
input=x,
filter_size=3,
num_filters=32,
stride=1,
padding=1,
act=None,
bias_attr=False,
data_format='NHWC')
conv1_2 = fluid.layers.conv2d(
input=x,
filter_size=3,
num_filters=32,
stride=1,
padding=1,
act=None,
bias_attr=False,
data_format='NHWC')
bn = fluid.layers.batch_norm(
input=conv1_1,
act=None,
data_layout='NHWC')
fused_bn_add_act = fluid.contrib.layers.fused_bn_add_act(conv1_2, bn)
prediction = fluid.layers.fc(input=fused_bn_add_act, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=y)
loss = fluid.layers.mean(loss)
sgd = fluid.optimizer.SGD(learning_rate=0.001)
sgd = fluid.contrib.mixed_precision.decorate(
sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0)
sgd.minimize(loss)
return x, y, loss
iters = 5
batch_size = 16
support_gpu = fluid.is_compiled_with_cuda()
if support_gpu:
main_program = fluid.Program()
startup_program = fluid.Program()
place = fluid.CUDAPlace(0)
x, y, loss = build_program(main_program, startup_program)
feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=batch_size)
exe = fluid.Executor(place)
scope = fluid.Scope()
with fluid.scope_guard(scope):
exe.run(startup_program)
for _ in range(iters):
data = next(train_reader())
loss_v = exe.run(main_program, feed=feeder.feed(data), fetch_list=[loss])
"""
helper = LayerHelper('fused_bn_add_act', **locals())
check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
'fused_bn_add_act')
check_variable_and_dtype(y, 'input', ['float16', 'float32', 'float64'],
'fused_bn_add_act')
bn_param_dtype = core.VarDesc.VarType.FP32
x_shape = x.shape
channel_num = x_shape[-1]
param_shape = [channel_num]
# create parameter
scale = helper.create_parameter(
attr=helper.param_attr,
shape=param_shape,
dtype=bn_param_dtype,
default_initializer=Constant(1.0))
bias = helper.create_parameter(
attr=helper.bias_attr,
shape=param_shape,
dtype=bn_param_dtype,
is_bias=True)
mean = helper.create_parameter(
attr=ParamAttr(
name=moving_mean_name, initializer=Constant(0.0), trainable=False),
shape=param_shape,
dtype=bn_param_dtype)
mean.stop_gradient = True
variance = helper.create_parameter(
attr=ParamAttr(
name=moving_variance_name,
initializer=Constant(1.0),
trainable=False),
shape=param_shape,
dtype=bn_param_dtype)
variance.stop_gradient = True
# create output
# mean and mean_out share the same memory
mean_out = mean
# variance and variance out share the same memory
variance_out = variance
saved_mean = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True)
saved_variance = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True)
reserve_space = helper.create_variable_for_type_inference(
dtype=core.VarDesc.VarType.FP16, stop_gradient=True)
batch_norm_out = helper.create_variable_for_type_inference(
core.VarDesc.VarType.FP16)
inputs = {
"X": x,
"Z": y,
"Scale": scale,
"Bias": bias,
}
attrs = {"epsilon": epsilon, 'momentum': momentum}
outputs = {
"Y": batch_norm_out,
"MeanOut": mean_out,
"VarianceOut": variance_out,
"SavedMean": saved_mean,
"SavedVariance": saved_variance,
"ReserveSpace": reserve_space
}
helper.append_op(
type="fused_bn_add_activation",
inputs=inputs,
outputs=outputs,
attrs=attrs)
return batch_norm_out
...@@ -135,6 +135,7 @@ gray_list = { ...@@ -135,6 +135,7 @@ gray_list = {
'get_tensor_from_selected_rows', 'get_tensor_from_selected_rows',
'sign', 'sign',
'cast', 'cast',
'fused_bn_add_activation',
} }
''' '''
# The set of ops that don't support fp16 calculation # The set of ops that don't support fp16 calculation
......
...@@ -69,8 +69,10 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): ...@@ -69,8 +69,10 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
] ]
for in_name in op.input_names: for in_name in op.input_names:
if src_dtype == core.VarDesc.VarType.FP32 and op.type == 'batch_norm': if src_dtype == core.VarDesc.VarType.FP32 and op.type in [
if in_name != 'X': 'batch_norm', 'fused_bn_add_activation'
]:
if in_name not in {'X', 'Z'}:
continue continue
for in_var_name in op.input(in_name): for in_var_name in op.input(in_name):
in_var = block.var(in_var_name) in_var = block.var(in_var_name)
...@@ -102,7 +104,8 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): ...@@ -102,7 +104,8 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
op._set_attr('in_dtype', dest_dtype) op._set_attr('in_dtype', dest_dtype)
if src_dtype == core.VarDesc.VarType.FP32 and dest_dtype == core.VarDesc.VarType.FP16: if src_dtype == core.VarDesc.VarType.FP32 and dest_dtype == core.VarDesc.VarType.FP16:
for out_name in op.output_names: for out_name in op.output_names:
if op.type == 'batch_norm' and out_name != 'Y': if op.type in ['batch_norm', 'fused_bn_add_activation'
] and out_name != 'Y':
continue continue
for out_var_name in op.output(out_name): for out_var_name in op.output(out_name):
out_var = block.var(out_var_name) out_var = block.var(out_var_name)
......
...@@ -17,8 +17,7 @@ from __future__ import print_function ...@@ -17,8 +17,7 @@ from __future__ import print_function
from .. import core from .. import core
from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator
from ..layers.layer_function_generator import OpProtoHolder from ..layers.layer_function_generator import OpProtoHolder
from ..layers import common_methods from . import no_grad
from . import to_variable, no_grad
import numpy as np import numpy as np
import six import six
...@@ -53,47 +52,25 @@ def monkey_patch_math_varbase(): ...@@ -53,47 +52,25 @@ def monkey_patch_math_varbase():
def astype(self, dtype): def astype(self, dtype):
""" """
**Notes**:
**The variable must be a** :ref:`api_fluid_Tensor`
Cast a variable to a specified data type. Cast a Tensor to a specified data type.
Args: Args:
dtype: The target data type.
self(Variable): The source variable
dtype: The target data type
Returns: Returns:
Variable: Variable with new dtype Tensor: a new Tensor with target dtype
Examples: Examples:
In Static Graph Mode:
.. code-block:: python
import paddle.fluid as fluid
startup_prog = fluid.Program()
main_prog = fluid.Program()
with fluid.program_guard(startup_prog, main_prog):
original_variable = fluid.data(name = "new_variable", shape=[2,2], dtype='float32')
new_variable = original_variable.astype('int64')
print("new var's dtype is: {}".format(new_variable.dtype))
In Dygraph Mode:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid import paddle
import numpy as np import numpy as np
x = np.ones([2, 2], np.float32) original_tensor = paddle.ones([2, 2])
with fluid.dygraph.guard(): print("original tensor's dtype is: {}".format(original_tensor.dtype))
original_variable = fluid.dygraph.to_variable(x) new_tensor = original_tensor.astype('float32')
print("original var's dtype is: {}, numpy dtype is {}".format(original_variable.dtype, original_variable.numpy().dtype)) print("new tensor's dtype is: {}".format(new_tensor.dtype))
new_variable = original_variable.astype('int64')
print("new var's dtype is: {}, numpy dtype is {}".format(new_variable.dtype, new_variable.numpy().dtype))
""" """
if not isinstance(dtype, core.VarDesc.VarType): if not isinstance(dtype, core.VarDesc.VarType):
...@@ -147,6 +124,10 @@ def monkey_patch_math_varbase(): ...@@ -147,6 +124,10 @@ def monkey_patch_math_varbase():
def _ndim_(var): def _ndim_(var):
return len(var.shape) return len(var.shape)
@property
def _size_(var):
return np.prod(var.shape)
def _scalar_add_(var, value): def _scalar_add_(var, value):
return _scalar_elementwise_op_(var, 1.0, value) return _scalar_elementwise_op_(var, 1.0, value)
...@@ -208,7 +189,6 @@ def monkey_patch_math_varbase(): ...@@ -208,7 +189,6 @@ def monkey_patch_math_varbase():
__impl__.__doc__ = """ __impl__.__doc__ = """
{0} {0}
Args: Args:
self(Tensor): left hand Tensor
other_var(Tensor|float|int): right hand Tensor other_var(Tensor|float|int): right hand Tensor
Returns: Returns:
...@@ -217,23 +197,7 @@ def monkey_patch_math_varbase(): ...@@ -217,23 +197,7 @@ def monkey_patch_math_varbase():
__impl__.__name__ = method_name __impl__.__name__ = method_name
return __impl__ return __impl__
# Todo(zhouwei): implement dygraph template to adapt to any function, receive('op_type', 'arg_template')
# Such as _method_creator_('addmm', 'x, y, alpha=1.0, beta=1.0, name=None'). It can reduce call time.
def _method_creator_(op_type, arg_template=None):
def __impl__(self):
op = getattr(core.ops, op_type)
return op(self)
__impl__.__doc__ = """
See paddle.{}""".format(op_type)
__impl__.__name__ = op_type
return __impl__
varbase_methods = [ varbase_methods = [
# Type1: From custom fun or lambda
## b=-a
('__neg__', _neg_), ('__neg__', _neg_),
('__float__', _float_), ('__float__', _float_),
('__long__', _long_), ('__long__', _long_),
...@@ -244,8 +208,7 @@ def monkey_patch_math_varbase(): ...@@ -244,8 +208,7 @@ def monkey_patch_math_varbase():
('dim', lambda x: len(x.shape)), ('dim', lambda x: len(x.shape)),
('ndimension', lambda x: len(x.shape)), ('ndimension', lambda x: len(x.shape)),
('ndim', _ndim_), ('ndim', _ndim_),
('size', lambda x: x.shape), ('size', _size_),
# Type2: From Template that create core.ops automatically. It's recommended.
('__add__', ('__add__',
_binary_creator_('__add__', 'elementwise_add', False, _scalar_add_)), _binary_creator_('__add__', 'elementwise_add', False, _scalar_add_)),
## a+b == b+a. Do not need to reverse explicitly ## a+b == b+a. Do not need to reverse explicitly
...@@ -283,31 +246,7 @@ def monkey_patch_math_varbase(): ...@@ -283,31 +246,7 @@ def monkey_patch_math_varbase():
('__le__', _binary_creator_('__le__', 'less_equal', False, None)), ('__le__', _binary_creator_('__le__', 'less_equal', False, None)),
('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)), ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)),
('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)), ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)),
('__array_ufunc__', None), ('__array_ufunc__', None)
('sigmoid', _method_creator_('sigmoid', 'name=None')),
('log_sigmoid', _method_creator_('logsigmoid', 'name=None')),
('exp', _method_creator_('exp', 'name=None')),
('tanh', _method_creator_('tanh', 'name=None')),
('atan', _method_creator_('atan', 'name=None')),
('tanh_shrink', _method_creator_('tanh_shrink', 'name=None')),
('sqrt', _method_creator_('sqrt', 'name=None')),
('rsqrt', _method_creator_('rsqrt', 'name=None')),
('abs', _method_creator_('abs', 'name=None')),
('ceil', _method_creator_('ceil', 'name=None')),
('floor', _method_creator_('floor', 'name=None')),
('cos', _method_creator_('cos', 'name=None')),
('acos', _method_creator_('acos', 'name=None')),
('asin', _method_creator_('asin', 'name=None')),
('sin', _method_creator_('sin', 'name=None')),
('sinh', _method_creator_('sinh', 'name=None')),
('cosh', _method_creator_('cosh', 'name=None')),
('round', _method_creator_('round', 'name=None')),
('reciprocal', _method_creator_('reciprocal', 'name=None')),
('square', _method_creator_('square', 'name=None')),
('softplus', _method_creator_('softplus', 'name=None')),
('softsign', _method_creator_('softsign', 'name=None')),
# Type3: Form module 'paddle.tensor' defaultly.
# It's not a goodway, because it will increase call time.
] ]
global _already_patch_varbase global _already_patch_varbase
...@@ -318,7 +257,15 @@ def monkey_patch_math_varbase(): ...@@ -318,7 +257,15 @@ def monkey_patch_math_varbase():
setattr(core.VarBase, method_name, method_impl) setattr(core.VarBase, method_name, method_impl)
else: else:
import paddle.tensor import paddle.tensor
for method_name in common_methods: # Tensor method from module paddle.tensor
tensor_methods = paddle.tensor.linalg.__all__ + \
paddle.tensor.math.__all__ + \
paddle.tensor.logic.__all__ + \
paddle.tensor.manipulation.__all__ + \
paddle.tensor.search.__all__ + \
paddle.tensor.stat.__all__ + \
paddle.tensor.attribute.__all__
for method_name in tensor_methods:
if hasattr(core.VarBase, method_name): continue if hasattr(core.VarBase, method_name): continue
method_impl = getattr(paddle.tensor, method_name, None) method_impl = getattr(paddle.tensor, method_name, None)
if method_impl: setattr(core.VarBase, method_name, method_impl) if method_impl: setattr(core.VarBase, method_name, method_impl)
......
...@@ -191,12 +191,14 @@ class FleetTranspiler(Fleet): ...@@ -191,12 +191,14 @@ class FleetTranspiler(Fleet):
self._communicator = Communicator( self._communicator = Communicator(
trainer_config.mode, kwargs, trainer_config.mode, kwargs,
trainer_config.get_communicator_flags()) trainer_config.get_communicator_flags())
self._communicator.init_with_ctx(send_ctx, recv_ctx) self._communicator.init_with_ctx(send_ctx, recv_ctx)
if not self._communicator.is_running(): if not self._communicator.is_running():
self._communicator.start() self._communicator.start()
else: else:
warnings.warn("communicator has been initialized, skip") raise ValueError(
"Communicator can only be inited once, please check")
def init_worker(self): def init_worker(self):
""" """
......
...@@ -624,6 +624,7 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False): ...@@ -624,6 +624,7 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False):
value_dims = [] value_dims = []
grad = None grad = None
opt_idx = -1 opt_idx = -1
fuse = False
for op in block.ops: for op in block.ops:
opt_idx += 1 opt_idx += 1
...@@ -631,6 +632,9 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False): ...@@ -631,6 +632,9 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False):
if op.type not in opt_value_map.keys(): if op.type not in opt_value_map.keys():
continue continue
if op.type in ["sgd", "adam"]:
fuse = True
grad = main_program.global_block().vars[op.input("Grad")[0]] grad = main_program.global_block().vars[op.input("Grad")[0]]
for value in opt_value_map[op.type]: for value in opt_value_map[op.type]:
...@@ -644,7 +648,67 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False): ...@@ -644,7 +648,67 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False):
if value_names: if value_names:
break break
return grad, opt_idx, value_names, value_dims, acture_names return grad, opt_idx, value_names, value_dims, acture_names, fuse
def add_fuse_large_scale_op(block, global_block, table_name, value_names,
                            acture_names, grad, is_entry, opt_idx):
    """Insert the fused sparse-table op for the optimizer op at ``opt_idx``.

    The op found at ``block.ops[opt_idx]`` selects which fused op is inserted
    (via ``block._insert_op`` at the same index):

    * ``"sgd"``  -> ``lookup_sparse_table_fuse_sgd``
    * ``"adam"`` -> ``lookup_sparse_table_fuse_adam``

    Input/output variables are looked up by name in
    ``main_program.global_block().vars``; ``main_program`` is taken from the
    enclosing scope, not from the arguments.

    Args:
        block: Block that holds the optimizer op and receives the fused op.
        global_block: Not used by this function.
        table_name: Stored in the fused op's ``tablename`` attribute.
        value_names: Stored in the fused op's ``value_names`` attribute.
        acture_names: Not used by this function.
        grad: Ignored; the gradient variable is re-read from the optimizer
            op's ``Grad`` input.
        is_entry: Stored in the fused op's ``is_entry`` attribute.
        opt_idx: Index of the optimizer op in ``block.ops`` and the position
            at which the fused op is inserted.

    Raises:
        ValueError: If the optimizer op is neither ``sgd`` nor ``adam``.
    """
    optimizer_op = block.ops[opt_idx]
    optimizer_kind = optimizer_op.type

    # Reject unsupported optimizers up front; nothing is looked up or
    # inserted for them.
    if optimizer_kind != "sgd" and optimizer_kind != "adam":
        raise ValueError("only support sgd/adam optimizer now")

    def _main_var(var_name):
        # Resolve a variable by name in the main program's global block.
        return main_program.global_block().vars[var_name]

    # Both fused ops consume the gradient and the learning rate.
    grad_var = _main_var(optimizer_op.input("Grad")[0])
    lr_var = _main_var(optimizer_op.input("LearningRate")[0])

    if optimizer_kind == "sgd":
        sgd_inputs = {"Grad": grad_var, "LearningRate": lr_var}
        sgd_attrs = {
            "is_entry": is_entry,
            "tablename": table_name,
            "value_names": value_names
        }
        block._insert_op(
            opt_idx,
            type="lookup_sparse_table_fuse_sgd",
            inputs=sgd_inputs,
            attrs=sgd_attrs)
        return

    # adam: additionally wire the beta power accumulators and copy the
    # hyper-parameters from the original optimizer op.
    beta1_pow_in = _main_var(optimizer_op.input("Beta1Pow")[0])
    beta2_pow_in = _main_var(optimizer_op.input("Beta2Pow")[0])
    beta1_pow_out = _main_var(optimizer_op.output("Beta1PowOut")[0])
    beta2_pow_out = _main_var(optimizer_op.output("Beta2PowOut")[0])

    adam_beta1 = optimizer_op.attr('beta1')
    adam_beta2 = optimizer_op.attr('beta2')
    adam_epsilon = optimizer_op.attr('epsilon')

    adam_inputs = {
        "Grad": grad_var,
        "LearningRate": lr_var,
        "Beta1Pow": beta1_pow_in,
        "Beta2Pow": beta2_pow_in
    }
    adam_outputs = {
        "Beta1PowOut": beta1_pow_out,
        "Beta2PowOut": beta2_pow_out
    }
    adam_attrs = {
        "beta1": adam_beta1,
        "beta2": adam_beta2,
        "epsilon": adam_epsilon,
        "is_entry": is_entry,
        "tablename": table_name,
        "value_names": value_names
    }
    block._insert_op(
        opt_idx,
        type="lookup_sparse_table_fuse_adam",
        inputs=adam_inputs,
        outputs=adam_outputs,
        attrs=adam_attrs)
def add_large_scale_op(block, global_block, table_name, value_names, def add_large_scale_op(block, global_block, table_name, value_names,
acture_names, grad, is_entry, opt_idx): acture_names, grad, is_entry, opt_idx):
...@@ -711,24 +775,35 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False): ...@@ -711,24 +775,35 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False):
for param, blockid in param_blockid_map.items(): for param, blockid in param_blockid_map.items():
opt_block = program.block(blockid) opt_block = program.block(blockid)
grad, opt_idx, value_names, value_dims, acture_names = \ grad, opt_idx, value_names, value_dims, acture_names, fuse = \
get_optimizer_values(opt_block) get_optimizer_values(opt_block)
entry_attr = get_entry_attr(param) entry_attr = get_entry_attr(param)
is_entry = False if entry_attr == "none" else True is_entry = False if entry_attr == "none" else True
add_large_scale_op(opt_block,
program.global_block(), param, value_names,
acture_names, grad, is_entry, opt_idx)
if fuse:
add_fuse_large_scale_op(opt_block,
program.global_block(), param,
value_names, acture_names, grad,
is_entry, opt_idx)
else:
add_large_scale_op(opt_block,
program.global_block(), param, value_names,
acture_names, grad, is_entry, opt_idx)
else: else:
large_scale_kv_metas = [] large_scale_kv_metas = []
for param, blockid in param_blockid_map.items(): for param, blockid in param_blockid_map.items():
opt_block = main_program.block(blockid) opt_block = main_program.block(blockid)
grad, _, value_names, value_dims, acture_names = \
grad, opt_idx, value_names, value_dims, acture_names, fuse = \
get_optimizer_values(opt_block) get_optimizer_values(opt_block)
entry_attr = get_entry_attr(param) entry_attr = get_entry_attr(param)
if fuse:
# remove the original optimizer op
opt_block._remove_op(opt_idx)
# training/infer # training/infer
mode = "0" mode = "0"
names_str = ",".join(value_names) names_str = ",".join(value_names)
......
...@@ -227,22 +227,6 @@ def init_from_server_pass(program, config): ...@@ -227,22 +227,6 @@ def init_from_server_pass(program, config):
fetch_barrier_out = program.global_block().create_var( fetch_barrier_out = program.global_block().create_var(
name=framework.generate_control_dev_var_name()) name=framework.generate_control_dev_var_name())
recv_ctx = config.get_communicator_recv_context(recv_type=1)
recv_varnames = []
for name, ctxs in recv_ctx.items():
recv_varnames.extend(ctxs.origin_varnames())
program.global_block().append_op(
type="recv",
inputs={"X": []},
outputs={"Out": []},
attrs={
"recv_varnames": recv_varnames,
"trainer_id": config.get_role_id(),
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
})
program.global_block().append_op( program.global_block().append_op(
type="fetch_barrier", type="fetch_barrier",
inputs={}, inputs={},
......
...@@ -164,8 +164,8 @@ def train(args): ...@@ -164,8 +164,8 @@ def train(args):
elif fleet.is_worker(): elif fleet.is_worker():
logger.info("run trainer") logger.info("run trainer")
fleet.init_worker()
exe.run(fleet.startup_program) exe.run(fleet.startup_program)
fleet.init_worker()
thread_num = 2 thread_num = 2
filelist = [] filelist = []
......
...@@ -54,29 +54,6 @@ EXPRESSION_MAP = { ...@@ -54,29 +54,6 @@ EXPRESSION_MAP = {
"__ge__": "A >= B" "__ge__": "A >= B"
} }
# method for Tensor from paddle.tensor
# edit it when paddle.tensor has new method about Tensor operation
common_methods = [
'exp', 'tanh', 'atan', 'sqrt', 'rsqrt', 'abs', 'ceil', 'floor', 'cos',
'acos', 'asin', 'sin', 'sinh', 'cosh', 'round', 'reciprocal', 'square',
'rank', 'matmul', 'dot', 'norm', 'transpose', 'dist', 't', 'cross',
'cholesky', 'bmm', 'histogram', 'equal', 'greater_equal', 'greater_than',
'is_empty', 'isfinite', 'less_equal', 'less_than', 'logical_and',
'logical_not', 'logical_or', 'logical_xor', 'not_equal', 'reduce_all',
'reduce_any', 'allclose', 'equal_all', 'cast', 'expand', 'expand_as',
'tile', 'flatten', 'gather', 'gather_nd', 'reshape', 'reverse', 'scatter',
'scatter_nd_add', 'scatter_nd', 'shard_index', 'slice', 'split', 'squeeze',
'strided_slice', 'unique', 'unique_with_counts', 'unsqueeze', 'flip',
'unbind', 'roll', 'cumsum', 'increment', 'log', 'pow', 'reciprocal',
'round', 'rsqrt', 'scale', 'sign', 'stanh', 'sum', 'reduce_prod', 'max',
'min', 'mm', 'div', 'multiply', 'add', 'logsumexp', 'log1p', 'erf',
'addcmul', 'addmm', 'clamp', 'trace', 'kron', 'argmax', 'argmin', 'argsort',
'has_inf', 'has_nan', 'topk', 'index_select', 'nonzero', 'sort',
'index_sample', 'mean', 'std', 'var', 'elementwise_add', 'elementwise_div',
'elementwise_floordiv', 'elementwise_mod', 'elementwise_pow',
'elementwise_sub'
]
_already_patch_variable = False _already_patch_variable = False
...@@ -372,7 +349,14 @@ def monkey_patch_variable(): ...@@ -372,7 +349,14 @@ def monkey_patch_variable():
setattr(Variable, method_name, method_impl) setattr(Variable, method_name, method_impl)
else: else:
import paddle.tensor import paddle.tensor
for method_name in common_methods: variabel_methods = paddle.tensor.linalg.__all__ + \
paddle.tensor.math.__all__ + \
paddle.tensor.logic.__all__ + \
paddle.tensor.manipulation.__all__ + \
paddle.tensor.search.__all__ + \
paddle.tensor.stat.__all__ + \
paddle.tensor.attribute.__all__
for method_name in variabel_methods:
if hasattr(Variable, method_name): continue if hasattr(Variable, method_name): continue
method_impl = getattr(paddle.tensor, method_name, None) method_impl = getattr(paddle.tensor, method_name, None)
if method_impl: setattr(Variable, method_name, method_impl) if method_impl: setattr(Variable, method_name, method_impl)
......
...@@ -1453,11 +1453,14 @@ def linspace(start, stop, num, dtype=None, name=None): ...@@ -1453,11 +1453,14 @@ def linspace(start, stop, num, dtype=None, name=None):
if not isinstance(dtype, core.VarDesc.VarType): if not isinstance(dtype, core.VarDesc.VarType):
dtype = convert_np_dtype_to_dtype_(dtype) dtype = convert_np_dtype_to_dtype_(dtype)
if not isinstance(start, Variable): if not isinstance(start, Variable):
tensor_start = fill_constant([1], dtype, start) with device_guard("cpu"):
tensor_start = fill_constant([1], dtype, start)
if not isinstance(stop, Variable): if not isinstance(stop, Variable):
tensor_stop = fill_constant([1], dtype, stop) with device_guard("cpu"):
tensor_stop = fill_constant([1], dtype, stop)
if not isinstance(num, Variable): if not isinstance(num, Variable):
tensor_num = fill_constant([1], 'int32', num) with device_guard("cpu"):
tensor_num = fill_constant([1], 'int32', num)
if in_dygraph_mode(): if in_dygraph_mode():
return core.ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype', return core.ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype',
dtype) dtype)
......
...@@ -4,4 +4,5 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") ...@@ -4,4 +4,5 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# default test # default test
foreach(src ${TEST_OPS}) foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py) py_test(${src} SRCS ${src}.py)
set_tests_properties(${src} PROPERTIES FIXTURES_SETUP ${src}_infer_model)
endforeach() endforeach()
...@@ -163,8 +163,10 @@ class TestDistCTR2x2(FleetDistRunnerBase): ...@@ -163,8 +163,10 @@ class TestDistCTR2x2(FleetDistRunnerBase):
""" """
exe = fluid.Executor(fluid.CPUPlace()) exe = fluid.Executor(fluid.CPUPlace())
fleet.init_worker()
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
fleet.init_worker()
batch_size = 4 batch_size = 4
train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size) train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
self.reader.decorate_sample_list_generator(train_reader) self.reader.decorate_sample_list_generator(train_reader)
...@@ -202,8 +204,8 @@ class TestDistCTR2x2(FleetDistRunnerBase): ...@@ -202,8 +204,8 @@ class TestDistCTR2x2(FleetDistRunnerBase):
exe = fluid.Executor(fluid.CPUPlace()) exe = fluid.Executor(fluid.CPUPlace())
fleet.init_worker()
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
fleet.init_worker()
thread_num = 2 thread_num = 2
batch_size = 128 batch_size = 128
......
...@@ -60,8 +60,9 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2): ...@@ -60,8 +60,9 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2):
device_id = int(os.getenv("FLAGS_selected_gpus", "0")) device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
place = fluid.CUDAPlace(device_id) place = fluid.CUDAPlace(device_id)
exe = fluid.Executor(place) exe = fluid.Executor(place)
fleet.init_worker()
exe.run(fleet.startup_program) exe.run(fleet.startup_program)
fleet.init_worker()
batch_size = 4 batch_size = 4
train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size) train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
...@@ -104,8 +105,8 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2): ...@@ -104,8 +105,8 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2):
place = fluid.CUDAPlace(device_id) place = fluid.CUDAPlace(device_id)
exe = fluid.Executor(place) exe = fluid.Executor(place)
fleet.init_worker()
exe.run(fleet.startup_program) exe.run(fleet.startup_program)
fleet.init_worker()
thread_num = 2 thread_num = 2
batch_size = 128 batch_size = 128
......
...@@ -152,8 +152,9 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase): ...@@ -152,8 +152,9 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
""" """
exe = fluid.Executor(fluid.CPUPlace()) exe = fluid.Executor(fluid.CPUPlace())
fleet.init_worker()
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
fleet.init_worker()
batch_size = 4 batch_size = 4
train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size) train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
self.reader.decorate_sample_list_generator(train_reader) self.reader.decorate_sample_list_generator(train_reader)
...@@ -176,8 +177,8 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase): ...@@ -176,8 +177,8 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
exe = fluid.Executor(fluid.CPUPlace()) exe = fluid.Executor(fluid.CPUPlace())
fleet.init_worker()
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
fleet.init_worker()
thread_num = int(os.getenv("CPU_NUM", 2)) thread_num = int(os.getenv("CPU_NUM", 2))
batch_size = 128 batch_size = 128
......
...@@ -222,8 +222,8 @@ class TestDistSimnetBow2x2(FleetDistRunnerBase): ...@@ -222,8 +222,8 @@ class TestDistSimnetBow2x2(FleetDistRunnerBase):
""" """
exe = fluid.Executor(fluid.CPUPlace()) exe = fluid.Executor(fluid.CPUPlace())
fleet.init_worker()
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
fleet.init_worker()
batch_size = 4 batch_size = 4
# reader # reader
train_reader = paddle.batch(fake_simnet_reader(), batch_size=batch_size) train_reader = paddle.batch(fake_simnet_reader(), batch_size=batch_size)
......
...@@ -151,8 +151,9 @@ class TestDistCTR2x2(FleetDistRunnerBase): ...@@ -151,8 +151,9 @@ class TestDistCTR2x2(FleetDistRunnerBase):
""" """
exe = fluid.Executor(fluid.CPUPlace()) exe = fluid.Executor(fluid.CPUPlace())
fleet.init_worker()
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
fleet.init_worker()
batch_size = 4 batch_size = 4
......
...@@ -47,7 +47,7 @@ class TestSimpleRNNCell(unittest.TestCase): ...@@ -47,7 +47,7 @@ class TestSimpleRNNCell(unittest.TestCase):
prev_h = np.random.randn(4, 32) prev_h = np.random.randn(4, 32)
y1, h1 = rnn1(x, prev_h) y1, h1 = rnn1(x, prev_h)
y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) y2, h2 = rnn2(paddle.to_tensor(x), paddle.to_tensor(prev_h))
np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
def test_with_zero_state(self): def test_with_zero_state(self):
...@@ -57,7 +57,7 @@ class TestSimpleRNNCell(unittest.TestCase): ...@@ -57,7 +57,7 @@ class TestSimpleRNNCell(unittest.TestCase):
x = np.random.randn(4, 16) x = np.random.randn(4, 16)
y1, h1 = rnn1(x) y1, h1 = rnn1(x)
y2, h2 = rnn2(paddle.to_variable(x)) y2, h2 = rnn2(paddle.to_tensor(x))
np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
def runTest(self): def runTest(self):
...@@ -90,7 +90,7 @@ class TestGRUCell(unittest.TestCase): ...@@ -90,7 +90,7 @@ class TestGRUCell(unittest.TestCase):
prev_h = np.random.randn(4, 32) prev_h = np.random.randn(4, 32)
y1, h1 = rnn1(x, prev_h) y1, h1 = rnn1(x, prev_h)
y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) y2, h2 = rnn2(paddle.to_tensor(x), paddle.to_tensor(prev_h))
np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
def test_with_zero_state(self): def test_with_zero_state(self):
...@@ -100,7 +100,7 @@ class TestGRUCell(unittest.TestCase): ...@@ -100,7 +100,7 @@ class TestGRUCell(unittest.TestCase):
x = np.random.randn(4, 16) x = np.random.randn(4, 16)
y1, h1 = rnn1(x) y1, h1 = rnn1(x)
y2, h2 = rnn2(paddle.to_variable(x)) y2, h2 = rnn2(paddle.to_tensor(x))
np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
def runTest(self): def runTest(self):
...@@ -134,8 +134,8 @@ class TestLSTMCell(unittest.TestCase): ...@@ -134,8 +134,8 @@ class TestLSTMCell(unittest.TestCase):
y1, (h1, c1) = rnn1(x, (prev_h, prev_c)) y1, (h1, c1) = rnn1(x, (prev_h, prev_c))
y2, (h2, c2) = rnn2( y2, (h2, c2) = rnn2(
paddle.to_variable(x), paddle.to_tensor(x),
(paddle.to_variable(prev_h), paddle.to_variable(prev_c))) (paddle.to_tensor(prev_h), paddle.to_tensor(prev_c)))
np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
...@@ -146,7 +146,7 @@ class TestLSTMCell(unittest.TestCase): ...@@ -146,7 +146,7 @@ class TestLSTMCell(unittest.TestCase):
x = np.random.randn(4, 16) x = np.random.randn(4, 16)
y1, (h1, c1) = rnn1(x) y1, (h1, c1) = rnn1(x)
y2, (h2, c2) = rnn2(paddle.to_variable(x)) y2, (h2, c2) = rnn2(paddle.to_tensor(x))
np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
......
...@@ -53,7 +53,7 @@ class TestSimpleRNN(unittest.TestCase): ...@@ -53,7 +53,7 @@ class TestSimpleRNN(unittest.TestCase):
prev_h = np.random.randn(2 * self.num_directions, 4, 32) prev_h = np.random.randn(2 * self.num_directions, 4, 32)
y1, h1 = rnn1(x, prev_h) y1, h1 = rnn1(x, prev_h)
y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) y2, h2 = rnn2(paddle.to_tensor(x), paddle.to_tensor(prev_h))
np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
...@@ -66,7 +66,7 @@ class TestSimpleRNN(unittest.TestCase): ...@@ -66,7 +66,7 @@ class TestSimpleRNN(unittest.TestCase):
x = np.transpose(x, [1, 0, 2]) x = np.transpose(x, [1, 0, 2])
y1, h1 = rnn1(x) y1, h1 = rnn1(x)
y2, h2 = rnn2(paddle.to_variable(x)) y2, h2 = rnn2(paddle.to_tensor(x))
np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
...@@ -81,11 +81,11 @@ class TestSimpleRNN(unittest.TestCase): ...@@ -81,11 +81,11 @@ class TestSimpleRNN(unittest.TestCase):
y1, h1 = rnn1(x, sequence_length=sequence_length) y1, h1 = rnn1(x, sequence_length=sequence_length)
seq_len = paddle.to_variable(sequence_length) seq_len = paddle.to_tensor(sequence_length)
mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
if self.time_major: if self.time_major:
mask = paddle.transpose(mask, [1, 0]) mask = paddle.transpose(mask, [1, 0])
y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len) y2, h2 = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
y2 = paddle.multiply(y2, mask, axis=0) y2 = paddle.multiply(y2, mask, axis=0)
np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
...@@ -133,7 +133,7 @@ class TestGRU(unittest.TestCase): ...@@ -133,7 +133,7 @@ class TestGRU(unittest.TestCase):
prev_h = np.random.randn(2 * self.num_directions, 4, 32) prev_h = np.random.randn(2 * self.num_directions, 4, 32)
y1, h1 = rnn1(x, prev_h) y1, h1 = rnn1(x, prev_h)
y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) y2, h2 = rnn2(paddle.to_tensor(x), paddle.to_tensor(prev_h))
np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
...@@ -146,7 +146,7 @@ class TestGRU(unittest.TestCase): ...@@ -146,7 +146,7 @@ class TestGRU(unittest.TestCase):
x = np.transpose(x, [1, 0, 2]) x = np.transpose(x, [1, 0, 2])
y1, h1 = rnn1(x) y1, h1 = rnn1(x)
y2, h2 = rnn2(paddle.to_variable(x)) y2, h2 = rnn2(paddle.to_tensor(x))
np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
...@@ -161,11 +161,11 @@ class TestGRU(unittest.TestCase): ...@@ -161,11 +161,11 @@ class TestGRU(unittest.TestCase):
y1, h1 = rnn1(x, sequence_length=sequence_length) y1, h1 = rnn1(x, sequence_length=sequence_length)
seq_len = paddle.to_variable(sequence_length) seq_len = paddle.to_tensor(sequence_length)
mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
if self.time_major: if self.time_major:
mask = paddle.transpose(mask, [1, 0]) mask = paddle.transpose(mask, [1, 0])
y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len) y2, h2 = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
y2 = paddle.multiply(y2, mask, axis=0) y2 = paddle.multiply(y2, mask, axis=0)
np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
...@@ -209,8 +209,8 @@ class TestLSTM(unittest.TestCase): ...@@ -209,8 +209,8 @@ class TestLSTM(unittest.TestCase):
y1, (h1, c1) = rnn1(x, (prev_h, prev_c)) y1, (h1, c1) = rnn1(x, (prev_h, prev_c))
y2, (h2, c2) = rnn2( y2, (h2, c2) = rnn2(
paddle.to_variable(x), paddle.to_tensor(x),
(paddle.to_variable(prev_h), paddle.to_variable(prev_c))) (paddle.to_tensor(prev_h), paddle.to_tensor(prev_c)))
np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
...@@ -224,7 +224,7 @@ class TestLSTM(unittest.TestCase): ...@@ -224,7 +224,7 @@ class TestLSTM(unittest.TestCase):
x = np.transpose(x, [1, 0, 2]) x = np.transpose(x, [1, 0, 2])
y1, (h1, c1) = rnn1(x) y1, (h1, c1) = rnn1(x)
y2, (h2, c2) = rnn2(paddle.to_variable(x)) y2, (h2, c2) = rnn2(paddle.to_tensor(x))
np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
...@@ -240,11 +240,11 @@ class TestLSTM(unittest.TestCase): ...@@ -240,11 +240,11 @@ class TestLSTM(unittest.TestCase):
y1, (h1, c1) = rnn1(x, sequence_length=sequence_length) y1, (h1, c1) = rnn1(x, sequence_length=sequence_length)
seq_len = paddle.to_variable(sequence_length) seq_len = paddle.to_tensor(sequence_length)
mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
if self.time_major: if self.time_major:
mask = paddle.transpose(mask, [1, 0]) mask = paddle.transpose(mask, [1, 0])
y2, (h2, c2) = rnn2(paddle.to_variable(x), sequence_length=seq_len) y2, (h2, c2) = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
y2 = paddle.multiply(y2, mask, axis=0) y2 = paddle.multiply(y2, mask, axis=0)
np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
......
...@@ -30,11 +30,10 @@ from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distribu ...@@ -30,11 +30,10 @@ from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distribu
class TestCommunicator(unittest.TestCase): class TestCommunicator(unittest.TestCase):
def net(self): def net(self):
x = fluid.layers.data(name='x', shape=[13], dtype='float32') x = fluid.layers.data(name='x', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
y = fluid.layers.data(name='y', shape=[1], dtype='float32') y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y) cost = fluid.layers.square_error_cost(input=x, label=y)
avg_cost = fluid.layers.mean(cost) avg_cost = fluid.layers.mean(cost)
return avg_cost return avg_cost
......
...@@ -83,8 +83,8 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase): ...@@ -83,8 +83,8 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
fleet.init_worker()
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
fleet.init_worker()
train_reader = paddle.batch(self.fake_reader(), batch_size=24) train_reader = paddle.batch(self.fake_reader(), batch_size=24)
feeder = fluid.DataFeeder(place=place, feed_list=[x, z, y]) feeder = fluid.DataFeeder(place=place, feed_list=[x, z, y])
......
...@@ -71,8 +71,8 @@ class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase): ...@@ -71,8 +71,8 @@ class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
fleet.init_worker()
exe.run(fleet.startup_program) exe.run(fleet.startup_program)
fleet.init_worker()
train_reader = paddle.batch(self.fake_reader(), batch_size=24) train_reader = paddle.batch(self.fake_reader(), batch_size=24)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
......
...@@ -27,11 +27,9 @@ import paddle.distributed.fleet as fleet ...@@ -27,11 +27,9 @@ import paddle.distributed.fleet as fleet
class TestCommunicator(unittest.TestCase): class TestCommunicator(unittest.TestCase):
def net(self): def net(self):
x = fluid.layers.data(name='x', shape=[13], dtype='float32') x = fluid.layers.data(name='x', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
y = fluid.layers.data(name='y', shape=[1], dtype='float32') y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=x, label=y)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost) avg_cost = fluid.layers.mean(cost)
return avg_cost return avg_cost
......
...@@ -119,6 +119,16 @@ class TestDiagV2API(unittest.TestCase): ...@@ -119,6 +119,16 @@ class TestDiagV2API(unittest.TestCase):
(n, n)) + np.diag(self.input_np3, self.offset) - np.diag( (n, n)) + np.diag(self.input_np3, self.offset) - np.diag(
self.padding_value * np.ones(n)) self.padding_value * np.ones(n))
self.input_np4 = np.random.random(size=(2000, 2000)).astype(np.float32)
self.expected6 = np.diag(self.input_np4)
self.expected7 = np.diag(self.input_np4, k=1)
self.expected8 = np.diag(self.input_np4, k=-1)
self.input_np5 = np.random.random(size=(2000)).astype(np.float32)
self.expected9 = np.diag(self.input_np5)
self.expected10 = np.diag(self.input_np5, k=1)
self.expected11 = np.diag(self.input_np5, k=-1)
def run_imperative(self): def run_imperative(self):
x = paddle.to_tensor(self.input_np) x = paddle.to_tensor(self.input_np)
y = paddle.diag(x) y = paddle.diag(x)
...@@ -141,10 +151,32 @@ class TestDiagV2API(unittest.TestCase): ...@@ -141,10 +151,32 @@ class TestDiagV2API(unittest.TestCase):
y = paddle.diag(x, padding_value=-8) y = paddle.diag(x, padding_value=-8)
self.assertTrue(np.allclose(y.numpy(), self.expected5)) self.assertTrue(np.allclose(y.numpy(), self.expected5))
x = paddle.to_tensor(self.input_np4)
y = paddle.diag(x)
self.assertTrue(np.allclose(y.numpy(), self.expected6))
y = paddle.diag(x, offset=1)
self.assertTrue(np.allclose(y.numpy(), self.expected7))
y = paddle.diag(x, offset=-1)
self.assertTrue(np.allclose(y.numpy(), self.expected8))
x = paddle.to_tensor(self.input_np5)
y = paddle.diag(x)
self.assertTrue(np.allclose(y.numpy(), self.expected9))
y = paddle.diag(x, offset=1)
self.assertTrue(np.allclose(y.numpy(), self.expected10))
y = paddle.diag(x, offset=-1)
self.assertTrue(np.allclose(y.numpy(), self.expected11))
def run_static(self, use_gpu=False): def run_static(self, use_gpu=False):
x = paddle.data(name='input', shape=[10, 10], dtype='float32') x = paddle.data(name='input', shape=[10, 10], dtype='float32')
x2 = paddle.data(name='input2', shape=[100], dtype='float64') x2 = paddle.data(name='input2', shape=[100], dtype='float64')
x3 = paddle.data(name='input3', shape=[100], dtype='int64') x3 = paddle.data(name='input3', shape=[100], dtype='int64')
x4 = paddle.data(name='input4', shape=[2000, 2000], dtype='float32')
x5 = paddle.data(name='input5', shape=[2000], dtype='float32')
result0 = paddle.diag(x) result0 = paddle.diag(x)
result1 = paddle.diag(x, offset=1) result1 = paddle.diag(x, offset=1)
result2 = paddle.diag(x, offset=-1) result2 = paddle.diag(x, offset=-1)
...@@ -152,17 +184,28 @@ class TestDiagV2API(unittest.TestCase): ...@@ -152,17 +184,28 @@ class TestDiagV2API(unittest.TestCase):
result4 = paddle.diag(x2, padding_value=8) result4 = paddle.diag(x2, padding_value=8)
result5 = paddle.diag(x3, padding_value=8.0) result5 = paddle.diag(x3, padding_value=8.0)
result6 = paddle.diag(x3, padding_value=-8) result6 = paddle.diag(x3, padding_value=-8)
result7 = paddle.diag(x4)
result8 = paddle.diag(x4, offset=1)
result9 = paddle.diag(x4, offset=-1)
result10 = paddle.diag(x5)
result11 = paddle.diag(x5, offset=1)
result12 = paddle.diag(x5, offset=-1)
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
res0, res1, res2, res4, res5, res6 = exe.run( res0, res1, res2, res4, res5, res6, res7, res8, res9, res10, res11, res12 = exe.run(
feed={ feed={
"input": self.input_np, "input": self.input_np,
"input2": self.input_np2, "input2": self.input_np2,
'input3': self.input_np3 'input3': self.input_np3,
'input4': self.input_np4,
'input5': self.input_np5
}, },
fetch_list=[result0, result1, result2, result4, result5, result6]) fetch_list=[
result0, result1, result2, result4, result5, result6, result7,
result8, result9, result10, result11, result12
])
self.assertTrue(np.allclose(res0, self.expected0)) self.assertTrue(np.allclose(res0, self.expected0))
self.assertTrue(np.allclose(res1, self.expected1)) self.assertTrue(np.allclose(res1, self.expected1))
...@@ -171,6 +214,12 @@ class TestDiagV2API(unittest.TestCase): ...@@ -171,6 +214,12 @@ class TestDiagV2API(unittest.TestCase):
self.assertTrue(np.allclose(res4, self.expected3)) self.assertTrue(np.allclose(res4, self.expected3))
self.assertTrue(np.allclose(res5, self.expected4)) self.assertTrue(np.allclose(res5, self.expected4))
self.assertTrue(np.allclose(res6, self.expected5)) self.assertTrue(np.allclose(res6, self.expected5))
self.assertTrue(np.allclose(res7, self.expected6))
self.assertTrue(np.allclose(res8, self.expected7))
self.assertTrue(np.allclose(res9, self.expected8))
self.assertTrue(np.allclose(res10, self.expected9))
self.assertTrue(np.allclose(res11, self.expected10))
self.assertTrue(np.allclose(res12, self.expected11))
def test_cpu(self): def test_cpu(self):
paddle.disable_static(place=paddle.fluid.CPUPlace()) paddle.disable_static(place=paddle.fluid.CPUPlace())
......
...@@ -44,16 +44,11 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): ...@@ -44,16 +44,11 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
paddle.fluid.framework.switch_startup_program(startup_program) paddle.fluid.framework.switch_startup_program(startup_program)
fleet.init(role_maker.PaddleCloudRoleMaker()) fleet.init(role_maker.PaddleCloudRoleMaker())
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32')
input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
cost = paddle.fluid.layers.cross_entropy( avg_cost = paddle.fluid.layers.mean(cost)
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy() strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True strategy.a_sync = True
...@@ -71,7 +66,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): ...@@ -71,7 +66,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
sends += 1 sends += 1
if op.type == "sgd": if op.type == "sgd":
sgds += 1 sgds += 1
self.assertEqual(sends, 7) self.assertEqual(sends, 1)
self.assertEqual(sgds, 0) self.assertEqual(sgds, 0)
fleet.init_worker() fleet.init_worker()
...@@ -89,16 +84,11 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): ...@@ -89,16 +84,11 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
paddle.fluid.framework.switch_startup_program(startup_program) paddle.fluid.framework.switch_startup_program(startup_program)
fleet.init(role_maker.PaddleCloudRoleMaker()) fleet.init(role_maker.PaddleCloudRoleMaker())
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32') x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') avg_cost = paddle.fluid.layers.mean(cost)
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy() strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True strategy.a_sync = True
......
...@@ -36,16 +36,11 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): ...@@ -36,16 +36,11 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
def test_gradient_merge_optimizer(self): def test_gradient_merge_optimizer(self):
fleet.init(role_maker.PaddleCloudRoleMaker()) fleet.init(role_maker.PaddleCloudRoleMaker())
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32')
input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
cost = paddle.fluid.layers.cross_entropy( avg_cost = paddle.fluid.layers.mean(cost)
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy() strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = False strategy.a_sync = False
...@@ -63,7 +58,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): ...@@ -63,7 +58,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
sends += 1 sends += 1
if op.type == "sgd": if op.type == "sgd":
sgds += 1 sgds += 1
self.assertEqual(sends, 6) self.assertEqual(sends, 0)
self.assertEqual(sgds, 0) self.assertEqual(sgds, 0)
fleet.init_worker() fleet.init_worker()
......
...@@ -70,15 +70,13 @@ class TestPSPassWithBow(unittest.TestCase): ...@@ -70,15 +70,13 @@ class TestPSPassWithBow(unittest.TestCase):
q = fluid.layers.data( q = fluid.layers.data(
name="query_ids", shape=[1], dtype="int64", lod_level=1) name="query_ids", shape=[1], dtype="int64", lod_level=1)
# embedding # embedding
q_emb = fluid.layers.embedding( q_emb = fluid.contrib.layers.sparse_embedding(
input=q, input=q,
is_distributed=is_distributed,
size=[dict_dim, emb_dim], size=[dict_dim, emb_dim],
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01), initializer=fluid.initializer.Constant(value=0.01),
name="__emb__", name="__emb__",
learning_rate=emb_lr), learning_rate=emb_lr))
is_sparse=is_sparse)
q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
# vsum # vsum
q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
...@@ -97,15 +95,13 @@ class TestPSPassWithBow(unittest.TestCase): ...@@ -97,15 +95,13 @@ class TestPSPassWithBow(unittest.TestCase):
pt = fluid.layers.data( pt = fluid.layers.data(
name="pos_title_ids", shape=[1], dtype="int64", lod_level=1) name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
# embedding # embedding
pt_emb = fluid.layers.embedding( pt_emb = fluid.contrib.layers.sparse_embedding(
input=pt, input=pt,
is_distributed=is_distributed,
size=[dict_dim, emb_dim], size=[dict_dim, emb_dim],
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01), initializer=fluid.initializer.Constant(value=0.01),
name="__emb__", name="__emb__",
learning_rate=emb_lr), learning_rate=emb_lr))
is_sparse=is_sparse)
pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
# vsum # vsum
pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
...@@ -123,15 +119,13 @@ class TestPSPassWithBow(unittest.TestCase): ...@@ -123,15 +119,13 @@ class TestPSPassWithBow(unittest.TestCase):
nt = fluid.layers.data( nt = fluid.layers.data(
name="neg_title_ids", shape=[1], dtype="int64", lod_level=1) name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
# embedding # embedding
nt_emb = fluid.layers.embedding( nt_emb = fluid.contrib.layers.sparse_embedding(
input=nt, input=nt,
is_distributed=is_distributed,
size=[dict_dim, emb_dim], size=[dict_dim, emb_dim],
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01), initializer=fluid.initializer.Constant(value=0.01),
name="__emb__", name="__emb__",
learning_rate=emb_lr), learning_rate=emb_lr))
is_sparse=is_sparse)
nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
# vsum # vsum
nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
...@@ -167,7 +161,7 @@ class TestPSPassWithBow(unittest.TestCase): ...@@ -167,7 +161,7 @@ class TestPSPassWithBow(unittest.TestCase):
fleet.init(role) fleet.init(role)
loss, acc, _ = self.net() loss, acc, _ = self.net()
optimizer = fluid.optimizer.SGD(base_lr) optimizer = fluid.optimizer.Adam(base_lr)
strategy = StrategyFactory.create_async_strategy() strategy = StrategyFactory.create_async_strategy()
optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(loss) optimizer.minimize(loss)
......
...@@ -168,12 +168,13 @@ class TestPSPassWithBow(unittest.TestCase): ...@@ -168,12 +168,13 @@ class TestPSPassWithBow(unittest.TestCase):
fleet.init(role) fleet.init(role)
loss, acc, _ = self.net() loss, acc, _ = self.net()
optimizer = fluid.optimizer.SGD( optimizer = fluid.optimizer.Adagrad(
learning_rate=fluid.layers.exponential_decay( learning_rate=fluid.layers.exponential_decay(
learning_rate=base_lr, learning_rate=base_lr,
decay_steps=500, decay_steps=500,
decay_rate=0.969, decay_rate=0.969,
staircase=True)) staircase=True))
strategy = StrategyFactory.create_async_strategy() strategy = StrategyFactory.create_async_strategy()
optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(loss) optimizer.minimize(loss)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
# Hyper-parameters read by the network built in TestPSPassWithBow.net().
# Base learning rate: set on the fc ParamAttrs and passed to the optimizer.
base_lr = 0.2
# Learning rate set on the "__emb__" embedding ParamAttr (3x the base rate).
emb_lr = base_lr * 3
# Passed together as size=[dict_dim, emb_dim] to sparse_embedding; emb_dim is
# also the trailing dimension used when reshaping the embedding output.
dict_dim = 1500
emb_dim = 128
# Output size of every fc layer in the net.
hid_dim = 128
# Margin constant filled into the loss computed by get_loss.
margin = 0.1
# NOTE(review): not referenced anywhere in the visible part of this file.
sample_rate = 1
# Denominator used by get_acc when dividing the summed comparison result.
batch_size = 4
class TestPSPassWithBow(unittest.TestCase):
    """Builds a three-tower similarity net and minimizes it through fleet.

    The test only checks that graph construction and
    ``optimizer.minimize`` complete without raising when the process is
    configured as a parameter-server node with the async strategy.
    """

    def net(self):
        """Build the query / positive-title / negative-title network.

        Each of the three inputs goes through the same pipeline: sparse
        embedding, reshape, sum sequence-pool, softsign, fc. The two title
        towers use the same fc parameter names (``__fc__`` / ``__fc_b__``);
        all three embeddings use the parameter name ``__emb__``.

        Returns:
            list: ``[avg_cost, acc, cos_q_pt]`` -- the mean margin loss, the
            accuracy variable produced by ``get_acc``, and the cosine
            similarity between the query and positive-title towers.
        """

        def get_acc(cos_q_nt, cos_q_pt, batch_size):
            """Sum the ``cos_q_nt < cos_q_pt`` mask and divide by batch_size."""
            cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
            cond = fluid.layers.cast(cond, dtype='float64')
            cond_3 = fluid.layers.reduce_sum(cond)
            acc = fluid.layers.elementwise_div(
                cond_3,
                fluid.layers.fill_constant(
                    shape=[1], value=batch_size * 1.0, dtype='float64'),
                name="simnet_acc")
            return acc

        def get_loss(cos_q_pt, cos_q_nt):
            """Return mean(max(0, margin - cos_q_pt + cos_q_nt))."""
            loss_op1 = fluid.layers.elementwise_sub(
                fluid.layers.fill_constant_batch_size_like(
                    input=cos_q_pt,
                    shape=[-1, 1],
                    value=margin,
                    dtype='float32'),
                cos_q_pt)
            loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
            loss_op3 = fluid.layers.elementwise_max(
                fluid.layers.fill_constant_batch_size_like(
                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
                loss_op2)
            avg_cost = fluid.layers.mean(loss_op3)
            return avg_cost

        def build_tower(ids_name, fc_param_name, fc_bias_name=None):
            """Embed an id sequence, sum-pool it and project it with an fc layer."""
            ids = fluid.layers.data(
                name=ids_name, shape=[1], dtype="int64", lod_level=1)
            # embedding
            emb = fluid.contrib.layers.sparse_embedding(
                input=ids,
                size=[dict_dim, emb_dim],
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Constant(value=0.01),
                    name="__emb__",
                    learning_rate=emb_lr))
            emb = fluid.layers.reshape(emb, [-1, emb_dim])
            # vsum
            pooled = fluid.layers.sequence_pool(input=emb, pool_type='sum')
            squashed = fluid.layers.softsign(pooled)
            # Leaving bias_attr as None keeps fc's default bias handling,
            # which is what the query tower relied on.
            bias_attr = None
            if fc_bias_name is not None:
                bias_attr = fluid.ParamAttr(name=fc_bias_name)
            # fc layer
            return fluid.layers.fc(
                input=squashed,
                size=hid_dim,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Constant(value=0.01),
                    name=fc_param_name,
                    learning_rate=base_lr),
                bias_attr=bias_attr)

        # The towers are built in the original order (query, label input,
        # positive title, negative title) so ops are appended to the program
        # in the same sequence as before.
        q_fc = build_tower("query_ids", "__q_fc__")
        # label data: declared as an input of the program; the loss below
        # does not read it.
        fluid.layers.data(name="label", shape=[1], dtype="int64")
        pt_fc = build_tower("pos_title_ids", "__fc__", "__fc_b__")
        nt_fc = build_tower("neg_title_ids", "__fc__", "__fc_b__")

        cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc)
        cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc)
        # loss
        avg_cost = get_loss(cos_q_pt, cos_q_nt)
        # acc
        acc = get_acc(cos_q_nt, cos_q_pt, batch_size)
        return [avg_cost, acc, cos_q_pt]

    def test(self):
        """Minimize the net with Adagrad under the async fleet strategy.

        The role is server 0 of four local endpoints with two workers; the
        test passes when ``minimize`` returns without raising.
        """
        endpoints = [
            "127.0.0.1:36004", "127.0.0.1:36005", "127.0.0.1:36006",
            "127.0.0.1:36007"
        ]

        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=endpoints)

        fleet.init(role)
        loss, acc, _ = self.net()

        optimizer = fluid.optimizer.Adagrad(base_lr)
        strategy = StrategyFactory.create_async_strategy()
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(loss)
if __name__ == '__main__':
    # Hand control to unittest's runner when this file is executed directly.
    unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.core as core
class TestLookupTableFuseOp(unittest.TestCase):
def test_fuse(self):
    """Run check_with_place once for every supported place."""
    # currently only support CPU
    for place in (core.CPUPlace(), ):
        self.check_with_place(place)
def check_with_place(self, place):
scope = fluid.global_scope()
scope.var("LearningRate").get_tensor().set([0.01], place)
scope.var("Ids").get_tensor().set([i for i in range(100)], place)
init_program = fluid.Program()
lr = init_program.global_block().create_var(
name="LearningRate",
persistable=True,
type=fluid.core.VarDesc.VarType.LOD_TENSOR,
shape=[1],
dtype="float32")
ids = init_program.global_block().create_var(
name="Ids",
persistable=True,
type=fluid.core.VarDesc.VarType.LOD_TENSOR,
shape=[100],
dtype="int64")
output = init_program.global_block().create_var(
name="output",
type=fluid.core.VarDesc.VarType.LOD_TENSOR,
shape=[100, 8],
dtype="float32")
metas = []
metas.append(
"embedding_1.block0:Param,Moment1,Moment2:8,8,8:0:embedding_1@GRAD.block0:embedding_1.block0,embedding_1_moment1_0,embedding_1_moment2_0,kSparseIDs@embedding_1.block0:uniform_random&0&-0.5&0.5,fill_constant&0.0,fill_constant&0.0:none"
)
metas.append(
"embedding_2.block0:Param:8:0:embedding_2@GRAD.block0:embedding_2.block0,kSparseIDs@embedding_2.block0:uniform_random&0&-0.5&0.5:none"
)
init_program.global_block().append_op(
type="lookup_sparse_table_init",
inputs=None,
outputs=None,
attrs={"large_scale_metas": metas})
init_program.global_block().append_op(
type="lookup_sparse_table_read",
inputs={"Ids": ids},
outputs={"Out": output},
attrs={
"tablename": "embedding_1.block0",
"init": True,
"value_names": ["Param"],
})
init_program.global_block().append_op(
type="lookup_sparse_table_read",
inputs={"Ids": ids},
outputs={"Out": output},
attrs={
"tablename": "embedding_2.block0",
"init": True,
"value_names": ["Param"],
})
executor = fluid.Executor(place)
executor.run(init_program)
training_program = fluid.Program()
scope.var('Beta1Pow').get_tensor().set(
np.array([0]).astype("float32"), place)
scope.var('Beta2Pow').get_tensor().set(
np.array([0]).astype("float32"), place)
rows = [0, 1, 2, 3, 4, 5, 6]
row_numel = 8
w_selected_rows = scope.var('Grad').get_selected_rows()
w_selected_rows.set_height(len(rows))
w_selected_rows.set_rows(rows)
w_array = np.ones((len(rows), row_numel)).astype("float32")
for i in range(len(rows)):
w_array[i] *= i
w_tensor = w_selected_rows.get_tensor()
w_tensor.set(w_array, place)
lr = training_program.global_block().create_var(
name="LearningRate",
persistable=True,
type=fluid.core.VarDesc.VarType.LOD_TENSOR,
shape=[1],
dtype="float32")
grads = training_program.global_block().create_var(
name="Grad",
persistable=True,
type=fluid.core.VarDesc.VarType.SELECTED_ROWS,
shape=[100, 8],
dtype="float32")
beta1 = training_program.global_block().create_var(
name="Beta1Pow",
persistable=True,
type=fluid.core.VarDesc.VarType.LOD_TENSOR,
shape=[1],
dtype="float32")
beta2 = training_program.global_block().create_var(
name="Beta2Pow",
persistable=True,
type=fluid.core.VarDesc.VarType.LOD_TENSOR,
shape=[1],
dtype="float32")
training_program.global_block().append_op(
type="lookup_sparse_table_fuse_adam",
inputs={
"Grad": grads,
"LearningRate": lr,
"Beta1Pow": beta1,
"Beta2Pow": beta2,
},
outputs={"Beta1PowOut": beta1,
"Beta2PowOut": beta2},
attrs={
"is_entry": False,
"tablename": "embedding_1.block0",
"value_names": ["Param", "Moment1", "Moment2"],
})
training_program.global_block().append_op(
type="lookup_sparse_table_fuse_sgd",
inputs={"Grad": grads,
"LearningRate": lr},
attrs={
"is_entry": False,
"tablename": "embedding_2.block0",
"value_names": ["Param"],
})
executor.run(training_program)
# Run the test cases of this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "Paddle core is not compiled with CUDA")
class TestFusedBnAddActAPI(unittest.TestCase):
    """Compares ``fused_bn_add_act`` against batch_norm + add + relu.

    Two programs sharing the same named parameter attributes are built: one
    uses ``fluid.contrib.layers.fused_bn_add_act`` and the other spells the
    computation out as two batch norms, an addition and a relu. Their
    per-iteration losses are required to match within ``1e-5``.
    """

    def setUp(self):
        ParamAttr = fluid.ParamAttr
        Xavier = fluid.initializer.Xavier
        Constant = fluid.initializer.Constant

        # Convolution weights.
        self.conv_param_attr1 = ParamAttr(
            name='conv2d_1.weight',
            initializer=Xavier(uniform=False),
            learning_rate=0.001)
        self.conv_param_attr2 = ParamAttr(
            name='conv2d_2.weight',
            initializer=Xavier(uniform=False),
            learning_rate=0.001)

        # Batch-norm scale/bias for both branches.
        self.bn_param_attr1 = ParamAttr(
            name='batch_norm_w_1', initializer=Constant(value=1.0))
        self.bn_bias_attr1 = ParamAttr(
            name='batch_norm_b_1', initializer=Constant(value=0.0))
        self.bn_param_attr2 = ParamAttr(
            name='batch_norm_w_2', initializer=Constant(value=1.0))
        self.bn_bias_attr2 = ParamAttr(
            name='batch_norm_b_2', initializer=Constant(value=0.0))

        # Final fully-connected layer weight.
        self.fc_param_attr = ParamAttr(
            name='fc.weight', initializer=Xavier(uniform=False))

    def _conv3x3(self, x, weight_attr):
        """Append a bias-free 3x3, 32-filter NHWC convolution on ``x``."""
        return fluid.layers.conv2d(
            input=x,
            filter_size=3,
            num_filters=32,
            stride=1,
            padding=1,
            act=None,
            param_attr=weight_attr,
            bias_attr=False,
            data_format='NHWC')

    def _inputs_and_convs(self):
        """Create the data layers and both convolution branches.

        Must be called inside a ``fluid.program_guard`` block.

        Returns:
            ``(x, y, conv1_1, conv1_2)``.
        """
        x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
        y = fluid.layers.data(name="y", shape=[1], dtype='int64')
        conv1_1 = self._conv3x3(x, self.conv_param_attr1)
        conv1_2 = self._conv3x3(x, self.conv_param_attr2)
        return x, y, conv1_1, conv1_2

    def _classify_and_minimize(self, features, y):
        """Append fc + cross-entropy + mean and minimize with decorated SGD.

        Must be called inside a ``fluid.program_guard`` block.

        Returns:
            The mean loss variable.
        """
        prediction = fluid.layers.fc(input=features,
                                     size=10,
                                     act='softmax',
                                     param_attr=self.fc_param_attr)
        loss = fluid.layers.mean(
            fluid.layers.cross_entropy(input=prediction, label=y))
        sgd = fluid.optimizer.SGD(learning_rate=0.001)
        sgd = fluid.contrib.mixed_precision.decorate(
            sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0)
        sgd.minimize(loss)
        return loss

    def build_fused_program(self,
                            main_program,
                            startup_program,
                            use_cuda,
                            seed=1):
        """Build the network that uses the fused bn+add+act layer.

        ``use_cuda`` and ``seed`` are accepted but not read by this method.

        Returns:
            ``(x, y, loss)`` variables of the built program.
        """
        with fluid.program_guard(main_program, startup_program):
            x, y, conv1_1, conv1_2 = self._inputs_and_convs()
            bn = fluid.layers.batch_norm(
                input=conv1_1,
                param_attr=self.bn_param_attr1,
                bias_attr=self.bn_bias_attr1,
                act=None,
                data_layout='NHWC')
            fused_bn_add_act = fluid.contrib.layers.fused_bn_add_act(
                conv1_2,
                bn,
                param_attr=self.bn_param_attr2,
                bias_attr=self.bn_bias_attr2)
            loss = self._classify_and_minimize(fused_bn_add_act, y)
        return x, y, loss

    def build_origin_program(self,
                             main_program,
                             startup_program,
                             use_cuda,
                             seed=1):
        """Build the reference network: two batch norms, add, then relu.

        ``use_cuda`` and ``seed`` are accepted but not read by this method.

        Returns:
            ``(x, y, loss)`` variables of the built program.
        """
        with fluid.program_guard(main_program, startup_program):
            x, y, conv1_1, conv1_2 = self._inputs_and_convs()
            bn1 = fluid.layers.batch_norm(
                input=conv1_1,
                param_attr=self.bn_param_attr1,
                bias_attr=self.bn_bias_attr1,
                act=None,
                data_layout='NHWC')
            bn2 = fluid.layers.batch_norm(
                input=conv1_2,
                param_attr=self.bn_param_attr2,
                bias_attr=self.bn_bias_attr2,
                act=None,
                data_layout='NHWC')
            out = fluid.layers.relu(bn1 + bn2)
            loss = self._classify_and_minimize(out, y)
        return x, y, loss

    def _prepare(self, builder, place, use_cuda, batch_size):
        """Build fresh programs with ``builder`` plus a feeder and a reader.

        Returns:
            ``(main_program, startup_program, loss, feeder, train_reader)``.
        """
        main_program = fluid.Program()
        startup_program = fluid.Program()
        x, y, loss = builder(main_program, startup_program, use_cuda)
        feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
        train_reader = paddle.batch(
            paddle.dataset.mnist.train(), batch_size=batch_size)
        return main_program, startup_program, loss, feeder, train_reader

    def _train(self, exe, prepared, iters):
        """Run ``iters`` steps in a fresh scope and collect the losses."""
        main_program, startup_program, loss, feeder, train_reader = prepared
        collected = []
        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            exe.run(startup_program)
            for _ in range(iters):
                # A fresh generator is created on every iteration and only
                # its first batch is consumed.
                batch = next(train_reader())
                fetched = exe.run(main_program,
                                  feed=feeder.feed(batch),
                                  fetch_list=[loss])
                collected.append(fetched[0][0])
        return collected

    def check(self, place, use_cuda):
        """Train both programs and assert that their losses agree.

        Args:
            place: device place used for the feeders and the executor.
            use_cuda: forwarded to the two program builders.
        """
        paddle.manual_seed(1)
        paddle.framework.random._manual_program_seed(1)
        iters = 5
        batch_size = 16

        # build_fused_program
        fused = self._prepare(self.build_fused_program, place, use_cuda,
                              batch_size)
        exe = fluid.Executor(place)
        loss_vals_fused = self._train(exe, fused, iters)

        # build_origin_program
        origin = self._prepare(self.build_origin_program, place, use_cuda,
                               batch_size)
        loss_vals = self._train(exe, origin, iters)

        # check loss
        for origin_loss, fused_loss in zip(loss_vals, loss_vals_fused):
            self.assertAlmostEqual(origin_loss, fused_loss, delta=1e-5)

    def test_fuse_bn_add_act(self):
        self.check(fluid.CUDAPlace(0), use_cuda=True)
# Run the test cases of this module when it is executed as a script.
if __name__ == '__main__':
    unittest.main()
...@@ -61,8 +61,8 @@ class ApiMinimumTest(unittest.TestCase): ...@@ -61,8 +61,8 @@ class ApiMinimumTest(unittest.TestCase):
def test_dynamic_api(self): def test_dynamic_api(self):
paddle.disable_static() paddle.disable_static()
np_x = np.array([10, 10]).astype('float64') np_x = np.array([10, 10]).astype('float64')
x = paddle.to_variable(self.input_x) x = paddle.to_tensor(self.input_x)
y = paddle.to_variable(self.input_y) y = paddle.to_tensor(self.input_y)
z = paddle.minimum(x, y) z = paddle.minimum(x, y)
np_z = z.numpy() np_z = z.numpy()
z_expected = np.array(np.minimum(self.input_x, self.input_y)) z_expected = np.array(np.minimum(self.input_x, self.input_y))
...@@ -73,8 +73,8 @@ class ApiMinimumTest(unittest.TestCase): ...@@ -73,8 +73,8 @@ class ApiMinimumTest(unittest.TestCase):
np_x = np.random.rand(5, 4, 3, 2).astype("float64") np_x = np.random.rand(5, 4, 3, 2).astype("float64")
np_y = np.random.rand(4, 3).astype("float64") np_y = np.random.rand(4, 3).astype("float64")
x = paddle.to_variable(self.input_x) x = paddle.to_tensor(self.input_x)
y = paddle.to_variable(self.input_y) y = paddle.to_tensor(self.input_y)
result_1 = paddle.minimum(x, y, axis=1) result_1 = paddle.minimum(x, y, axis=1)
result_2 = paddle.minimum(x, y, axis=-2) result_2 = paddle.minimum(x, y, axis=-2)
self.assertEqual((result_1.numpy() == result_2.numpy()).all(), True) self.assertEqual((result_1.numpy() == result_2.numpy()).all(), True)
...@@ -205,8 +205,7 @@ class TestNNFunctionalMseLoss(unittest.TestCase): ...@@ -205,8 +205,7 @@ class TestNNFunctionalMseLoss(unittest.TestCase):
paddle.disable_static() paddle.disable_static()
dy_ret = paddle.nn.functional.mse_loss( dy_ret = paddle.nn.functional.mse_loss(
paddle.to_variable(input_np), paddle.to_tensor(input_np), paddle.to_tensor(target_np), 'mean')
paddle.to_variable(target_np), 'mean')
dy_result = dy_ret.numpy() dy_result = dy_ret.numpy()
sub = input_np - target_np sub = input_np - target_np
...@@ -240,8 +239,7 @@ class TestNNFunctionalMseLoss(unittest.TestCase): ...@@ -240,8 +239,7 @@ class TestNNFunctionalMseLoss(unittest.TestCase):
paddle.disable_static() paddle.disable_static()
dy_ret = paddle.nn.functional.mse_loss( dy_ret = paddle.nn.functional.mse_loss(
paddle.to_variable(input_np), paddle.to_tensor(input_np), paddle.to_tensor(target_np), 'sum')
paddle.to_variable(target_np), 'sum')
dy_result = dy_ret.numpy() dy_result = dy_ret.numpy()
sub = input_np - target_np sub = input_np - target_np
...@@ -275,8 +273,7 @@ class TestNNFunctionalMseLoss(unittest.TestCase): ...@@ -275,8 +273,7 @@ class TestNNFunctionalMseLoss(unittest.TestCase):
paddle.disable_static() paddle.disable_static()
dy_ret = paddle.nn.functional.mse_loss( dy_ret = paddle.nn.functional.mse_loss(
paddle.to_variable(input_np), paddle.to_tensor(input_np), paddle.to_tensor(target_np), 'none')
paddle.to_variable(target_np), 'none')
dy_result = dy_ret.numpy() dy_result = dy_ret.numpy()
sub = input_np - target_np sub = input_np - target_np
......
...@@ -909,8 +909,8 @@ class TestNLLLossInvalidArgs(unittest.TestCase): ...@@ -909,8 +909,8 @@ class TestNLLLossInvalidArgs(unittest.TestCase):
with fluid.dygraph.guard(): with fluid.dygraph.guard():
x_np = np.random.random(size=(5, )).astype(np.float64) x_np = np.random.random(size=(5, )).astype(np.float64)
label_np = np.random.randint(0, 10, size=(5, )).astype(np.int64) label_np = np.random.randint(0, 10, size=(5, )).astype(np.int64)
x = paddle.to_variable(x_np) x = paddle.to_tensor(x_np)
label = paddle.to_variable(label_np) label = paddle.to_tensor(label_np)
nll_loss = paddle.nn.loss.NLLLoss() nll_loss = paddle.nn.loss.NLLLoss()
res = nll_loss(x, label) res = nll_loss(x, label)
...@@ -933,8 +933,8 @@ class TestNLLLossInvalidArgs(unittest.TestCase): ...@@ -933,8 +933,8 @@ class TestNLLLossInvalidArgs(unittest.TestCase):
with fluid.dygraph.guard(): with fluid.dygraph.guard():
x_np = np.random.random(size=(5, 3)).astype(np.float64) x_np = np.random.random(size=(5, 3)).astype(np.float64)
label_np = np.random.randint(0, 3, size=(5, )).astype(np.int64) label_np = np.random.randint(0, 3, size=(5, )).astype(np.int64)
x = paddle.to_variable(x_np) x = paddle.to_tensor(x_np)
label = paddle.to_variable(label_np) label = paddle.to_tensor(label_np)
nll_loss = paddle.nn.loss.NLLLoss(reduction='') nll_loss = paddle.nn.loss.NLLLoss(reduction='')
res = nll_loss(x, label) res = nll_loss(x, label)
...@@ -957,8 +957,8 @@ class TestNLLLossInvalidArgs(unittest.TestCase): ...@@ -957,8 +957,8 @@ class TestNLLLossInvalidArgs(unittest.TestCase):
with fluid.dygraph.guard(): with fluid.dygraph.guard():
x_np = np.random.random(size=(5, 3)).astype(np.float64) x_np = np.random.random(size=(5, 3)).astype(np.float64)
label_np = np.random.randint(0, 3, size=(5, )).astype(np.int64) label_np = np.random.randint(0, 3, size=(5, )).astype(np.int64)
x = paddle.to_variable(x_np) x = paddle.to_tensor(x_np)
label = paddle.to_variable(label_np) label = paddle.to_tensor(label_np)
res = paddle.nn.functional.nll_loss(x, label, reduction='') res = paddle.nn.functional.nll_loss(x, label, reduction='')
self.assertRaises( self.assertRaises(
......
...@@ -101,9 +101,9 @@ def create_test_case(margin, reduction): ...@@ -101,9 +101,9 @@ def create_test_case(margin, reduction):
def run_dynamic_functional_api(self, place): def run_dynamic_functional_api(self, place):
paddle.disable_static(place) paddle.disable_static(place)
x = paddle.to_variable(self.x_data) x = paddle.to_tensor(self.x_data)
y = paddle.to_variable(self.y_data) y = paddle.to_tensor(self.y_data)
label = paddle.to_variable(self.label_data) label = paddle.to_tensor(self.label_data)
result = paddle.nn.functional.margin_ranking_loss(x, y, label, result = paddle.nn.functional.margin_ranking_loss(x, y, label,
margin, reduction) margin, reduction)
...@@ -117,9 +117,9 @@ def create_test_case(margin, reduction): ...@@ -117,9 +117,9 @@ def create_test_case(margin, reduction):
def run_dynamic_api(self, place): def run_dynamic_api(self, place):
paddle.disable_static(place) paddle.disable_static(place)
x = paddle.to_variable(self.x_data) x = paddle.to_tensor(self.x_data)
y = paddle.to_variable(self.y_data) y = paddle.to_tensor(self.y_data)
label = paddle.to_variable(self.label_data) label = paddle.to_tensor(self.label_data)
margin_rank_loss = paddle.nn.loss.MarginRankingLoss( margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
margin=margin, reduction=reduction) margin=margin, reduction=reduction)
result = margin_rank_loss(x, y, label) result = margin_rank_loss(x, y, label)
...@@ -134,9 +134,9 @@ def create_test_case(margin, reduction): ...@@ -134,9 +134,9 @@ def create_test_case(margin, reduction):
def run_dynamic_broadcast_api(self, place): def run_dynamic_broadcast_api(self, place):
paddle.disable_static(place) paddle.disable_static(place)
label_data = np.random.choice([-1, 1], size=[10]).astype("float64") label_data = np.random.choice([-1, 1], size=[10]).astype("float64")
x = paddle.to_variable(self.x_data) x = paddle.to_tensor(self.x_data)
y = paddle.to_variable(self.y_data) y = paddle.to_tensor(self.y_data)
label = paddle.to_variable(label_data) label = paddle.to_tensor(label_data)
margin_rank_loss = paddle.nn.loss.MarginRankingLoss( margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
margin=margin, reduction=reduction) margin=margin, reduction=reduction)
result = margin_rank_loss(x, y, label) result = margin_rank_loss(x, y, label)
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册