提交 03a2be6c 编写于 作者: M MrChengmo

for merge

......@@ -63,7 +63,28 @@ if(WIN32)
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif()
endforeach(flag_var)
endif()
# windows build turn off warnings.
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
endforeach(flag_var)
foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
set(${flag_var} "${${flag_var}} /w")
endforeach(flag_var)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")
......
......@@ -22,23 +22,8 @@ SET(CRYPTOPP_TAG CRYPTOPP_8_2_0)
IF(WIN32)
SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib" CACHE FILEPATH "cryptopp library." FORCE)
SET(CRYPTOPP_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
set(CompilerFlags
CMAKE_CXX_FLAGS
CMAKE_CXX_FLAGS_DEBUG
CMAKE_CXX_FLAGS_RELEASE
CMAKE_C_FLAGS
CMAKE_C_FLAGS_DEBUG
CMAKE_C_FLAGS_RELEASE
)
foreach(CompilerFlag ${CompilerFlags})
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
endforeach()
ELSE(WIN32)
SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a" CACHE FILEPATH "cryptopp library." FORCE)
SET(CRYPTOPP_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
ENDIF(WIN32)
set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}
......@@ -48,7 +33,7 @@ set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}
-DCMAKE_INSTALL_LIBDIR=${CRYPTOPP_INSTALL_DIR}/lib
-DCMAKE_INSTALL_PREFIX=${CRYPTOPP_INSTALL_DIR}
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_CXX_FLAGS=${CRYPTOPP_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
......@@ -90,20 +90,6 @@ macro(safe_set_nvflag flag_name)
endif()
endmacro()
macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared
if (BUILD_SHARED_LIBS)
return() # if build shared libs, the flags keep same with '/MD'
endif(BUILD_SHARED_LIBS)
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/MD")
endforeach(flag_var)
endmacro()
CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
if(NOT UINT64_MAX_EXISTS)
......@@ -229,20 +215,3 @@ endforeach()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")
if(WIN32)
# windows build turn off warnings.
if(MSVC_STATIC_CRT)
safe_set_static_flag()
endif()
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
endforeach(flag_var)
foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
set(${flag_var} "${${flag_var}} /w")
endforeach(flag_var)
endif()
......@@ -446,6 +446,9 @@ function(nv_library TARGET_NAME)
message(FATAL "Please specify source file or library in nv_library.")
endif()
endif(nv_library_SRCS)
if (WIN32)
set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
endif(WIN32)
endif()
endfunction(nv_library)
......@@ -461,6 +464,9 @@ function(nv_binary TARGET_NAME)
add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
common_link(${TARGET_NAME})
endif()
if (WIN32)
set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
endif(WIN32)
endif()
endfunction(nv_binary)
......@@ -482,6 +488,9 @@ function(nv_test TARGET_NAME)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
if (WIN32)
set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
endif(WIN32)
endif()
endfunction(nv_test)
......@@ -712,6 +721,7 @@ function(proto_library TARGET_NAME)
set(proto_hdrs)
paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
add_dependencies(extern_xxhash ${TARGET_NAME})
endfunction()
function(py_proto_compile TARGET_NAME)
......
......@@ -19,12 +19,11 @@ set(PADDLE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_install_dir" CACHE STRING
set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir" CACHE STRING
"A path setting paddle inference shared and static libraries")
# TODO(zhaolong)
# At present, the size of static lib in Windows exceeds the system limit,
# so the generation of static lib is temporarily turned off.
# At present, the size of static lib in Windows is very large,
# so we need to crop the library size.
if(WIN32)
#todo: remove the option
option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." OFF)
option(WITH_STATIC_LIB "Compile demo with static/shared library, default use dynamic." OFF)
if(NOT PYTHON_EXECUTABLE)
FIND_PACKAGE(PythonInterp REQUIRED)
endif()
......@@ -165,25 +164,22 @@ copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR})
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
if(WIN32)
if(WITH_STATIC_LIB)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib
${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.*)
else()
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.dll
${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.lib)
endif()
else(WIN32)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
endif(WIN32)
if(WIN32 AND NOT WITH_STATIC_LIB)
copy(inference_lib_dist
SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
else()
else(WIN32)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
copy(inference_lib_dist
SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
endif()
endif(WIN32)
copy(inference_lib_dist
SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
......@@ -199,7 +195,11 @@ set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_insta
copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_C_INSTALL_DIR})
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*)
if(WIN32)
set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/${CMAKE_BUILD_TYPE}/paddle_fluid_c.*)
else(WIN32)
set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*)
endif(WIN32)
copy(inference_lib_dist
SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_fluid_c_lib}
......@@ -211,12 +211,12 @@ add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps})
set(dst_dir "${PADDLE_INSTALL_DIR}/paddle/fluid")
set(module "inference")
if(WIN32 AND NOT WITH_STATIC_LIB)
if(WIN32)
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
)
else()
else()
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
......
......@@ -26,4 +26,7 @@ if(WITH_GPU)
set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG")
endif()
if(WIN32)
set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props)
endif()
......@@ -127,7 +127,8 @@ function(op_library TARGET)
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
"sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op")
"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
"fused_bn_add_activation_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
......
<?xml version="1.0" encoding="utf-8"?>
<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemDefinitionGroup>
<CudaCompile>
<!-- Project schema: Host properties -->
<UseHostDefines>true</UseHostDefines>
<Emulation>false</Emulation>
<HostDebugInfo Condition="'$(Configuration)' == 'Debug'">true</HostDebugInfo>
<HostDebugInfo Condition="'$(Configuration)' != 'Debug'">false</HostDebugInfo>
<FastMath>false</FastMath>
<Optimization>InheritFromHost</Optimization>
<Runtime>InheritFromHost</Runtime>
<RuntimeChecks>InheritFromHost</RuntimeChecks>
<TypeInfo>InheritFromHost</TypeInfo>
<Warning>InheritFromHost</Warning>
<BaseCommandLineTemplate>-ccbin "%(VCBinDir)" -x cu [GenerateRelocatableDeviceCode] [Include] [RequiredIncludes] [InterleaveSourceInPTX] [GPUDebugInfo] [GenerateLineInfo] [Keep] [KeepDir] [MaxRegCount] [PtxAsOptionV] [TargetMachinePlatform] [NvccCompilation] [CudaRuntime] [AdditionalOptions]</BaseCommandLineTemplate>
<BuildCommandLineTemplate>--use-local-env</BuildCommandLineTemplate>
<BuildDynamicCommandLineTemplate>[CodeGeneration]</BuildDynamicCommandLineTemplate>
<CleanCommandLineTemplate>-clean</CleanCommandLineTemplate>
<!-- <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] [ProgramDataBaseFileName] $(CudaForceSynchronousPdbWrites) /Zi [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate> -->
<HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] [ProgramDataBaseFileName] $(CudaForceSynchronousPdbWrites) [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate>
<DriverApiCommandLineTemplate>%(BaseCommandLineTemplate) [CompileOut] "%(FullPath)"</DriverApiCommandLineTemplate>
<RuntimeApiCommandLineTemplate>%(BaseCommandLineTemplate) [HostDebugInfo] [Emulation] [FastMath] [Defines] %(HostCommandLineTemplate) [CompileOut] "%(FullPath)"</RuntimeApiCommandLineTemplate>
<CommandLineTemplate>
# (Approximate command-line. Settings inherited from host are not visible below.)
# (Please see the output window after a build for the full command-line)
# Driver API (NVCC Compilation Type is .cubin, .gpu, or .ptx)
set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)"
"$(CudaToolkitNvccPath)" %(BuildCommandLineTemplate) %(DriverApiCommandLineTemplate)
# Runtime API (NVCC Compilation Type is hybrid object or .c file)
set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)"
"$(CudaToolkitNvccPath)" %(BuildCommandLineTemplate) %(RuntimeApiCommandLineTemplate)
</CommandLineTemplate>
<ExecutionDescription>Compiling CUDA source file %(Identity)...</ExecutionDescription>
<ExclusionDescription>Skipping CUDA source file %(Identity) (excluded from build).</ExclusionDescription>
<!-- Miscellaneous -->
<PropsCacheOutputFile>%(Filename)%(Extension).cache</PropsCacheOutputFile>
<PropsCacheOutputPath>$(IntDir)%(PropsCacheOutputFile)</PropsCacheOutputPath>
<CudaCompileCoreProject>$(MSBuildProjectFullPath)</CudaCompileCoreProject>
</CudaCompile>
<CudaLink>
<PerformDeviceLink>true</PerformDeviceLink>
<LinkOut>$(IntDir)$(TargetName).device-link.obj</LinkOut>
<AdditionalLibraryDirectories></AdditionalLibraryDirectories>
<UseHostLibraryDirectories>true</UseHostLibraryDirectories>
<AdditionalDependencies></AdditionalDependencies>
<UseHostLibraryDependencies>true</UseHostLibraryDependencies>
<GPUDebugInfo>InheritFromProject</GPUDebugInfo>
<Optimization>InheritFromProject</Optimization>
<!-- Implicitly inherited from the project via @(CudaCompile) -->
<CodeGeneration></CodeGeneration>
<RuntimeChecks></RuntimeChecks>
<Runtime></Runtime>
<TargetMachinePlatform></TargetMachinePlatform>
<TypeInfo></TypeInfo>
<Warning></Warning>
<Inputs></Inputs>
<!-- <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] /Zi [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate> -->
<HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate>
<LinkCommandLineTemplate>"$(CudaToolkitNvccPath)" -dlink [LinkOut] %(HostCommandLineTemplate) [AdditionalLibraryDirectories] [AdditionalDependencies] [AdditionalOptions] [CodeGeneration] [GPUDebugInfo] [TargetMachinePlatform] [Inputs]</LinkCommandLineTemplate>
<CommandLineTemplate>
# (Approximate command-line. Settings inherited from host are not visible below.)
# (Please see the output window after a build for the full command-line)
%(LinkCommandLineTemplate)
</CommandLineTemplate>
</CudaLink>
<Link>
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories);$(CudaToolkitLibDir)</AdditionalLibraryDirectories>
</Link>
<ClCompile>
<AdditionalIncludeDirectories>%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir)</AdditionalIncludeDirectories>
</ClCompile>
</ItemDefinitionGroup>
</Project>
......@@ -39,6 +39,7 @@ set(third_party_deps)
# REPOSITORY ${TARGET_REPOSITORY}
# TAG ${TARGET_TAG}
# DIR ${TARGET_SOURCE_DIR})
FUNCTION(cache_third_party TARGET)
SET(options "")
SET(oneValueArgs URL REPOSITORY TAG DIR)
......@@ -269,6 +270,10 @@ if(WITH_PSLIB)
endif()
endif(WITH_PSLIB)
if(NOT WIN32 AND NOT APPLE)
include(external/gloo)
list(APPEND third_party_deps extern_gloo)
endif()
if(WITH_BOX_PS)
include(external/box_ps)
......@@ -276,10 +281,6 @@ if(WITH_BOX_PS)
endif(WITH_BOX_PS)
if(WITH_DISTRIBUTE)
if(WITH_GLOO)
include(external/gloo)
list(APPEND third_party_deps extern_gloo)
endif()
if(WITH_GRPC)
list(APPEND third_party_deps extern_grpc)
......
......@@ -14,8 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/attribute.h"
#include <vector>
namespace paddle {
namespace framework {
......
......@@ -30,6 +30,8 @@ namespace paddle {
namespace framework {
class ProgramDesc;
class OpDesc;
class VarDesc;
// Each Protobuf Message, we provide a XXXBind class. In that class, we optimize
// read/write speed. Only when we want the protobuf message, the local changes
......
......@@ -12,17 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/c/c_api.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/init.h"
extern "C" {
......
......@@ -24,6 +24,15 @@ limitations under the License. */
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class OpInfoMap;
} // namespace framework
namespace platform {
class DeviceContextPool;
} // namespace platform
} // namespace paddle
#ifdef __cplusplus
extern "C" {
#endif
......
......@@ -277,7 +277,7 @@ class ChannelObject {
size_t finished = 0;
while (finished < n && WaitForWrite(lock)) {
size_t m =
std::min(n - finished, capacity_ + reading_count_ - data_.size());
(std::min)(n - finished, capacity_ + reading_count_ - data_.size());
for (size_t i = 0; i < m; i++) {
data_.push_back(std::move(p[finished++]));
}
......
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstring>
#include <random>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"
......
......@@ -21,6 +21,8 @@ limitations under the License. */
namespace paddle {
namespace framework {
class Tensor;
void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
Tensor* out);
......
......@@ -41,6 +41,15 @@ limitations under the License. */
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace framework {
class DataFeedDesc;
class LoDTensor;
class Scope;
class Variable;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
......@@ -418,6 +427,7 @@ class MultiSlotType {
std::string DebugString() {
std::stringstream ss;
ss << "\ntype: " << type_ << "\n";
ss << "offset: ";
ss << "[";
......
......@@ -17,10 +17,10 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/data_feed.h"
namespace paddle {
namespace framework {
class DataFeed;
typedef std::shared_ptr<DataFeed> (*Createdata_feedFunction)();
typedef std::unordered_map<std::string, Createdata_feedFunction> data_feedMap;
data_feedMap g_data_feed_map;
......
......@@ -16,10 +16,13 @@ limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/data_feed.h"
namespace paddle {
namespace framework {
class DataFeed;
class DataFeedFactory {
public:
static std::string DataFeedTypeList();
......
......@@ -13,8 +13,8 @@
// limitations under the License.
#include "paddle/fluid/framework/data_layout_transform.h"
#include <string>
#include <vector>
#include "paddle/fluid/operators/math/math_function.h"
#ifdef PADDLE_WITH_MKLDNN
......
......@@ -17,10 +17,18 @@
#include <map>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace framework {
class OpKernelType;
class Tensor;
} // namespace framework
} // namespace paddle
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
......
......@@ -15,7 +15,6 @@
#include "paddle/fluid/framework/data_layout_transform.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
TEST(DataTransform, DataLayoutFunction) {
auto place = paddle::platform::CPUPlace();
......
......@@ -18,8 +18,13 @@ limitations under the License. */
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/data_type_transform.h"
namespace paddle {
namespace framework {
class Variable;
} // namespace framework
} // namespace paddle
#ifdef PADDLE_WITH_MKLDNN
#include <algorithm>
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
......
......@@ -30,6 +30,10 @@ limitations under the License. */
namespace paddle {
namespace framework {
class OpKernelType;
class Tensor;
class Variable;
void TransformData(const OpKernelType &expected_kernel_type,
const OpKernelType &kernel_type_for_var,
const Tensor &input_tensor, Tensor *out);
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#include "paddle/fluid/framework/data_type.h"
#include <stdint.h>
#include <string>
#include <unordered_map>
......
......@@ -15,12 +15,19 @@ limitations under the License. */
#pragma once
#include <string>
#include <typeindex>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace platform {
struct bfloat16;
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
......
......@@ -14,9 +14,17 @@
#include "paddle/fluid/framework/data_type.h"
#include <string>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace platform {
struct bfloat16;
struct float16;
} // namespace platform
} // namespace paddle
TEST(DataType, float16) {
using paddle::framework::Tensor;
using paddle::platform::CPUPlace;
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <utility>
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
......@@ -23,6 +24,9 @@ limitations under the License. */
namespace paddle {
namespace framework {
class OpKernelType;
class Tensor;
using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
void TransDataType(const OpKernelType& kernel_type_for_var,
......
......@@ -13,12 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/dataset_factory.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/data_set.h"
namespace paddle {
namespace framework {
typedef std::unique_ptr<Dataset> (*CreateDatasetFunction)();
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/data_set.h"
namespace paddle {
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <stdexcept>
#include <string>
#include <vector>
#include "paddle/fluid/framework/dim.h"
namespace paddle {
......
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <sstream>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/ddim.h"
......
......@@ -20,6 +20,17 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class NCCLCommunicator;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/framework/details/nccl_op_handle.h"
#include "paddle/fluid/platform/nccl_helper.h"
......
......@@ -24,6 +24,20 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct VarHandle;
} // namespace details
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
struct NCCLContextMap;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......
......@@ -21,13 +21,15 @@
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct DummyVarHandle;
struct VarHandle;
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -27,6 +27,18 @@
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
class Graph;
class PassBuilder;
} // namespace ir
} // namespace framework
namespace platform {
class NCCLCommunicator;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......
......@@ -19,6 +19,8 @@
namespace paddle {
namespace framework {
namespace details {
struct VarHandleBase;
ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
platform::Place place,
size_t scope_idx)
......
......@@ -24,9 +24,21 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class OperatorBase;
class Scope;
namespace ir {
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
struct VarHandleBase;
class ComputationOpHandle : public OpHandleBase {
public:
ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
......
......@@ -19,12 +19,23 @@
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
class Scope;
class GarbageCollector;
namespace ir {
class Node;
} // namespace ir
namespace ir {
class MemOptVarInfo;
......
......@@ -13,8 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/exception_holder.h"
#include <memory>
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator.h"
......
......@@ -15,9 +15,14 @@
#include "paddle/fluid/framework/details/fetch_async_op_handle.h"
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -22,6 +22,18 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class LoDTensor;
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -19,6 +19,8 @@
namespace paddle {
namespace framework {
namespace details {
struct VarHandleBase;
FetchBarrierOpHandle::FetchBarrierOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
......
......@@ -24,6 +24,15 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class Scope;
namespace ir {
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......@@ -32,6 +41,8 @@ namespace details {
// all places if there are multiple places, must init with
// multiple dev_ctxes_ !!!!
struct VarHandleBase;
struct FetchBarrierOpHandle : public OpHandleBase {
public:
FetchBarrierOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
......
......@@ -22,6 +22,17 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -17,10 +17,22 @@
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class NCCLCommunicator;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/framework/details/nccl_op_handle.h"
#include "paddle/fluid/platform/nccl_helper.h"
......
......@@ -25,6 +25,17 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
struct NCCLContextMap;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......
......@@ -17,11 +17,20 @@
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
struct VarHandle;
struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
std::vector<std::string> out_varnames_;
std::vector<std::unique_ptr<ir::Node>> nodes_;
......
......@@ -24,6 +24,14 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -17,11 +17,11 @@
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct DummyVarHandle;
namespace f = paddle::framework;
namespace p = paddle::platform;
......
......@@ -20,16 +20,21 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
class OpDesc;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
......
......@@ -19,6 +19,12 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
class Tensor;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -24,10 +24,22 @@
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
class Scope;
namespace details {
struct VarHandleBase;
} // namespace details
namespace ir {
class Node;
} // namespace ir
namespace details {
......
......@@ -24,6 +24,21 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class SelectedRows;
namespace details {
struct VarHandle;
} // namespace details
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
struct NCCLContextMap;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......
......@@ -24,6 +24,16 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
class OpDesc;
class Scope;
namespace ir {
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -13,9 +13,17 @@
// limitations under the License.
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include <string>
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
class Tensor;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -21,6 +21,18 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
class Scope;
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -17,7 +17,9 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace details {
......
......@@ -22,6 +22,17 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
class Scope;
class Tensor;
class Variable;
namespace ir {
class MemOptVarInfo;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -25,6 +25,15 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace framework {
class Scope;
namespace ir {
class MemOptVarInfo;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -23,10 +23,20 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
class MemOptVarInfo;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
class ComputationOpHandle;
ComputationOpHandle *GetUniquePendingComputationOpHandle(
ShareTensorBufferOpHandle *share_tensor_op) {
ComputationOpHandle *result_op = nullptr;
......
......@@ -22,10 +22,22 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
namespace paddle {
namespace framework {
class Scope;
namespace ir {
class MemOptVarInfo;
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
class ComputationOpHandle;
class ShareTensorBufferOpHandle : public OpHandleBase {
public:
ShareTensorBufferOpHandle(
......
......@@ -23,6 +23,17 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/nccl_helper.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class NCCLCommunicator;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -24,6 +24,14 @@
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -15,6 +15,14 @@
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/framework/selected_rows.h"
namespace paddle {
namespace framework {
class LoDTensor;
class Variable;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -17,6 +17,13 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace framework {
class Tensor;
class Variable;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -13,11 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/device_worker.h"
#include "xxhash.h" // NOLINT
namespace paddle {
namespace framework {
class LoDTensor;
class Scope;
void DeviceWorker::SetRootScope(Scope* root_scope) { root_scope_ = root_scope; }
void DeviceWorker::SetDataFeed(DataFeed* data_feed) {
......
......@@ -39,6 +39,18 @@ limitations under the License. */
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/timer.h"
namespace paddle {
namespace framework {
class LoDTensor;
class ProgramDesc;
class Scope;
class Tensor;
} // namespace framework
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......@@ -62,7 +74,9 @@ class PullDenseWorker {
virtual void Initialize(const TrainerDesc& param);
#ifdef PADDLE_WITH_CUDA
void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
#endif
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
void AddPlace(const paddle::platform::Place place) {
places_.push_back(place);
}
......@@ -123,9 +137,9 @@ class PullDenseWorker {
#ifdef PADDLE_WITH_CUDA
std::vector<cudaStream_t> copy_streams_;
#endif
std::vector<paddle::platform::Place> places_;
std::vector<Scope*> thread_scopes_;
#endif
};
// should incorporate different type of device
......@@ -149,6 +163,7 @@ class DeviceWorker {
virtual void SetDataFeed(DataFeed* data_feed);
virtual void SetWorkerNum(int num) {}
virtual void CacheProgram(const ProgramDesc& main_program) {}
virtual void GetXpuOpIndex() {}
virtual void SetNeedDumpField(bool need_dump_field) {
need_dump_field_ = need_dump_field;
}
......
......@@ -20,6 +20,8 @@ limitations under the License. */
namespace paddle {
namespace framework {
class DeviceWorker;
typedef std::shared_ptr<DeviceWorker> (*Createdevice_workerFunction)();
typedef std::unordered_map<std::string, Createdevice_workerFunction>
device_workerMap;
......
......@@ -16,11 +16,14 @@ limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/device_worker.h"
namespace paddle {
namespace framework {
class DeviceWorker;
class DeviceWorkerFactory {
public:
static std::string DeviceWorkerTypeList();
......
......@@ -13,9 +13,10 @@
// limitations under the License.
#include "paddle/fluid/framework/device_worker.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/trainer.h"
namespace paddle {
namespace framework {
......
......@@ -128,6 +128,7 @@ message DistributedStrategy {
optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];
optional bool adaptive_localsgd = 24 [ default = false ];
optional bool fp16_allreduce = 25 [ default = false ];
optional RecomputeConfig recompute_configs = 101;
optional AMPConfig amp_configs = 102;
......
......@@ -11,10 +11,17 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/dlpack_tensor.h"
#include <unordered_map>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/dlpack_tensor.h"
namespace paddle {
namespace platform {
struct bfloat16;
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
......
......@@ -15,11 +15,14 @@
#pragma once
#include <dlpack/dlpack.h>
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace framework {
class Tensor;
class DLPackTensor {
public:
using LaneType = decltype(::DLTensor::dtype.lanes); // uint16_t
......
......@@ -17,6 +17,12 @@
#include <gtest/gtest.h>
#include <vector>
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
......
......@@ -13,10 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace framework {
class LoDTensor;
class Variable;
} // namespace framework
} // namespace paddle
#if defined _WIN32 || defined __APPLE__
#else
......
......@@ -12,18 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <set>
#include <unordered_map>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/lodtensor_printer.h"
namespace paddle {
namespace framework {
class OpDesc;
class OperatorBase;
class ProgramDesc;
bool HasDependentOutput(const OpDesc& op_desc,
const std::unordered_set<std::string>& dependent_vars) {
for (auto& var : op_desc.Outputs()) {
......
......@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
#include <stdint.h>
#include "paddle/fluid/framework/tensor.h"
#include "unsupported/Eigen/CXX11/Tensor"
......
......@@ -19,6 +19,7 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/garbage_collector.h"
......@@ -32,6 +33,11 @@ limitations under the License. */
namespace paddle {
namespace framework {
class Dataset;
class ProgramDesc;
class Scope;
class TrainerBase;
struct ExecutorPrepareContext {
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
......
......@@ -13,16 +13,19 @@
// limitations under the License.
#include "paddle/fluid/framework/executor_gc_helper.h"
#include <deque>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......
......@@ -18,6 +18,7 @@
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
......@@ -26,6 +27,10 @@ namespace paddle {
namespace framework {
// Result map: op -> variable names that can be deleted after op runs
class GarbageCollector;
class OperatorBase;
class Scope;
std::unordered_map<const OperatorBase *, std::vector<std::string>>
GetUnusedVars(const BlockDesc &block,
const std::vector<std::unique_ptr<OperatorBase>> &ops,
......
......@@ -13,16 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h"
#include <string>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
class LoDTensor;
class Variable;
void SetFeedVariable(Scope* scope, const LoDTensor& input,
const std::string& var_name, size_t index) {
// If var_name Variable is not found in GlobalScope, a new variable will
......
......@@ -15,12 +15,16 @@ limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
class LoDTensor;
class Scope;
void SetFeedVariable(Scope* scope, const LoDTensor& input,
const std::string& var_name, size_t index);
......
......@@ -29,12 +29,6 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include <algorithm>
#include <utility>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/timer.h"
namespace paddle {
namespace framework {
......@@ -751,7 +745,57 @@ void FleetWrapper::PushDenseVarsAsync(
push_sparse_status->push_back(std::move(status));
}
}
#endif
#ifdef PADDLE_WITH_XPU
void FleetWrapper::PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
std::vector<::std::future<int32_t>>* push_sparse_status,
float scale_datanorm, int batch_size,
const paddle::platform::Place& place) {
#ifdef PADDLE_WITH_PSLIB
std::vector<paddle::ps::Region> regions;
for (auto& t : var_names) {
Variable* var = scope.FindVar(t);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
int count = tensor->numel();
float* g_data = tensor->data<float>();
Variable* pin_var = scope.FindVar(t + "pin");
LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
float* pin_g =
pin_tensor->mutable_data<float>(tensor->dims(), platform::CPUPlace());
memory::Copy(platform::CPUPlace(), pin_g,
BOOST_GET_CONST(platform::XPUPlace, place), g_data,
sizeof(float) * count);
float* g = pin_g;
if (scale_datanorm >= 0) {
if (t.find(".batch_size@GRAD") != std::string::npos ||
t.find(".batch_sum@GRAD") != std::string::npos) {
Eigen::Map<Eigen::MatrixXf> mat(g, 1, count);
float scale = 1.0 / batch_size;
mat *= scale;
} else if (t.find(".batch_square_sum@GRAD") != std::string::npos) {
VLOG(3) << "epsilon: " << scale_datanorm;
for (int i = 0; i < count; ++i) {
g[i] = (g[i] - batch_size * scale_datanorm) / batch_size +
batch_size * scale_datanorm;
}
}
}
paddle::ps::Region reg(g, count);
regions.emplace_back(std::move(reg));
}
auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(),
regions.size(), table_id);
if (push_sparse_status) {
push_sparse_status->push_back(std::move(status));
}
#endif
}
#endif
void FleetWrapper::PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
......
......@@ -35,6 +35,12 @@ limitations under the License. */
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
......@@ -154,6 +160,14 @@ class FleetWrapper {
float scale_datanorm, int batch_size,
const paddle::platform::Place& place, cudaStream_t stream,
cudaEvent_t event);
#endif
#ifdef PADDLE_WITH_XPU
void PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
std::vector<::std::future<int32_t>>* push_sparse_status,
float scale_datanorm, int batch_size,
const paddle::platform::Place& place);
#endif
void PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
......
......@@ -10,10 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/string/string_helper.h"
namespace gloo {
......
......@@ -27,15 +27,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/fleet/heter_wrapper.h"
#include <algorithm>
#include <utility>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/timer.h"
#ifdef PADDLE_WITH_PSLIB
namespace paddle {
......@@ -122,29 +113,66 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
if (platform::is_cpu_place(tensor->place())) {
memcpy(data_ptr, tensor->data<void>(),
tensor->numel() * SizeOfType(tensor->type()));
}
} else {
#ifdef PADDLE_WITH_CUDA
else {
memory::Copy(platform::CPUPlace(), data_ptr,
BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
tensor->data<void>(),
tensor->numel() * SizeOfType(tensor->type()), nullptr);
}
#endif
#ifdef PADDLE_WITH_XPU
memory::Copy(platform::CPUPlace(), data_ptr,
BOOST_GET_CONST(platform::XPUPlace, tensor->place()),
tensor->data<void>(),
tensor->numel() * SizeOfType(tensor->type()));
#endif
}
}
// void HeterWrapper::DeSerializeToTensor(Scope* scope,
// const HeterRequest* request) {
#ifdef PADDLE_WITH_CUDA
void HeterWrapper::DeSerializeToTensor(Scope* scope,
const VariableMessage& req_var,
platform::Place place,
cudaStream_t stream) {
// const VariableMessage& req_var = request->vars();
auto* var = scope->FindVar(req_var.varname());
auto* tensor = var->GetMutable<LoDTensor>();
std::vector<int> vec_dim;
for (auto& x : req_var.dims()) {
vec_dim.push_back(x);
}
tensor->Resize(make_ddim(vec_dim));
LoD lod;
for (int i = 0; i < req_var.lod_level(); ++i) {
framework::Vector<size_t> v;
for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) {
v.push_back(req_var.lod(i).lod_data(j));
}
lod.push_back(v);
}
tensor->set_lod(lod);
void* tensor_data =
tensor->mutable_data(place, ToVarType(req_var.data_type()));
#ifdef PADDLE_WITH_CUDA
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
platform::CPUPlace(), req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()), stream);
#else
memcpy(tensor_data, req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()));
#endif
}
#endif
// void HeterWrapper::DeSerializeToTensor(Scope* scope,
// const HeterRequest* request) {
void HeterWrapper::DeSerializeToTensor(Scope* scope,
const VariableMessage& req_var,
platform::Place place) {
#endif
// const VariableMessage& req_var = request->vars();
auto* var = scope->FindVar(req_var.varname());
auto* tensor = var->GetMutable<LoDTensor>();
......@@ -168,10 +196,10 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
void* tensor_data =
tensor->mutable_data(place, ToVarType(req_var.data_type()));
#ifdef PADDLE_WITH_CUDA
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
#ifdef PADDLE_WITH_XPU
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place), tensor_data,
platform::CPUPlace(), req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()), stream);
tensor->numel() * SizeOfType(tensor->type()));
#else
memcpy(tensor_data, req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()));
......@@ -192,7 +220,8 @@ framework::proto::VarType::Type HeterWrapper::ToVarType(
case VariableMessage::BOOL:
return framework::proto::VarType::BOOL; // NOLINT
default:
VLOG(0) << "Not support type " << type;
PADDLE_THROW(platform::errors::InvalidArgument(
"ToVarType:Unsupported type %d", type));
}
}
......@@ -239,7 +268,7 @@ void HeterWrapper::CallRemoteXpu(std::shared_ptr<HeterTask> task,
request.set_cur_batch(task->cur_batch_);
OnHeterRpcDone* done = new OnHeterRpcDone([this, task, worker](void* done) {
auto* closure = (OnHeterRpcDone*)done;
auto* closure = reinterpret_cast<OnHeterRpcDone*>(done);
if (closure->cntl.Failed()) {
VLOG(0) << "call xpu fail: " << closure->cntl.ErrorText();
} else {
......
......@@ -13,9 +13,6 @@
// limitations under the License.
#include "paddle/fluid/framework/fleet/nccl_wrapper.h"
#include <utility>
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include <random>
#include <string>
#include <vector>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable_helper.h"
......@@ -29,6 +30,12 @@ limitations under the License. */
#endif
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
......
......@@ -19,9 +19,16 @@
#include <memory>
#include <mutex> // NOLINT
#include <utility>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
......
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <glog/logging.h>
#include <stdint.h>
#include <atomic>
#include <deque>
#include <iostream> // temp for debug
......
......@@ -12,13 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/framework/fleet/heter_wrapper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/string/string_helper.h"
#ifdef PADDLE_WITH_PSLIB
#if defined _WIN32 || defined __APPLE__
......
......@@ -22,9 +22,11 @@ limitations under the License. */
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/framework/trainer.h"
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
namespace paddle {
namespace framework {
......@@ -44,6 +46,7 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
int place_num = trainer_desc.worker_places_size();
for (int i = 0; i < place_num; ++i) {
int num = trainer_desc.worker_places(i);
#ifdef PADDLE_WITH_CUDA
platform::CUDAPlace place = platform::CUDAPlace(num);
platform::CUDADeviceGuard guard(place.device);
cudaStream_t stream;
......@@ -54,6 +57,11 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
events_.push_back(event);
#endif
#ifdef PADDLE_WITH_XPU
platform::XPUPlace place = platform::XPUPlace(num);
places_.push_back(place);
#endif
}
// thread_num_ = trainer_desc.thread_num();
// SetDataset(dataset);
......@@ -105,11 +113,17 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
auto place = places_[num];
Scope* scope = place_scopes_[num];
#ifdef PADDLE_WITH_CUDA
auto stream = copy_streams_[num];
auto event = events_[num];
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
platform::CUDADeviceGuard guard(dev_id);
#endif
#ifdef PADDLE_WITH_XPU
xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
#endif
auto& block = program.Block(0);
for (auto& var : block.AllVars()) {
if (var->Persistable()) {
......@@ -126,13 +140,28 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place, stream); \
} \
} while (0)
#define HeterMemcpyXpuFunc(cpp_type, proto_type) \
do { \
if (root_tensor->type() == proto_type) { \
HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place); \
} \
} while (0)
#ifdef PADDLE_WITH_CUDA
_ForEachDataType_(HeterMemcpyFunc);
#endif
#ifdef PADDLE_WITH_XPU
_ForEachDataType_(HeterMemcpyXpuFunc);
#endif
}
}
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
cudaEventSynchronize(event);
#endif
}
#ifdef PADDLE_WITH_CUDA
template <typename T>
void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
LoDTensor* root_tensor,
......@@ -151,6 +180,27 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
root_ptr, sizeof(T) * root_tensor->numel(), stream);
}
}
#endif
#ifdef PADDLE_WITH_XPU
template <typename T>
void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
LoDTensor* root_tensor,
const paddle::platform::Place& thread_place) {
T* thread_ptr =
thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
T* root_ptr = root_tensor->data<T>();
if (platform::is_cpu_place(root_tensor->place())) {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr,
platform::CPUPlace(), root_ptr,
sizeof(T) * root_tensor->numel());
} else {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr,
BOOST_GET_CONST(platform::XPUPlace, root_tensor->place()),
root_ptr, sizeof(T) * root_tensor->numel());
}
}
#endif
void HeterXpuTrainer::DumpWork(int tid) {}
......@@ -181,13 +231,16 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
CreateThreadParam(main_program, i);
pull_dense_worker_->AddThreadScope(scope);
pull_dense_worker_->AddPlace(places_[i]);
#ifdef PADDLE_WITH_CUDA
pull_dense_worker_->AddStream(copy_streams_[i]);
#endif
}
pull_dense_worker_->Start();
#ifdef PADDLE_WITH_CUDA
for (auto& stream : copy_streams_) {
cudaStreamSynchronize(stream);
}
#endif
op_names_.clear();
for (auto& op_desc : block.AllOps()) {
std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
......@@ -230,10 +283,12 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
OperatorBase* local_op_ptr = local_op.release();
(context->ops_).push_back(local_op_ptr);
}
#ifdef PADDLE_WITH_CUDA
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
platform::CUDADeviceGuard guard(dev_id);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
#endif
object_pool_.Push(context);
}
}
......@@ -277,12 +332,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
} \
} while (0)
_ForEachDataType_(MergeCallback);
if (platform::is_gpu_place(thread_tensor->place())) {
if (!platform::is_cpu_place(thread_tensor->place())) {
#ifdef PADDLE_WITH_CUDA
auto dev_id =
BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device;
platform::CUDADeviceGuard guard(dev_id);
cudaMemset(thread_tensor->data<void>(), 0,
thread_tensor->numel() * SizeOfType(thread_tensor->type()));
#endif
#ifdef PADDLE_WITH_XPU
auto place = thread_tensor->place();
xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
platform::DeviceContext* dev_ctx = pool.Get(place);
const platform::XPUDeviceContext* xpu_ctx =
reinterpret_cast<const platform::XPUDeviceContext*>(dev_ctx);
xpu::memset(xpu_ctx->x_context(), thread_tensor->data<void>(), 0,
thread_tensor->numel() * SizeOfType(thread_tensor->type()));
#endif
} else {
memset(thread_tensor->data<void>(), 0,
thread_tensor->numel() * SizeOfType(thread_tensor->type()));
......@@ -291,12 +359,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
auto* merge_var = response->add_vars();
heter_ptr_->SerializeToReq(need_merge_var_names_[i], root_scope_,
merge_var);
if (platform::is_gpu_place(root_tensor->place())) {
if (!platform::is_cpu_place(root_tensor->place())) {
#ifdef PADDLE_WITH_CUDA
auto dev_id =
BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device;
platform::CUDADeviceGuard guard(dev_id);
cudaMemset(root_tensor->data<void>(), 0,
root_tensor->numel() * SizeOfType(root_tensor->type()));
#endif
#ifdef PADDLE_WITH_XPU
auto place = root_tensor->place();
xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
platform::DeviceContext* dev_ctx = pool.Get(place);
const platform::XPUDeviceContext* xpu_ctx =
reinterpret_cast<const platform::XPUDeviceContext*>(dev_ctx);
xpu::memset(xpu_ctx->x_context(), root_tensor->data<void>(), 0,
root_tensor->numel() * SizeOfType(root_tensor->type()));
#endif
} else {
memset(root_tensor->data<void>(), 0,
root_tensor->numel() * SizeOfType(root_tensor->type()));
......@@ -334,7 +415,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
std::shared_ptr<HeterServiceContext> context = object_pool_.Get();
if (!context->scope_) {
int num = rand() % places_.size();
int num = rand_r() % places_.size();
context->place_num_ = num;
auto place = places_[num];
context->scope_ = &(place_scopes_[num]->NewScope());
......@@ -356,11 +437,12 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
OperatorBase* local_op_ptr = local_op.release();
(context->ops_).push_back(local_op_ptr);
}
#ifdef PADDLE_WITH_CUDA
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
platform::CUDADeviceGuard guard(dev_id);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
#endif
}
context->Reset();
......@@ -369,15 +451,22 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
auto deserial_timer =
std::make_shared<paddle::ps::CostTimer>("xpu_service_deserial");
for (int i = 0; i < request->vars_size(); ++i) {
#ifdef PADDLE_WITH_CUDA
heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place,
copy_streams_[context->place_num_]);
#endif
#ifdef PADDLE_WITH_XPU
heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place);
#endif
}
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventRecord(context->event_, copy_streams_[context->place_num_]));
while (cudaEventQuery(context->event_) != cudaSuccess) {
VLOG(3) << "wait for kernel";
bthread_yield();
}
#endif
}
{
......@@ -388,6 +477,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
op->Run(*(context->scope_), place);
}
}
#ifdef PADDLE_WITH_CUDA
auto* dev_ctx = static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
PADDLE_ENFORCE_CUDA_SUCCESS(
......@@ -401,6 +491,10 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
bthread_yield();
}
}
#endif
#ifdef PADDLE_WITH_XPU
xpu_wait();
#endif
for (int i = 0; i < trainer_desc_.xpu_send_list_size(); ++i) {
const std::string& varname = trainer_desc_.xpu_send_list(i);
......@@ -417,11 +511,19 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
++i) {
uint64_t tid =
static_cast<uint64_t>(param_.program_config(0).push_dense_table_id(i));
#ifdef PADDLE_WITH_CUDA
fleet_ptr_->PushDenseVarsAsync(
*(context->scope_), tid, dense_grad_names_[tid],
&(context->push_dense_status_), scale_datanorm_, request->cur_batch(),
places_[context->place_num_], copy_streams_[context->place_num_],
context->event_);
#endif
#ifdef PADDLE_WITH_XPU
fleet_ptr_->PushDenseVarsAsync(
*(context->scope_), tid, dense_grad_names_[tid],
&(context->push_dense_status_), scale_datanorm_, request->cur_batch(),
places_[context->place_num_]);
#endif
}
for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
++i) {
......@@ -463,7 +565,6 @@ void HeterXpuTrainer::Finalize() {
pull_dense_worker_->Stop();
root_scope_->DropKids();
}
} // namespace framework
} // namespace paddle
#endif
......@@ -15,6 +15,7 @@
#pragma once
#include <cstdint>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......
......@@ -13,10 +13,10 @@
// limitations under the License.
#include "paddle/fluid/framework/inlined_vector.h"
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>
#include "gtest/gtest.h"
namespace paddle {
......
......@@ -15,8 +15,6 @@
#include "paddle/fluid/framework/io/crypto/cipher_utils.h"
#include <cryptopp/osrng.h>
#include <fstream>
#include <sstream>
#include "paddle/fluid/platform/enforce.h"
......
......@@ -17,6 +17,7 @@
#include <sstream>
#include <string>
#include <unordered_map>
namespace paddle {
namespace framework {
......
......@@ -18,6 +18,7 @@
#include <memory>
#include <string>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/io/shell.h"
#include "paddle/fluid/string/string_helper.h"
......
......@@ -32,6 +32,7 @@
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/string/string_helper.h"
......
......@@ -20,6 +20,8 @@ namespace paddle {
namespace framework {
namespace ir {
class Graph;
class AttentionLSTMFusePass : public FusePassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册