提交 8def0e34 编写于 作者: S seiriosPlus

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into...

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/large_scale_kv_save_delta
......@@ -4,7 +4,7 @@ endif()
INCLUDE(ExternalProject)
SET(XPU_PROJECT "extern_xpu")
SET(XPU_URL "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE)
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu.tar.gz" CACHE STRING "" FORCE)
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
......
......@@ -446,6 +446,9 @@ function(nv_library TARGET_NAME)
message(FATAL "Please specify source file or library in nv_library.")
endif()
endif(nv_library_SRCS)
if (WIN32)
set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
endif(WIN32)
endif()
endfunction(nv_library)
......@@ -461,6 +464,9 @@ function(nv_binary TARGET_NAME)
add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
common_link(${TARGET_NAME})
endif()
if (WIN32)
set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
endif(WIN32)
endif()
endfunction(nv_binary)
......@@ -482,6 +488,9 @@ function(nv_test TARGET_NAME)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
if (WIN32)
set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
endif(WIN32)
endif()
endfunction(nv_test)
......@@ -712,6 +721,7 @@ function(proto_library TARGET_NAME)
set(proto_hdrs)
paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
add_dependencies(extern_xxhash ${TARGET_NAME})
endfunction()
function(py_proto_compile TARGET_NAME)
......
......@@ -19,9 +19,8 @@ set(PADDLE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_install_dir" CACHE STRING
set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir" CACHE STRING
"A path setting paddle inference shared and static libraries")
# TODO(zhaolong)
# At present, the size of static lib in Windows exceeds the system limit,
# so the generation of static lib is temporarily turned off.
# At present, the size of static lib in Windows is very large,
# so we need to crop the library size.
if(WIN32)
#todo: remove the option
option(WITH_STATIC_LIB "Compile demo with static/shared library, default use dynamic." OFF)
......@@ -196,7 +195,11 @@ set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_insta
copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_C_INSTALL_DIR})
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*)
if(WIN32)
set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/${CMAKE_BUILD_TYPE}/paddle_fluid_c.*)
else(WIN32)
set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*)
endif(WIN32)
copy(inference_lib_dist
SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_fluid_c_lib}
......
......@@ -26,4 +26,7 @@ if(WITH_GPU)
set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG")
endif()
if(WIN32)
set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props)
endif()
......@@ -62,9 +62,9 @@ function(op_library TARGET)
endif()
endif()
if(WITH_XPU)
string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc)
list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc)
string(REPLACE "_op" "_op_xpu" XPU_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU_FILE}.cc)
list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
endif()
endif()
else()
......@@ -83,7 +83,7 @@ function(op_library TARGET)
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$")
elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
list(APPEND xpu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
......
<?xml version="1.0" encoding="utf-8"?>
<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemDefinitionGroup>
<CudaCompile>
<!-- Project schema: Host properties -->
<UseHostDefines>true</UseHostDefines>
<Emulation>false</Emulation>
<HostDebugInfo Condition="'$(Configuration)' == 'Debug'">true</HostDebugInfo>
<HostDebugInfo Condition="'$(Configuration)' != 'Debug'">false</HostDebugInfo>
<FastMath>false</FastMath>
<Optimization>InheritFromHost</Optimization>
<Runtime>InheritFromHost</Runtime>
<RuntimeChecks>InheritFromHost</RuntimeChecks>
<TypeInfo>InheritFromHost</TypeInfo>
<Warning>InheritFromHost</Warning>
<BaseCommandLineTemplate>-ccbin "%(VCBinDir)" -x cu [GenerateRelocatableDeviceCode] [Include] [RequiredIncludes] [InterleaveSourceInPTX] [GPUDebugInfo] [GenerateLineInfo] [Keep] [KeepDir] [MaxRegCount] [PtxAsOptionV] [TargetMachinePlatform] [NvccCompilation] [CudaRuntime] [AdditionalOptions]</BaseCommandLineTemplate>
<BuildCommandLineTemplate>--use-local-env</BuildCommandLineTemplate>
<BuildDynamicCommandLineTemplate>[CodeGeneration]</BuildDynamicCommandLineTemplate>
<CleanCommandLineTemplate>-clean</CleanCommandLineTemplate>
<!-- <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] [ProgramDataBaseFileName] $(CudaForceSynchronousPdbWrites) /Zi [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate> -->
<HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] [ProgramDataBaseFileName] $(CudaForceSynchronousPdbWrites) [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate>
<DriverApiCommandLineTemplate>%(BaseCommandLineTemplate) [CompileOut] "%(FullPath)"</DriverApiCommandLineTemplate>
<RuntimeApiCommandLineTemplate>%(BaseCommandLineTemplate) [HostDebugInfo] [Emulation] [FastMath] [Defines] %(HostCommandLineTemplate) [CompileOut] "%(FullPath)"</RuntimeApiCommandLineTemplate>
<CommandLineTemplate>
# (Approximate command-line. Settings inherited from host are not visible below.)
# (Please see the output window after a build for the full command-line)
# Driver API (NVCC Compilation Type is .cubin, .gpu, or .ptx)
set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)"
"$(CudaToolkitNvccPath)" %(BuildCommandLineTemplate) %(DriverApiCommandLineTemplate)
# Runtime API (NVCC Compilation Type is hybrid object or .c file)
set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)"
"$(CudaToolkitNvccPath)" %(BuildCommandLineTemplate) %(RuntimeApiCommandLineTemplate)
</CommandLineTemplate>
<ExecutionDescription>Compiling CUDA source file %(Identity)...</ExecutionDescription>
<ExclusionDescription>Skipping CUDA source file %(Identity) (excluded from build).</ExclusionDescription>
<!-- Miscellaneous -->
<PropsCacheOutputFile>%(Filename)%(Extension).cache</PropsCacheOutputFile>
<PropsCacheOutputPath>$(IntDir)%(PropsCacheOutputFile)</PropsCacheOutputPath>
<CudaCompileCoreProject>$(MSBuildProjectFullPath)</CudaCompileCoreProject>
</CudaCompile>
<CudaLink>
<PerformDeviceLink>true</PerformDeviceLink>
<LinkOut>$(IntDir)$(TargetName).device-link.obj</LinkOut>
<AdditionalLibraryDirectories></AdditionalLibraryDirectories>
<UseHostLibraryDirectories>true</UseHostLibraryDirectories>
<AdditionalDependencies></AdditionalDependencies>
<UseHostLibraryDependencies>true</UseHostLibraryDependencies>
<GPUDebugInfo>InheritFromProject</GPUDebugInfo>
<Optimization>InheritFromProject</Optimization>
<!-- Implicitly inherited from the project via @(CudaCompile) -->
<CodeGeneration></CodeGeneration>
<RuntimeChecks></RuntimeChecks>
<Runtime></Runtime>
<TargetMachinePlatform></TargetMachinePlatform>
<TypeInfo></TypeInfo>
<Warning></Warning>
<Inputs></Inputs>
<!-- <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] /Zi [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate> -->
<HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate>
<LinkCommandLineTemplate>"$(CudaToolkitNvccPath)" -dlink [LinkOut] %(HostCommandLineTemplate) [AdditionalLibraryDirectories] [AdditionalDependencies] [AdditionalOptions] [CodeGeneration] [GPUDebugInfo] [TargetMachinePlatform] [Inputs]</LinkCommandLineTemplate>
<CommandLineTemplate>
# (Approximate command-line. Settings inherited from host are not visible below.)
# (Please see the output window after a build for the full command-line)
%(LinkCommandLineTemplate)
</CommandLineTemplate>
</CudaLink>
<Link>
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories);$(CudaToolkitLibDir)</AdditionalLibraryDirectories>
</Link>
<ClCompile>
<AdditionalIncludeDirectories>%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir)</AdditionalIncludeDirectories>
</ClCompile>
</ItemDefinitionGroup>
</Project>
......@@ -39,6 +39,7 @@ set(third_party_deps)
# REPOSITORY ${TARGET_REPOSITORY}
# TAG ${TARGET_TAG}
# DIR ${TARGET_SOURCE_DIR})
FUNCTION(cache_third_party TARGET)
SET(options "")
SET(oneValueArgs URL REPOSITORY TAG DIR)
......@@ -269,6 +270,10 @@ if(WITH_PSLIB)
endif()
endif(WITH_PSLIB)
if(NOT WIN32 AND NOT APPLE)
include(external/gloo)
list(APPEND third_party_deps extern_gloo)
endif()
if(WITH_BOX_PS)
include(external/box_ps)
......@@ -276,10 +281,6 @@ if(WITH_BOX_PS)
endif(WITH_BOX_PS)
if(WITH_DISTRIBUTE)
if(WITH_GLOO)
include(external/gloo)
list(APPEND third_party_deps extern_gloo)
endif()
if(WITH_GRPC)
list(APPEND third_party_deps extern_grpc)
......
......@@ -14,8 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/attribute.h"
#include <vector>
namespace paddle {
namespace framework {
......
......@@ -30,6 +30,8 @@ namespace paddle {
namespace framework {
class ProgramDesc;
class OpDesc;
class VarDesc;
// Each Protobuf Message, we provide a XXXBind class. In that class, we optimize
// read/write speed. Only when we want the protobuf message, the local changes
......
......@@ -12,17 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/c/c_api.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/init.h"
extern "C" {
......
......@@ -24,6 +24,15 @@ limitations under the License. */
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class OpInfoMap;
} // namespace framework
namespace platform {
class DeviceContextPool;
} // namespace platform
} // namespace paddle
#ifdef __cplusplus
extern "C" {
#endif
......
......@@ -277,7 +277,7 @@ class ChannelObject {
size_t finished = 0;
while (finished < n && WaitForWrite(lock)) {
size_t m =
std::min(n - finished, capacity_ + reading_count_ - data_.size());
(std::min)(n - finished, capacity_ + reading_count_ - data_.size());
for (size_t i = 0; i < m; i++) {
data_.push_back(std::move(p[finished++]));
}
......
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstring>
#include <random>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"
......
......@@ -21,6 +21,8 @@ limitations under the License. */
namespace paddle {
namespace framework {
class Tensor;
void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
Tensor* out);
......
......@@ -41,6 +41,15 @@ limitations under the License. */
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace framework {
class DataFeedDesc;
class LoDTensor;
class Scope;
class Variable;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
......@@ -418,6 +427,7 @@ class MultiSlotType {
std::string DebugString() {
std::stringstream ss;
ss << "\ntype: " << type_ << "\n";
ss << "offset: ";
ss << "[";
......
......@@ -17,10 +17,10 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/data_feed.h"
namespace paddle {
namespace framework {
class DataFeed;
typedef std::shared_ptr<DataFeed> (*Createdata_feedFunction)();
typedef std::unordered_map<std::string, Createdata_feedFunction> data_feedMap;
data_feedMap g_data_feed_map;
......
......@@ -16,10 +16,13 @@ limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/data_feed.h"
namespace paddle {
namespace framework {
class DataFeed;
class DataFeedFactory {
public:
static std::string DataFeedTypeList();
......
......@@ -13,8 +13,8 @@
// limitations under the License.
#include "paddle/fluid/framework/data_layout_transform.h"
#include <string>
#include <vector>
#include "paddle/fluid/operators/math/math_function.h"
#ifdef PADDLE_WITH_MKLDNN
......
......@@ -17,10 +17,18 @@
#include <map>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace framework {
class OpKernelType;
class Tensor;
} // namespace framework
} // namespace paddle
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
......
......@@ -15,7 +15,6 @@
#include "paddle/fluid/framework/data_layout_transform.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
TEST(DataTransform, DataLayoutFunction) {
auto place = paddle::platform::CPUPlace();
......
......@@ -18,8 +18,13 @@ limitations under the License. */
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/data_type_transform.h"
namespace paddle {
namespace framework {
class Variable;
} // namespace framework
} // namespace paddle
#ifdef PADDLE_WITH_MKLDNN
#include <algorithm>
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
......
......@@ -30,6 +30,10 @@ limitations under the License. */
namespace paddle {
namespace framework {
class OpKernelType;
class Tensor;
class Variable;
void TransformData(const OpKernelType &expected_kernel_type,
const OpKernelType &kernel_type_for_var,
const Tensor &input_tensor, Tensor *out);
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#include "paddle/fluid/framework/data_type.h"
#include <stdint.h>
#include <string>
#include <unordered_map>
......
......@@ -15,12 +15,19 @@ limitations under the License. */
#pragma once
#include <string>
#include <typeindex>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace platform {
struct bfloat16;
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
......
......@@ -14,9 +14,17 @@
#include "paddle/fluid/framework/data_type.h"
#include <string>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace platform {
struct bfloat16;
struct float16;
} // namespace platform
} // namespace paddle
TEST(DataType, float16) {
using paddle::framework::Tensor;
using paddle::platform::CPUPlace;
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <utility>
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
......@@ -23,6 +24,9 @@ limitations under the License. */
namespace paddle {
namespace framework {
class OpKernelType;
class Tensor;
using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
void TransDataType(const OpKernelType& kernel_type_for_var,
......
......@@ -13,12 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/dataset_factory.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/data_set.h"
namespace paddle {
namespace framework {
typedef std::unique_ptr<Dataset> (*CreateDatasetFunction)();
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/data_set.h"
namespace paddle {
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <stdexcept>
#include <string>
#include <vector>
#include "paddle/fluid/framework/dim.h"
namespace paddle {
......
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <sstream>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/ddim.h"
......
......@@ -76,7 +76,7 @@ void AllReduceOpHandle::AllReduceImpl(
platform::errors::InvalidArgument(
"The NoDummyInputSize should be equal "
"to the number of places, but got NoDummyInputSize is "
"%d and the number of place is %d.",
"%d and the number of places is %d.",
in_var_handles.size(), num_places));
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
......@@ -89,7 +89,7 @@ void AllReduceOpHandle::AllReduceImpl(
platform::errors::InvalidArgument(
"The number of local scopes should be equal "
"to the number of places, but got the number of local scopes is "
"%d and the number of place is %d.",
"%d and the number of places is %d.",
in_var_handles.size(), num_places));
std::vector<const void *> lod_tensor_data;
......
......@@ -20,6 +20,17 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class NCCLCommunicator;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/framework/details/nccl_op_handle.h"
#include "paddle/fluid/platform/nccl_helper.h"
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -31,10 +32,15 @@ void BroadcastOpHandle::RunImpl() {
auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one.");
PADDLE_ENFORCE_EQ(
out_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
platform::errors::PreconditionNotMet(
"The number of inputs should be 1, but got %d.",
in_var_handles.size()));
PADDLE_ENFORCE_EQ(out_var_handles.size(), places_.size(),
platform::errors::PreconditionNotMet(
"The number of outputs and the number of places should "
"be equal, but got the number of outputs is %d and the "
"number of places is %d.",
out_var_handles.size(), places_.size()));
VarHandle *in_var_handle = in_var_handles[0];
......@@ -47,7 +53,9 @@ void BroadcastOpHandle::BroadcastOneVar(
const std::vector<Scope *> &var_scopes) {
auto *in_var =
var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name());
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scopes.",
in_var_handle.name()));
Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
if (UNLIKELY(!in_tensor.IsInitialized())) {
VLOG(3) << "in var " << in_var_handle.name() << "not inited, return!";
......@@ -103,7 +111,7 @@ void BroadcastOpHandle::BroadcastOneVar(
broadcast_calls.emplace_back(
[send_recv_buffer, numel, type, root_id, &nccl_ctx] {
PADDLE_ENFORCE(platform::dynload::ncclBcast(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
send_recv_buffer, numel, static_cast<ncclDataType_t>(type),
root_id, nccl_ctx.comm_, nccl_ctx.stream()));
});
......@@ -131,7 +139,8 @@ void BroadcastOpHandle::BroadcastOneVar(
nccl_ctxs_->DevCtx(p)->Wait();
}
#else
PADDLE_THROW("CUDA is not enabled.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif
}
}
......@@ -154,10 +163,13 @@ void BroadcastOpHandle::InitOutputValue(
auto t_out_p = out_var_handle->place();
auto *out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name());
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound(
"Variable %s is not found in scopes.",
out_var_handle->name()));
if (is_gpu_place(in_tensor.place())) {
PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
"Places of input and output must be all on GPU.");
PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
} else {
t_out_p = platform::CPUPlace();
}
......
......@@ -24,6 +24,20 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct VarHandle;
} // namespace details
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
struct NCCLContextMap;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......
......@@ -21,13 +21,15 @@
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct DummyVarHandle;
struct VarHandle;
namespace f = paddle::framework;
namespace p = paddle::platform;
......@@ -77,7 +79,8 @@ struct TestBroadcastOpHandle {
}
nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
#else
PADDLE_THROW("CUDA is not support.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif
} else {
int count = 8;
......@@ -111,7 +114,8 @@ struct TestBroadcastOpHandle {
op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
place_list_, nccl_ctxs_.get());
#else
PADDLE_THROW("CUDA is not support.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif
} else {
#if defined(PADDLE_WITH_NCCL)
......@@ -169,7 +173,9 @@ struct TestBroadcastOpHandle {
float val_scalar = 0.0) {
auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto lod_tensor = var->GetMutable<f::LoDTensor>();
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) {
......@@ -192,7 +198,9 @@ struct TestBroadcastOpHandle {
}
auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto selected_rows = var->GetMutable<f::SelectedRows>();
auto value = selected_rows->mutable_value();
value->mutable_data<float>(kDims, place_list_[input_scope_idx]);
......@@ -209,13 +217,24 @@ struct TestBroadcastOpHandle {
const std::vector<float>& send_vector,
const std::vector<int64_t>& rows, int height) {
auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto& selected_rows = var->Get<f::SelectedRows>();
auto rt = selected_rows.value();
PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal.");
PADDLE_ENFORCE_EQ(selected_rows.height(), height,
platform::errors::InvalidArgument(
"The height of SelectedRows is not equal to "
"the expected, expect %d, but got %ld.",
height, selected_rows.height()));
for (size_t k = 0; k < selected_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]);
PADDLE_ENFORCE_EQ(
selected_rows.rows()[k], rows[k],
platform::errors::InvalidArgument(
"The item at position %zu of rows of SelectedRows "
"is not equal to the expected, expect %ld, but got %ld.",
k, rows[k], selected_rows.rows()[k]));
}
p::CPUPlace cpu_place;
......@@ -233,9 +252,15 @@ struct TestBroadcastOpHandle {
framework::Scope* scope) {
p::CPUPlace cpu_place;
auto var = scope->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto tensor = var->Get<f::LoDTensor>();
PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal.");
PADDLE_ENFORCE_EQ(tensor.lod(), lod,
platform::errors::InvalidArgument(
"The LoD of tensor is not equal to "
"the expected, expect %s, but got %s.",
lod, tensor.lod()));
f::Tensor result_tensor;
f::TensorCopySync(tensor, cpu_place, &result_tensor);
float* ct = result_tensor.mutable_data<float>(cpu_place);
......
......@@ -235,7 +235,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("reduce_mode_multi_devices_pass").get();
break;
default:
PADDLE_THROW("Unknown reduce strategy.");
PADDLE_THROW(
platform::errors::Unimplemented("Unknown reduce strategy."));
}
}
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
......
......@@ -27,6 +27,18 @@
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
class Graph;
class PassBuilder;
} // namespace ir
} // namespace framework
namespace platform {
class NCCLCommunicator;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......
......@@ -19,6 +19,8 @@
namespace paddle {
namespace framework {
namespace details {
struct VarHandleBase;
ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
platform::Place place,
size_t scope_idx)
......
......@@ -24,9 +24,21 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class OperatorBase;
class Scope;
namespace ir {
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
struct VarHandleBase;
class ComputationOpHandle : public OpHandleBase {
public:
ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
......
......@@ -12,11 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include <memory>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
......@@ -47,15 +48,19 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
platform::CUDADeviceGuard guard(
BOOST_GET_CONST(platform::CUDAPlace, place).device);
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
PADDLE_ENFORCE_NOT_NULL(event_);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument(
"The cuda envet created is NULL."));
}
}
#endif
PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument(
"Variable names are empty."));
PADDLE_ENFORCE_NE(vars.empty(), true,
platform::errors::InvalidArgument(
"The variables to be deleted are empty."));
for (auto *var : var_infos_) {
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument(
"The memory optimization info is NULL."));
}
}
......@@ -64,7 +69,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
if (event_) {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace());
platform::CUDADeviceGuard guard(gpu_place.device);
PADDLE_ENFORCE(cudaEventDestroy(event_));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
}
#endif
}
......@@ -78,12 +83,17 @@ void EagerDeletionOpHandle::InitCUDA() {
}
void EagerDeletionOpHandle::CallOnce() {
PADDLE_ENFORCE(vars_.empty(), "vars_ must be initialized here");
PADDLE_ENFORCE_EQ(
vars_.empty(), true,
platform::errors::InvalidArgument(
"The variables to be deleted should be initialized here."));
Scope *exec_scope = local_exec_scopes_[0];
for (auto *var_info : var_infos_) {
auto *var = exec_scope->FindVar(var_info->Name());
PADDLE_ENFORCE_NOT_NULL(var, "Variable %s should not be nullptr",
var_info->Name());
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound(
"The variable(%s) to be inplaced is not found in scope.",
var_info->Name()));
vars_.emplace_back(var);
}
}
......@@ -119,8 +129,9 @@ void EagerDeletionOpHandle::RunImpl() {
garbages.emplace_back(t.MoveMemoryHolder());
}
} else {
PADDLE_THROW("Type %s of %s is not supported eager deletion",
framework::ToTypeName(var->Type()), var_info->Name());
PADDLE_THROW(platform::errors::Unimplemented(
"The variable(%s) of type %s is not supported in eager deletion.",
framework::ToTypeName(var->Type()), var_info->Name()));
}
}
......@@ -137,8 +148,9 @@ void EagerDeletionOpHandle::ClearGarbages(
auto callback_stream =
reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
auto callback_func = [=]() {
PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(callback_stream, event_, 0));
};
gc_->Add(std::move(*garbages), callback_func);
} else {
......
......@@ -19,12 +19,23 @@
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
class Scope;
class GarbageCollector;
namespace ir {
class Node;
} // namespace ir
namespace ir {
class MemOptVarInfo;
......
......@@ -13,8 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/exception_holder.h"
#include <memory>
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator.h"
......
......@@ -15,9 +15,14 @@
#include "paddle/fluid/framework/details/fetch_async_op_handle.h"
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -22,6 +22,18 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class LoDTensor;
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -19,6 +19,8 @@
namespace paddle {
namespace framework {
namespace details {
struct VarHandleBase;
FetchBarrierOpHandle::FetchBarrierOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
......
......@@ -24,6 +24,15 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class Scope;
namespace ir {
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......@@ -32,6 +41,8 @@ namespace details {
// all places if there are multiple places, must init with
// multiple dev_ctxes_ !!!!
struct VarHandleBase;
struct FetchBarrierOpHandle : public OpHandleBase {
public:
FetchBarrierOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
......
......@@ -22,6 +22,17 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
#include <algorithm>
#include <utility>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
......@@ -56,10 +58,20 @@ void FusedAllReduceOpHandle::RunImpl() {
size_t place_num = places_.size();
PADDLE_ENFORCE_EQ(
in_var_handles.size(), place_num * num_of_all_reduce_,
"The NoDummyInputSize should be equal to the number of places.");
platform::errors::PreconditionNotMet(
"The number of input variable handles should be equal to the number "
"of places plus the number of all reduce handles, "
"but got the number of input variable handles is %d, the "
"number of places is %d, and the number of all reduce handles "
"is %d.",
in_var_handles.size(), place_num, num_of_all_reduce_));
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
platform::errors::PreconditionNotMet(
"The number of input variable handles should be equal to the number "
"of output variable handles, but got the number of input variable "
"handles is %d, and the number of output variable handles is %d.",
in_var_handles.size(), out_var_handles.size()));
// Note: some gradient op doesn't have CUDAKernel, so the gradients of
// those op are in CPUPlace, in this case, the all reduce should not be fused.
......@@ -106,7 +118,13 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
dtype = ele_dtype;
}
PADDLE_ENFORCE_EQ(ele_dtype, dtype);
PADDLE_ENFORCE_EQ(
ele_dtype, dtype,
platform::errors::InvalidArgument(
"The DataType of grad tensors of fused_all_reduce_op_handle "
"must be consistent. The current dtype is %s, but the "
"previous dtype is %s.",
DataTypeToString(ele_dtype), DataTypeToString(dtype)));
// Check whether the address space is contiguous.
std::sort(
......@@ -130,16 +148,29 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
"input[%d] address: 0X%02x. The offset: %d",
k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k,
next_address, k, infer_next_address, offset);
PADDLE_ENFORCE_EQ(infer_next_address, next_address,
"The address is not consistent.");
PADDLE_ENFORCE_EQ(
infer_next_address, next_address,
platform::errors::InvalidArgument(
"The infered address of the next tensor should be equal to the "
"real address of the next tensor. But got infered address is %p "
"and real address is %p.",
infer_next_address, next_address));
}
}
if (!FLAGS_skip_fused_all_reduce_check) {
for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
for (size_t j = 1; j < num_of_all_reduce_; ++j) {
PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first,
grads_tensor.at(scope_idx).at(j).first);
PADDLE_ENFORCE_EQ(
grads_tensor.at(0).at(j).first,
grads_tensor.at(scope_idx).at(j).first,
platform::errors::InvalidArgument(
"The variable name of grad tensors of "
"fused_all_reduce_op_handle "
"must be consistent. The current name is %s, but the "
"previous name is %s.",
grads_tensor.at(0).at(j).first,
grads_tensor.at(scope_idx).at(j).first));
}
}
}
......@@ -167,7 +198,9 @@ bool FusedAllReduceOpHandle::InputIsInDifferentPlace(
for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
auto var_name = in_var_handles[j]->name();
auto var = local_scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound(
"The variable '%s' is not found in local scope.", var_name));
auto &lod_tensor = var->Get<LoDTensor>();
if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
return true;
......@@ -185,14 +218,24 @@ void FusedAllReduceOpHandle::GetGradLoDTensor(
size_t place_num = places_.size();
for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
auto var_name = in_var_handles[j]->name();
PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name());
PADDLE_ENFORCE_EQ(
var_name, out_var_handles[j]->name(),
platform::errors::InvalidArgument(
"The name of input variable should be equal "
"to the name of output variable. But got the name of input "
"variable is %s and the name of output variable is %s.",
var_name, out_var_handles[j]->name()));
auto var = local_scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound(
"The variable '%s' is not found in local scope.", var_name));
auto &lod_tensor = var->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(
platform::is_same_place(lod_tensor.place(), places_.at(scope_idx)),
true, "%s(%d) is not in the right place.", var_name, scope_idx);
true, platform::errors::InvalidArgument(
"The variable '%s' at scope %d is not in the right place.",
var_name, scope_idx));
grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
}
}
......@@ -204,16 +247,26 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
size_t size_of_dtype = 0;
for (size_t i = 0; i < grad_tensor.size(); ++i) {
// Get dtype
auto ele_type = grad_tensor.at(i).second->type();
auto ele_dtype = grad_tensor.at(i).second->type();
if (i == 0) {
*dtype = ele_type;
size_of_dtype = framework::SizeOfType(ele_type);
*dtype = ele_dtype;
size_of_dtype = framework::SizeOfType(ele_dtype);
}
PADDLE_ENFORCE_EQ(ele_type, *dtype);
PADDLE_ENFORCE_EQ(
ele_dtype, *dtype,
platform::errors::InvalidArgument(
"The DataType of grad tensors of fused_all_reduce_op_handle "
"must be consistent. The current dtype is %s, but the "
"previous dtype is %s.",
DataTypeToString(ele_dtype), DataTypeToString(*dtype)));
// Get element number
int64_t len = grad_tensor.at(i).second->numel();
PADDLE_ENFORCE_GT(len, 0);
PADDLE_ENFORCE_GT(
len, 0, platform::errors::InvalidArgument(
"The size of grad tensors of fused_all_reduce_op_handle "
"must be > 0, but got %d.",
len));
*numel +=
platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
}
......
......@@ -17,10 +17,22 @@
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class NCCLCommunicator;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/framework/details/nccl_op_handle.h"
#include "paddle/fluid/platform/nccl_helper.h"
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -32,7 +33,15 @@ void FusedBroadcastOpHandle::RunImpl() {
WaitInputVarGenerated();
size_t place_num = places_.size();
PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size());
PADDLE_ENFORCE_EQ(
in_var_handles.size() * place_num, out_var_handles.size(),
platform::errors::PreconditionNotMet(
"The number of input variable handles plus the number "
"of places should be equal to the number of output variable handles, "
"but got the number of input variable handles is %d, the "
"number of places is %d, and the number of output variable handles "
"is %d.",
in_var_handles.size(), place_num, out_var_handles.size()));
for (size_t i = 0; i < in_var_handles.size(); ++i) {
BroadcastOneVar(
......
......@@ -25,6 +25,17 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
struct NCCLContextMap;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......
......@@ -13,15 +13,26 @@
// limitations under the License.
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include <memory>
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
struct VarHandle;
struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
std::vector<std::string> out_varnames_;
std::vector<std::unique_ptr<ir::Node>> nodes_;
......@@ -49,7 +60,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
op_handle_ = new FusedBroadcastOpHandle(
nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
#else
PADDLE_THROW("CUDA is not supported.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
} else {
#if defined(PADDLE_WITH_NCCL)
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/gather_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
......@@ -32,13 +33,20 @@ void GatherOpHandle::RunImpl() {
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
platform::errors::InvalidArgument(
"The number of input variables should be equal "
"to the number of places, but got the number of input variables is "
"%d and the number of places is %d.",
in_var_handles.size(), places_.size()));
VarHandle *out_var_handle;
{
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
"The number of output should be one.");
PADDLE_ENFORCE_EQ(
out_var_handles.size(), 1,
platform::errors::InvalidArgument(
"The number of output variables should be 1, but got %d.",
out_var_handles.size()));
out_var_handle = out_var_handles.front();
}
......@@ -47,10 +55,14 @@ void GatherOpHandle::RunImpl() {
auto in_0_handle = in_var_handles[0];
auto pre_in_var =
var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
PADDLE_ENFORCE_NOT_NULL(pre_in_var);
PADDLE_ENFORCE_NOT_NULL(
pre_in_var,
platform::errors::NotFound("The variable '%s' is not found in the scope.",
in_0_handle->name()));
PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
"Currently, gather_op only can gather SelectedRows.");
PADDLE_ENFORCE_EQ(pre_in_var->IsType<framework::SelectedRows>(), true,
platform::errors::Unimplemented(
"Currently, gather_op only supports SelectedRows."));
// Wait input done, this Wait is asynchronous operation
WaitInputVarGenerated();
......@@ -63,7 +75,10 @@ void GatherOpHandle::RunImpl() {
for (auto *in_handle : in_var_handles) {
auto *in_var =
var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var,
platform::errors::NotFound(
"The variable '%s' is not found in the scope.", in_handle->name()));
VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var);
auto &in_sr_value = in_var->Get<framework::SelectedRows>();
......@@ -76,15 +91,19 @@ void GatherOpHandle::RunImpl() {
// NOTE: The Places of all input tensor must be all on CPU or all on GPU.
platform::Place t_out_p = out_var_handle->place();
if (platform::is_gpu_place(pre_in_value.place())) {
PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
"Places of input and output must be all on GPU.");
PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
} else {
t_out_p = platform::CPUPlace();
}
auto out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name());
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(
out_var,
platform::errors::NotFound("The variable '%s' is not found in the scope.",
out_var_handle->name()));
auto out_value = out_var->GetMutable<framework::SelectedRows>();
out_value->set_height(pre_in_value.height());
out_value->set_rows(out_rows);
......
......@@ -24,6 +24,14 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -13,15 +13,17 @@
// limitations under the License.
#include "paddle/fluid/framework/details/gather_op_handle.h"
#include <memory>
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
#include "gtest/gtest.h"
namespace paddle {
namespace framework {
namespace details {
struct DummyVarHandle;
namespace f = paddle::framework;
namespace p = paddle::platform;
......@@ -60,7 +62,8 @@ struct TestGatherOpHandle {
ctxs_.emplace_back(new p::CUDADeviceContext(p));
}
#else
PADDLE_THROW("CUDA is not support.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
} else {
int count = 8;
......@@ -141,7 +144,9 @@ struct TestGatherOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"The variable '%s' is not found in the scope.", "input"));
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
......@@ -155,7 +160,9 @@ struct TestGatherOpHandle {
}
auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(
out_var, platform::errors::NotFound(
"The variable '%s' is not found in the scope.", "out"));
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input");
......@@ -173,9 +180,19 @@ struct TestGatherOpHandle {
auto& out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
platform::errors::InvalidArgument(
"The height of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
height, out_select_rows.height()));
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
PADDLE_ENFORCE_EQ(
out_select_rows.rows()[k], rows[k % rows.size()],
platform::errors::InvalidArgument(
"The item at position %d of rows of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
k, rows[k % rows.size()], out_select_rows.rows()[k]));
}
f::Tensor result_tensor;
......@@ -207,6 +224,7 @@ TEST(GatherTester, TestGPUGatherTestSelectedRows) {
test_op.TestGatherSelectedRows(input_scope_idx);
}
#endif
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -20,16 +20,21 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
class OpDesc;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
......
......@@ -19,6 +19,12 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
class Tensor;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -46,14 +46,17 @@ class NCCLOpHandleBase : public OpHandleBase {
}
virtual ~NCCLOpHandleBase() {
for (auto& ev : inter_events_) {
PADDLE_ENFORCE(cudaEventDestroy(ev.second));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
}
for (auto& ev : exter_events_) {
PADDLE_ENFORCE(cudaEventDestroy(ev.second));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
}
}
void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
PADDLE_ENFORCE(run_order >= 0, "run_order must >= 0");
PADDLE_ENFORCE_GE(
run_order, 0,
platform::errors::InvalidArgument(
"The argument run_order must be >= 0, but got %d.", run_order));
run_order_ = run_order;
use_hierarchical_allreduce_ = use_hierarchical_allreduce;
......@@ -74,8 +77,11 @@ class NCCLOpHandleBase : public OpHandleBase {
return;
}
PADDLE_ENFORCE(places_.size() == 1,
"HierarchicalAllReduce run one proc with one card mode.");
PADDLE_ENFORCE_EQ(places_.size(), 1,
platform::errors::InvalidArgument(
"HierarchicalAllReduce can only run "
"one proccess with one card mode, but got %d cards.",
places_.size()));
for (auto& p : places_) {
auto ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order);
......@@ -88,11 +94,11 @@ class NCCLOpHandleBase : public OpHandleBase {
continue;
}
PADDLE_ENFORCE(cudaSetDevice(dev_id));
PADDLE_ENFORCE(cudaEventCreateWithFlags(&inter_events_[dev_id],
cudaEventDisableTiming));
PADDLE_ENFORCE(cudaEventCreateWithFlags(&exter_events_[dev_id],
cudaEventDisableTiming));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
&inter_events_[dev_id], cudaEventDisableTiming));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
&exter_events_[dev_id], cudaEventDisableTiming));
VLOG(10) << "Create events on dev_id:" << dev_id
<< ", inter_event:" << &inter_events_[dev_id]
<< ", exter_event:" << &exter_events_[dev_id];
......@@ -102,7 +108,10 @@ class NCCLOpHandleBase : public OpHandleBase {
void FlatNCCLAllReduce(platform::Place place, const void* sendbuff,
void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op) {
PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
PADDLE_ENFORCE_GE(
run_order_, 0,
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_);
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto& nccl_ctx = flat_nccl_ctxs->at(dev_id);
......@@ -113,14 +122,17 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", dev_id:" << dev_id << ", dtype:" << datatype
<< ", place:" << place;
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, count, datatype, op, comm, stream));
}
void NCCLAllReduce(platform::Place place, const void* sendbuff,
void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op) {
PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
PADDLE_ENFORCE_GE(
run_order_, 0,
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
if (!use_hierarchical_allreduce_) {
FlatNCCLAllReduce(place, sendbuff, recvbuff, count, datatype, op);
return;
......@@ -132,7 +144,10 @@ class NCCLOpHandleBase : public OpHandleBase {
void HierarchicalAllReduce(platform::Place place, const void* sendbuff,
void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op) {
PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
PADDLE_ENFORCE_GE(
run_order_, 0,
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
InterReduce(place, sendbuff, recvbuff, count, datatype, op);
// When a trainer is not in exter allreduce ring
// they need not to call this.
......@@ -157,14 +172,13 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", dtype:" << datatype << ", place:" << place
<< ", stream:" << stream;
PADDLE_ENFORCE(platform::dynload::ncclReduce(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream));
cudaEventRecord(inter_events_.at(dev_id), stream);
if (FLAGS_sync_nccl_allreduce) {
PADDLE_ENFORCE(cudaStreamSynchronize(stream),
"sync HierarchicalAllReduce inter stream error");
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
}
}
......@@ -172,7 +186,9 @@ class NCCLOpHandleBase : public OpHandleBase {
void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op) {
auto nccl_ctxs = nccl_ctxs_->GetHierarchicalExterCtx(run_order_);
PADDLE_ENFORCE(nccl_ctxs_, "can't get exter %d nccl_ctxs", run_order_);
PADDLE_ENFORCE_NOT_NULL(
nccl_ctxs_, platform::errors::NotFound(
"Can't get exter %d nccl contexts.", run_order_));
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto& nccl_ctx = nccl_ctxs->at(dev_id);
auto stream = nccl_ctx.stream();
......@@ -185,14 +201,13 @@ class NCCLOpHandleBase : public OpHandleBase {
cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, count, datatype, op, comm, stream));
cudaEventRecord(exter_events_.at(dev_id), stream);
if (FLAGS_sync_nccl_allreduce) {
PADDLE_ENFORCE(cudaStreamSynchronize(stream),
"sync HierarchicalAllReduce exter stream error");
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
}
}
......@@ -210,8 +225,8 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", stream:" << stream;
cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
PADDLE_ENFORCE(platform::dynload::ncclBcast(sendbuff, count, datatype, 0,
comm, stream));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
sendbuff, count, datatype, 0, comm, stream));
}
protected:
......
......@@ -47,8 +47,8 @@ void OpHandleBase::InitCUDA() {
#ifdef PADDLE_WITH_CUDA
for (auto &p : dev_ctxes_) {
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
PADDLE_ENFORCE(cudaSetDevice(dev_id));
PADDLE_ENFORCE(
PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
}
if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
......@@ -62,17 +62,22 @@ void OpHandleBase::InitCUDA() {
}
}
} else {
PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL,
"%s should have only one dev_ctx.", Name());
PADDLE_ENFORCE_EQ(
dev_ctxes_.size(), 1UL,
platform::errors::InvalidArgument(
"Operator %s should have only one dev_ctx, but got %d.", Name(),
dev_ctxes_.size()));
auto &place = dev_ctxes_.begin()->first;
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
if (out_var_handle) {
PADDLE_ENFORCE(platform::is_same_place(place, out_var_handle->place()),
PADDLE_ENFORCE_EQ(
platform::is_same_place(place, out_var_handle->place()), true,
platform::errors::InvalidArgument(
"The place of output(%s) is not consistent with the "
"place of current op(%s).",
out_var_handle->Name(), Name());
out_var_handle->Name(), Name()));
out_var_handle->SetGenerateEvent(events_.at(dev_id));
}
}
......@@ -86,7 +91,10 @@ void OpHandleBase::Run(bool use_cuda) {
InitCUDA();
}
#else
PADDLE_ENFORCE(!use_cuda);
PADDLE_ENFORCE_EQ(use_cuda, false,
platform::errors::InvalidArgument(
"Argument use_cuda should be false when Paddle is not "
"compiled with CUDA."));
#endif
// skip running current op, used with inplace_addto_op_pass
......@@ -100,17 +108,20 @@ void OpHandleBase::Run(bool use_cuda) {
void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_NOT_NULL(waited_ctx);
PADDLE_ENFORCE_NOT_NULL(waited_ctx, platform::errors::InvalidArgument(
"Argument waited_ctx is NULL."));
if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
for (auto &dev_ctx : dev_ctxes_) {
PADDLE_ENFORCE_NOT_NULL(dev_ctx.second);
PADDLE_ENFORCE_NOT_NULL(
dev_ctx.second,
platform::errors::InvalidArgument("The device context is NULL."));
dev_ctx.second->Wait();
}
} else {
auto stream =
static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
for (auto &ev : events_) {
PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0));
}
}
#else
......@@ -145,10 +156,11 @@ void OpHandleBase::WaitInputVarGenerated() {
auto stream =
static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
->stream();
PADDLE_ENFORCE(
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else
PADDLE_THROW("Doesn't compile the GPU.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
}
// There are nothing to do when the place is CPUPlace.
......@@ -169,10 +181,11 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
auto stream = static_cast<platform::CUDADeviceContext *>(
dev_ctxes_.at(in_var_handle->place()))
->stream();
PADDLE_ENFORCE(
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else
PADDLE_THROW("Doesn't compile the GPU.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
}
// There are nothing to do when the place is CPUPlace.
......@@ -242,7 +255,9 @@ void OpHandleBase::SetLocalExecScopes(
auto scopes = GetLocalScopes();
for (auto *scope : scopes) {
auto iter = scope_map.find(scope);
PADDLE_ENFORCE(iter != scope_map.end(), "Local scope not found");
PADDLE_ENFORCE_NE(
iter, scope_map.end(),
platform::errors::NotFound("Local scope not found in scope map."));
local_exec_scopes_.emplace_back(iter->second);
}
}
......
......@@ -24,10 +24,22 @@
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
class Scope;
namespace details {
struct VarHandleBase;
} // namespace details
namespace ir {
class Node;
} // namespace ir
namespace details {
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/inplace_op_inference.h"
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
......@@ -186,19 +187,20 @@ struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
void operator()(const char* op_type, OpInfo* info) const {
PADDLE_ENFORCE_EQ(info->proto_, nullptr,
platform::errors::AlreadyExists(
"OpProto of %s has been registered", op_type));
"OpProto of %s has been registered.", op_type));
PADDLE_ENFORCE_EQ(info->checker_, nullptr,
platform::errors::AlreadyExists(
"OpAttrChecker of %s has been registered", op_type));
"OpAttrChecker of %s has been registered.", op_type));
info->proto_ = new proto::OpProto;
info->checker_ = new OpAttrChecker();
T maker;
maker(info->proto_, info->checker_);
info->proto_->set_type(op_type);
PADDLE_ENFORCE(
info->proto_->IsInitialized(),
"Fail to initialize %s's OpProto, because %s is not initialized",
op_type, info->proto_->InitializationErrorString());
PADDLE_ENFORCE_EQ(
info->proto_->IsInitialized(), true,
platform::errors::PreconditionNotMet(
"Fail to initialize %s's OpProto, because %s is not initialized.",
op_type, info->proto_->InitializationErrorString()));
}
};
......
......@@ -16,6 +16,7 @@
#include <algorithm>
#include <map>
#include <vector>
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"
......@@ -32,9 +33,13 @@ struct ReduceLoDTensor {
template <typename T>
void apply() const {
PADDLE_ENFORCE(!src_tensors_.empty());
PADDLE_ENFORCE_NE(src_tensors_.empty(), true,
platform::errors::InvalidArgument(
"The number of tensors to be reduced is 0."));
auto &t0 = *src_tensors_[0];
PADDLE_ENFORCE_NE(t0.numel(), 0);
PADDLE_ENFORCE_NE(t0.numel(), 0,
platform::errors::InvalidArgument(
"The size of first tensor to be reduced is 0."));
dst_tensor_.Resize(t0.dims());
T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
......@@ -45,8 +50,19 @@ struct ReduceLoDTensor {
continue;
}
PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
PADDLE_ENFORCE_EQ(t.type(), t0.type());
PADDLE_ENFORCE_EQ(t.dims(), t0.dims(),
platform::errors::InvalidArgument(
"The shape of tensors to be reduced must be "
"consistent. The shape of current tensor is %s, "
"but the shape of the first tensor is %s.",
t.dims(), t0.dims()));
PADDLE_ENFORCE_EQ(t.type(), t0.type(),
platform::errors::InvalidArgument(
"The type of tensors to be reduced must be "
"consistent. The type of current tensor is %s, "
"but the type of the first tensor is %s.",
t.type(), t0.type()));
std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
[](T a, T b) -> T { return a + b; });
}
......@@ -88,7 +104,9 @@ struct GatherLocalSelectedRowsFunctor {
in_places_(in_places),
out_place_(out_place),
dst_selected_rows_(dst_selected_rows) {
PADDLE_ENFORCE_EQ(src_selected_rows.empty(), false);
PADDLE_ENFORCE_NE(src_selected_rows.empty(), true,
platform::errors::InvalidArgument(
"The number of selected_rows to be gathered is 0."));
std::vector<int64_t> out_rows;
......
......@@ -13,7 +13,9 @@
// limitations under the License.
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include <memory>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
......@@ -116,8 +118,15 @@ void ReduceOpHandle::GatherSelectedRows(
merged_dev_ctx->Wait();
scope->EraseVars(std::vector<std::string>{gathered_var_name});
PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope));
PADDLE_ENFORCE(remote.size() == vars.size());
PADDLE_ENFORCE_EQ(
client->Gather(vars, &remote, *merged_dev_ctx, scope), true,
platform::errors::PreconditionNotMet("Gather SelectedRows failed."));
PADDLE_ENFORCE_EQ(remote.size(), vars.size(),
platform::errors::PreconditionNotMet(
"The number of remotes should be equal to the number "
"of variables to be gathered, but got the number of "
"remotes is %d and the number of variables is %d.",
remote.size(), vars.size()));
// 4. merged local selected rows.
std::vector<const SelectedRows *> all;
......@@ -151,14 +160,19 @@ void ReduceOpHandle::RunImpl() {
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
platform::errors::InvalidArgument(
"The number of inputs should equal to the number of places, but got "
"the number of inputs is %d and the number of places is %d.",
in_var_handles.size(), places_.size()));
VarHandle *out_var_handle;
{
auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL,
"The number of output should be one.");
platform::errors::InvalidArgument(
"The number of output should be one, but got %d.",
out_var_handles.size()));
out_var_handle = out_var_handles.front();
}
......@@ -168,7 +182,10 @@ void ReduceOpHandle::RunImpl() {
auto pre_in_var =
var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
PADDLE_ENFORCE_NOT_NULL(pre_in_var);
PADDLE_ENFORCE_NOT_NULL(pre_in_var, platform::errors::NotFound(
"Variable %s is not found in scope.",
in_0_handle->name()));
// NOTE: The Places of all input tensor must be all on CPU or all on GPU.
std::vector<platform::Place> in_places; // used to get dev_ctx
......@@ -176,21 +193,29 @@ void ReduceOpHandle::RunImpl() {
in_places.emplace_back(in_handle->place());
auto in_var =
var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scope.",
in_handle->name()));
VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var);
}
auto out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name());
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(
out_var, platform::errors::NotFound("Variable %s is not found in scope.",
out_var_handle->name()));
// NOTE: The tensors' Place of input and output must be all on GPU or all on
// CPU.
auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place();
platform::Place t_out_p;
if (platform::is_gpu_place(in_p)) {
PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place()),
"Places of input and output must be all on GPU.");
PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_var_handle->place()), true,
platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
t_out_p = out_var_handle->place();
} else {
t_out_p = platform::CPUPlace();
......@@ -229,7 +254,10 @@ void ReduceOpHandle::RunImpl() {
in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
out_var->GetMutable<framework::SelectedRows>());
} else {
PADDLE_THROW("only support double or float when gather SelectedRows");
PADDLE_THROW(platform::errors::Unimplemented(
"Only support double or float when gather SelectedRows, but got "
"%s.",
framework::DataTypeToString(in_selected_rows[0]->value().type())));
}
#endif
});
......@@ -292,7 +320,7 @@ void ReduceOpHandle::RunImpl() {
size_t numel = static_cast<size_t>(lod_tensor.numel());
all_reduce_calls.emplace_back(
[buffer, recvbuffer, type, numel, root_id, &nccl_ctx] {
PADDLE_ENFORCE(platform::dynload::ncclReduce(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
buffer, recvbuffer, numel, static_cast<ncclDataType_t>(type),
ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream()));
});
......@@ -306,10 +334,13 @@ void ReduceOpHandle::RunImpl() {
}
});
#else
PADDLE_THROW("CUDA is not enabled.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
} else {
PADDLE_THROW("Place should be CPUPlace or CUDAPlace.");
PADDLE_THROW(platform::errors::InvalidArgument(
"The place of tensor should be CPUPlace or CUDAPlace, but got %s.",
lod_tensors[0]->place()));
}
}
}
......
......@@ -24,6 +24,21 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class SelectedRows;
namespace details {
struct VarHandle;
} // namespace details
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
struct NCCLContextMap;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......
......@@ -13,7 +13,9 @@
// limitations under the License.
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
......@@ -69,7 +71,8 @@ struct TestReduceOpHandle {
}
nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
#else
PADDLE_THROW("CUDA is not support.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif
} else {
int count = 8;
......@@ -103,7 +106,8 @@ struct TestReduceOpHandle {
op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
gpu_list_, nccl_ctxs_.get()));
#else
PADDLE_THROW("CUDA is not support.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif
} else {
#if defined(PADDLE_WITH_NCCL)
......@@ -164,7 +168,10 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"Variable %s is not found in scope.", "input"));
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
......@@ -178,7 +185,9 @@ struct TestReduceOpHandle {
}
auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(out_var,
platform::errors::NotFound(
"Variable %s is not found in scope.", "out"));
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
......@@ -196,9 +205,18 @@ struct TestReduceOpHandle {
auto &out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
platform::errors::InvalidArgument(
"The height of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
height, out_select_rows.height()));
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
PADDLE_ENFORCE_EQ(
out_select_rows.rows()[k], rows[k % rows.size()],
platform::errors::InvalidArgument(
"The item at position %d of rows of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
k, rows[k % rows.size()], out_select_rows.rows()[k]));
}
f::Tensor result_tensor;
......@@ -208,7 +226,7 @@ struct TestReduceOpHandle {
for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
}
}
} // namespace details
void TestReduceLodTensors(size_t output_scope_idx) {
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
......@@ -220,7 +238,9 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"Variable %s is not found in scope.", "input"));
auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
in_lod_tensor->set_lod(lod);
......@@ -230,7 +250,9 @@ struct TestReduceOpHandle {
}
auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(out_var,
platform::errors::NotFound(
"Variable %s is not found in scope.", "out"));
auto out_lodtensor = out_var->GetMutable<f::LoDTensor>();
auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
......@@ -254,7 +276,7 @@ struct TestReduceOpHandle {
ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5);
}
}
};
}; // namespace details
TEST(ReduceTester, TestCPUReduceTestSelectedRows) {
TestReduceOpHandle test_op;
......
......@@ -24,6 +24,16 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
class OpDesc;
class Scope;
namespace ir {
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -13,9 +13,17 @@
// limitations under the License.
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include <string>
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
class Tensor;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -21,6 +21,18 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
class Scope;
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -17,7 +17,9 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace details {
......
......@@ -22,6 +22,17 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
class Scope;
class Tensor;
class Variable;
namespace ir {
class MemOptVarInfo;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......@@ -100,12 +111,11 @@ void ShareTensorBufferFunctor::CallOnce() {
auto *out_var = exec_scope_->FindVar(out_var_names_[i]);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"The input variable(%s)to be inplaced should not be NULL.",
"The variable(%s) to be inplaced is not found in scope.",
in_var_infos_[i]->Name()));
PADDLE_ENFORCE_NOT_NULL(
out_var,
platform::errors::NotFound(
"The output variable(%s) to be inplaced should not be NULL.",
out_var, platform::errors::NotFound(
"The variable(%s) to be inplaced is not found in scope.",
out_var_names_[i]));
PADDLE_ENFORCE_NE(
in_var, out_var,
......
......@@ -25,6 +25,15 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace framework {
class Scope;
namespace ir {
class MemOptVarInfo;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -23,10 +23,20 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
class MemOptVarInfo;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
class ComputationOpHandle;
ComputationOpHandle *GetUniquePendingComputationOpHandle(
ShareTensorBufferOpHandle *share_tensor_op) {
ComputationOpHandle *result_op = nullptr;
......
......@@ -22,10 +22,22 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
namespace paddle {
namespace framework {
class Scope;
namespace ir {
class MemOptVarInfo;
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
class ComputationOpHandle;
class ShareTensorBufferOpHandle : public OpHandleBase {
public:
ShareTensorBufferOpHandle(
......
......@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
#include <algorithm>
#include <utility>
#include "dgc/dgc.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
......@@ -38,18 +40,23 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle(
is_encoded_(is_encoded),
nranks_(nranks) {
// TODO(gongwb) :polish them!
PADDLE_ENFORCE_EQ(is_encoded, true);
PADDLE_ENFORCE_EQ(is_encoded, true, platform::errors::InvalidArgument(
"The argument is_encoded is false."));
VLOG(1) << "Use dgc allreduce mode"
<< ", nranks:" << nranks_;
PADDLE_ENFORCE_GT(local_scopes_.size(), 0);
PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto nranks_name = g_dgc_nranks;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *local_scope = local_scopes_[i];
auto nranks_var = local_scope->FindVar(nranks_name);
if (nranks_var == nullptr) {
PADDLE_THROW("not find nranks_var:%s", nranks_name);
}
PADDLE_ENFORCE_NOT_NULL(
nranks_var, platform::errors::NotFound(
"Variable %s is not found in scope.", nranks_name));
float *dgc_nranks = nranks_var->GetMutable<LoDTensor>()->data<float>();
*dgc_nranks = nranks;
......@@ -64,10 +71,18 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The NoDummyInputSize should be equal to the number of places.");
platform::errors::PreconditionNotMet(
"The number of input variables should be equal to the number of "
"places, but got the number of input variables is %zu and the the "
"number of places is %zu.",
in_var_handles.size(), places_.size()));
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
platform::errors::PreconditionNotMet(
"The number of input variables should be equal to the number of "
"output variables, but got the number of input variables is %zu and "
"the the number of output variables is %zu.",
in_var_handles.size(), out_var_handles.size()));
std::vector<const LoDTensor *> ins;
std::vector<LoDTensor *> gathers;
......@@ -80,14 +95,17 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
auto encode_var_name = original_name + g_dgc_encoded;
auto *in_var = local_scope->FindVar(encode_var_name);
PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scope.",
encode_var_name));
auto &in = in_var->Get<LoDTensor>();
ins.emplace_back(&in);
auto gather_var_name = original_name + g_dgc_gather;
auto *gather_var = local_scope->FindVar(gather_var_name);
PADDLE_ENFORCE_NOT_NULL(gather_var, "%s should not be null",
gather_var_name);
PADDLE_ENFORCE_NOT_NULL(
gather_var, platform::errors::NotFound(
"Variable %s is not found in scope.", gather_var));
auto *gather = gather_var->GetMutable<LoDTensor>();
gathers.emplace_back(gather);
......@@ -100,14 +118,26 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
}
}
PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place()));
PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place()));
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ins[0]->place()), true,
platform::errors::InvalidArgument(
"The place of input variable should be CUDAPlace, but got %s.",
ins[0]->place()));
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(outs[0]->place()), true,
platform::errors::InvalidArgument(
"The place of input variable should be CUDAPlace, but got %s.",
outs[0]->place()));
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, platform::errors::PreconditionNotMet(
"The nccl contexts are NULL."));
int dtype = -1;
size_t in_numel = 0;
size_t out_numel = 0;
PADDLE_ENFORCE(nranks_ > 1);
PADDLE_ENFORCE_GT(
nranks_, 1,
platform::errors::PreconditionNotMet(
"The number of ranks should be > 1, but got %d.", nranks_));
std::vector<std::function<void()>> all_gather_calls;
std::vector<std::function<void()>> sparse_reduce_calls;
......@@ -123,8 +153,16 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype;
in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel;
PADDLE_ENFORCE(in_numel % 2 == 0);
PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k));
PADDLE_ENFORCE_EQ(in_numel % 2, 0,
platform::errors::InvalidArgument(
"The number of elements of input variable should be "
"even, but got %zu.",
in_numel));
PADDLE_ENFORCE_EQ(in_numel / 2, static_cast<size_t>(k),
platform::errors::InvalidArgument(
"The number of elements of input variable should be "
"even, but got %zu.",
in_numel));
out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
......@@ -154,7 +192,8 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
PADDLE_ENFORCE_EQ(paddle::communication::dgc::sparseReduce(
gather_buff, k, out_tensor_buf,
static_cast<int>(out_numel), nranks_, stream),
true);
true, platform::errors::Unavailable(
"Calling sparseReduce() failed."));
});
}
......@@ -187,11 +226,16 @@ void SparseAllReduceOpHandle::SparseAllReduceFunc(
int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) {
auto original_name = paddle::framework::GradOriginalVarName(grad_name);
auto var_name = original_name + g_dgc_k;
PADDLE_ENFORCE(local_scopes_.size() > 0);
PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto *scope = local_exec_scopes_[0];
auto var = scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
var_name));
auto tensor = var->Get<LoDTensor>().data<float>();
return *tensor;
}
......@@ -202,15 +246,22 @@ bool SparseAllReduceOpHandle::IsEncoded() {
}
auto counter_name = g_dgc_counter_name;
auto step_name = g_dgc_rampup_begin_step;
PADDLE_ENFORCE(local_scopes_.size() > 0);
PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto *local_scope = local_exec_scopes_[0];
auto count_var = local_scope->FindVar(counter_name);
auto step_var = local_scope->FindVar(step_name);
if (count_var == nullptr || step_var == nullptr) {
PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name,
step_var);
}
PADDLE_ENFORCE_NOT_NULL(
count_var, platform::errors::NotFound(
"Variable %s is not found in scope.", counter_name));
PADDLE_ENFORCE_NOT_NULL(
step_var, platform::errors::NotFound("Variable %s is not found in scope.",
step_var));
float count = *count_var->Get<LoDTensor>().data<float>();
float step = *step_var->Get<LoDTensor>().data<float>();
......
......@@ -23,6 +23,17 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/nccl_helper.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class NCCLCommunicator;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -24,6 +24,14 @@
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -15,6 +15,14 @@
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/framework/selected_rows.h"
namespace paddle {
namespace framework {
class LoDTensor;
class Variable;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -17,6 +17,13 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace framework {
class Tensor;
class Variable;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
......@@ -13,11 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/device_worker.h"
#include "xxhash.h" // NOLINT
namespace paddle {
namespace framework {
class LoDTensor;
class Scope;
void DeviceWorker::SetRootScope(Scope* root_scope) { root_scope_ = root_scope; }
void DeviceWorker::SetDataFeed(DataFeed* data_feed) {
......
......@@ -39,6 +39,18 @@ limitations under the License. */
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/timer.h"
namespace paddle {
namespace framework {
class LoDTensor;
class ProgramDesc;
class Scope;
class Tensor;
} // namespace framework
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......@@ -62,7 +74,9 @@ class PullDenseWorker {
virtual void Initialize(const TrainerDesc& param);
#ifdef PADDLE_WITH_CUDA
void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
#endif
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
void AddPlace(const paddle::platform::Place place) {
places_.push_back(place);
}
......@@ -123,9 +137,9 @@ class PullDenseWorker {
#ifdef PADDLE_WITH_CUDA
std::vector<cudaStream_t> copy_streams_;
#endif
std::vector<paddle::platform::Place> places_;
std::vector<Scope*> thread_scopes_;
#endif
};
// should incorporate different type of device
......@@ -149,6 +163,7 @@ class DeviceWorker {
virtual void SetDataFeed(DataFeed* data_feed);
virtual void SetWorkerNum(int num) {}
virtual void CacheProgram(const ProgramDesc& main_program) {}
virtual void GetXpuOpIndex() {}
virtual void SetNeedDumpField(bool need_dump_field) {
need_dump_field_ = need_dump_field;
}
......
......@@ -20,6 +20,8 @@ limitations under the License. */
namespace paddle {
namespace framework {
class DeviceWorker;
typedef std::shared_ptr<DeviceWorker> (*Createdevice_workerFunction)();
typedef std::unordered_map<std::string, Createdevice_workerFunction>
device_workerMap;
......
......@@ -16,11 +16,14 @@ limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/device_worker.h"
namespace paddle {
namespace framework {
class DeviceWorker;
class DeviceWorkerFactory {
public:
static std::string DeviceWorkerTypeList();
......
......@@ -13,9 +13,10 @@
// limitations under the License.
#include "paddle/fluid/framework/device_worker.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/trainer.h"
namespace paddle {
namespace framework {
......
......@@ -97,6 +97,7 @@ message AsyncConfig {
optional int32 thread_pool_size = 6 [ default = 1 ];
optional int32 send_wait_times = 7 [ default = 1 ];
optional bool runtime_split_send_recv = 8 [ default = false ];
optional bool launch_barrier = 9 [ default = true ];
}
message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; }
......@@ -127,6 +128,7 @@ message DistributedStrategy {
optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];
optional bool adaptive_localsgd = 24 [ default = false ];
optional bool fp16_allreduce = 25 [ default = false ];
optional RecomputeConfig recompute_configs = 101;
optional AMPConfig amp_configs = 102;
......
......@@ -11,10 +11,17 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/dlpack_tensor.h"
#include <unordered_map>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/dlpack_tensor.h"
namespace paddle {
namespace platform {
struct bfloat16;
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
......
......@@ -15,11 +15,14 @@
#pragma once
#include <dlpack/dlpack.h>
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace framework {
class Tensor;
class DLPackTensor {
public:
using LaneType = decltype(::DLTensor::dtype.lanes); // uint16_t
......
......@@ -17,6 +17,12 @@
#include <gtest/gtest.h>
#include <vector>
namespace paddle {
namespace platform {
struct float16;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace framework {
......
......@@ -13,10 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace framework {
class LoDTensor;
class Variable;
} // namespace framework
} // namespace paddle
#if defined _WIN32 || defined __APPLE__
#else
......
......@@ -12,18 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <set>
#include <unordered_map>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/lodtensor_printer.h"
namespace paddle {
namespace framework {
class OpDesc;
class OperatorBase;
class ProgramDesc;
bool HasDependentOutput(const OpDesc& op_desc,
const std::unordered_set<std::string>& dependent_vars) {
for (auto& var : op_desc.Outputs()) {
......
......@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
#include <stdint.h>
#include "paddle/fluid/framework/tensor.h"
#include "unsupported/Eigen/CXX11/Tensor"
......
......@@ -19,6 +19,7 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/garbage_collector.h"
......@@ -32,6 +33,11 @@ limitations under the License. */
namespace paddle {
namespace framework {
class Dataset;
class ProgramDesc;
class Scope;
class TrainerBase;
struct ExecutorPrepareContext {
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
......
......@@ -13,16 +13,19 @@
// limitations under the License.
#include "paddle/fluid/framework/executor_gc_helper.h"
#include <deque>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......
......@@ -18,6 +18,7 @@
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
......@@ -26,6 +27,10 @@ namespace paddle {
namespace framework {
// Result map: op -> variable names that can be deleted after op runs
class GarbageCollector;
class OperatorBase;
class Scope;
std::unordered_map<const OperatorBase *, std::vector<std::string>>
GetUnusedVars(const BlockDesc &block,
const std::vector<std::unique_ptr<OperatorBase>> &ops,
......
......@@ -13,16 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h"
#include <string>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
class LoDTensor;
class Variable;
void SetFeedVariable(Scope* scope, const LoDTensor& input,
const std::string& var_name, size_t index) {
// If var_name Variable is not found in GlobalScope, a new variable will
......
......@@ -15,12 +15,16 @@ limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
class LoDTensor;
class Scope;
void SetFeedVariable(Scope* scope, const LoDTensor& input,
const std::string& var_name, size_t index);
......
......@@ -29,12 +29,6 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include <algorithm>
#include <utility>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/timer.h"
namespace paddle {
namespace framework {
......@@ -751,7 +745,57 @@ void FleetWrapper::PushDenseVarsAsync(
push_sparse_status->push_back(std::move(status));
}
}
#endif
#ifdef PADDLE_WITH_XPU
void FleetWrapper::PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
std::vector<::std::future<int32_t>>* push_sparse_status,
float scale_datanorm, int batch_size,
const paddle::platform::Place& place) {
#ifdef PADDLE_WITH_PSLIB
std::vector<paddle::ps::Region> regions;
for (auto& t : var_names) {
Variable* var = scope.FindVar(t);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
int count = tensor->numel();
float* g_data = tensor->data<float>();
Variable* pin_var = scope.FindVar(t + "pin");
LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
float* pin_g =
pin_tensor->mutable_data<float>(tensor->dims(), platform::CPUPlace());
memory::Copy(platform::CPUPlace(), pin_g,
BOOST_GET_CONST(platform::XPUPlace, place), g_data,
sizeof(float) * count);
float* g = pin_g;
if (scale_datanorm >= 0) {
if (t.find(".batch_size@GRAD") != std::string::npos ||
t.find(".batch_sum@GRAD") != std::string::npos) {
Eigen::Map<Eigen::MatrixXf> mat(g, 1, count);
float scale = 1.0 / batch_size;
mat *= scale;
} else if (t.find(".batch_square_sum@GRAD") != std::string::npos) {
VLOG(3) << "epsilon: " << scale_datanorm;
for (int i = 0; i < count; ++i) {
g[i] = (g[i] - batch_size * scale_datanorm) / batch_size +
batch_size * scale_datanorm;
}
}
}
paddle::ps::Region reg(g, count);
regions.emplace_back(std::move(reg));
}
auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(),
regions.size(), table_id);
if (push_sparse_status) {
push_sparse_status->push_back(std::move(status));
}
#endif
}
#endif
void FleetWrapper::PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
......
......@@ -35,6 +35,12 @@ limitations under the License. */
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
......@@ -154,6 +160,14 @@ class FleetWrapper {
float scale_datanorm, int batch_size,
const paddle::platform::Place& place, cudaStream_t stream,
cudaEvent_t event);
#endif
#ifdef PADDLE_WITH_XPU
void PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
std::vector<::std::future<int32_t>>* push_sparse_status,
float scale_datanorm, int batch_size,
const paddle::platform::Place& place);
#endif
void PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
......
......@@ -10,10 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/string/string_helper.h"
namespace gloo {
......
......@@ -27,15 +27,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/fleet/heter_wrapper.h"
#include <algorithm>
#include <utility>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/timer.h"
#ifdef PADDLE_WITH_PSLIB
namespace paddle {
......@@ -122,29 +113,66 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
if (platform::is_cpu_place(tensor->place())) {
memcpy(data_ptr, tensor->data<void>(),
tensor->numel() * SizeOfType(tensor->type()));
}
} else {
#ifdef PADDLE_WITH_CUDA
else {
memory::Copy(platform::CPUPlace(), data_ptr,
BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
tensor->data<void>(),
tensor->numel() * SizeOfType(tensor->type()), nullptr);
}
#endif
#ifdef PADDLE_WITH_XPU
memory::Copy(platform::CPUPlace(), data_ptr,
BOOST_GET_CONST(platform::XPUPlace, tensor->place()),
tensor->data<void>(),
tensor->numel() * SizeOfType(tensor->type()));
#endif
}
}
// void HeterWrapper::DeSerializeToTensor(Scope* scope,
// const HeterRequest* request) {
#ifdef PADDLE_WITH_CUDA
void HeterWrapper::DeSerializeToTensor(Scope* scope,
const VariableMessage& req_var,
platform::Place place,
cudaStream_t stream) {
// const VariableMessage& req_var = request->vars();
auto* var = scope->FindVar(req_var.varname());
auto* tensor = var->GetMutable<LoDTensor>();
std::vector<int> vec_dim;
for (auto& x : req_var.dims()) {
vec_dim.push_back(x);
}
tensor->Resize(make_ddim(vec_dim));
LoD lod;
for (int i = 0; i < req_var.lod_level(); ++i) {
framework::Vector<size_t> v;
for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) {
v.push_back(req_var.lod(i).lod_data(j));
}
lod.push_back(v);
}
tensor->set_lod(lod);
void* tensor_data =
tensor->mutable_data(place, ToVarType(req_var.data_type()));
#ifdef PADDLE_WITH_CUDA
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
platform::CPUPlace(), req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()), stream);
#else
memcpy(tensor_data, req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()));
#endif
}
#endif
// void HeterWrapper::DeSerializeToTensor(Scope* scope,
// const HeterRequest* request) {
void HeterWrapper::DeSerializeToTensor(Scope* scope,
const VariableMessage& req_var,
platform::Place place) {
#endif
// const VariableMessage& req_var = request->vars();
auto* var = scope->FindVar(req_var.varname());
auto* tensor = var->GetMutable<LoDTensor>();
......@@ -168,10 +196,10 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
void* tensor_data =
tensor->mutable_data(place, ToVarType(req_var.data_type()));
#ifdef PADDLE_WITH_CUDA
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
#ifdef PADDLE_WITH_XPU
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place), tensor_data,
platform::CPUPlace(), req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()), stream);
tensor->numel() * SizeOfType(tensor->type()));
#else
memcpy(tensor_data, req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()));
......@@ -192,7 +220,8 @@ framework::proto::VarType::Type HeterWrapper::ToVarType(
case VariableMessage::BOOL:
return framework::proto::VarType::BOOL; // NOLINT
default:
VLOG(0) << "Not support type " << type;
PADDLE_THROW(platform::errors::InvalidArgument(
"ToVarType:Unsupported type %d", type));
}
}
......@@ -239,7 +268,7 @@ void HeterWrapper::CallRemoteXpu(std::shared_ptr<HeterTask> task,
request.set_cur_batch(task->cur_batch_);
OnHeterRpcDone* done = new OnHeterRpcDone([this, task, worker](void* done) {
auto* closure = (OnHeterRpcDone*)done;
auto* closure = reinterpret_cast<OnHeterRpcDone*>(done);
if (closure->cntl.Failed()) {
VLOG(0) << "call xpu fail: " << closure->cntl.ErrorText();
} else {
......
......@@ -13,9 +13,6 @@
// limitations under the License.
#include "paddle/fluid/framework/fleet/nccl_wrapper.h"
#include <utility>
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册