提交 62a17f50 编写于 作者: C chenweihang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into unsqueeze_op

......@@ -23,7 +23,7 @@ ENV HOME /root
COPY ./paddle/scripts/docker/root/ /root/
RUN apt-get update && \
apt-get install -y --allow-downgrades \
apt-get install -y --allow-downgrades patchelf \
git python-pip python-dev python-opencv openssh-server bison \
libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
......
......@@ -257,8 +257,8 @@ function(cc_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
......@@ -324,8 +324,8 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
add_test(${TARGET_NAME} ${TARGET_NAME})
if (nv_test_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
......
......@@ -14,6 +14,15 @@ DistributeTranspiler
:members:
:noindex:
.. _api_fluid_transpiler_InferenceTranspiler:
InferenceTranspiler
-------------------
.. autoclass:: paddle.fluid.transpiler.InferenceTranspiler
:members:
:noindex:
.. _api_fluid_transpiler_memory_optimize:
memory_optimize
......
# Distributed Training with NCCL2
We design a pattern that can enable training with `ParallelExecutor` and
using [NCCL2](https://developer.nvidia.com/nccl) as it's collective
communication library.
In `ParallelExecutor` we can use `AllReduce` or `Reduce` and `Broadcast`
to do multi GPU training. And if we initialize NCCL2 communicators as
ranks in a distributed environment, we can simply run the `ParallelExecutor`
as a distributed program! The only thing that may be different than in
the single node version is that we need to broadcast the NCCL unique ID
to all the nodes, and initialize communicators using that ID, so NCCL2
will know each other as ranks.
To achieve this feature, we introduce a new operator: `gen_nccl_id` op,
so we are ***not*** "bind to" running NCCL2 with MPI, we can run it in
what ever platform you like.
It have two running modes:
1. Generate and broadcast mode, which should be used on trainer 0;
1. Listen and fetch mode, which should be used on trainers other than 0.
In both two modes, this op can save the NCCL ID into current scope as a
persistable variable, Then we can insert this op at the end of
"startup program" of fluid, so that all workers can get the same ID to
initialize NCCL communicator objects.
<img src="src/ncc2_design.png">
The above figure indicates the general process when training with NCCL2
distributed. Each trainer have the number of communicators equal to the
number of GPUs, but the ranks should match the global ranks number: here
we have total 8 GPUs, so `nranks==8`, for each trainer, the ranks should
be from 0 ~ 3 on trainer 0 and 4 ~ 7 on trainer 1.
......@@ -119,6 +119,32 @@ optimization algorithm $f$ runs on the storage service.
- Con: the storage service needs to be able to run the optimization
algorithm.
## Distributed Sparse Table in Fluid
For another design, we can implement a distributed sparse table in Fluid,
and don't need to maintain an external storage component while training.
You may need to read Fluid [Distributed Training Architecture](./distributed_architecture.md)
and [Parameter Server](./parameter_server.md) before going on.
![fluid lookup remote table](./src/fluid_lookup_remote_table.png)
Partition a large table into multiple pserver instances
1. `DistributeTranspiler` would split the table partitioned into some small
table blocks with some partitioned algorithms such as
[RoundRobin](https://en.wikipedia.org/wiki/Round-robin_scheduling),
[Hash](https://en.wikipedia.org/wiki/Hash) and etc...
1. For some cases, the range of input `Ids` is very wide and unpredictable, so the sparse
table would be able to fill a new value for the id that didn't appear before with
zero, uniform random or Gaussian distribution.
For each Trainer's training process:
1. In the forward pass, we use `pre-fetch` op to pre-fetch parameter blocks according to the
input `Ids` from PServers instead of the local `lookup_table` op, and then merge the blocks
into a parameter `W`.
1. Compute `GRAD@W'` in the backward pass using the pre-fetched `W` and send it to PServer to
execute the optimize pass.
## Conclusion
Let us do the "storage service does not optimize" solution first, as a
......
......@@ -22,9 +22,9 @@
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
......
......@@ -21,10 +21,10 @@ endif()
cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context init)
nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context tensor)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
......@@ -38,7 +38,7 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
nv_test(data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry init math_function)
DEPS operator op_registry device_context math_function)
if(WITH_GPU)
nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
......@@ -63,7 +63,7 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
......@@ -101,14 +101,14 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece operator)
cc_test(init_test SRCS init_test.cc DEPS init)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
# cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
conditional_block_op while_op assign_op print_op executor proto_desc)
# disable test temporarily.
# TODO https://github.com/PaddlePaddle/Paddle/issues/11971
# cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
# channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
# conditional_block_op while_op assign_op print_op executor proto_desc)
......@@ -14,13 +14,13 @@ limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise_op_function.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
namespace paddle {
namespace framework {
......
......@@ -17,9 +17,9 @@
#include <stdio.h>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/place.h"
__global__ void test(size_t* a, int size) {
......
......@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/init.h"
namespace paddle {
namespace framework {
......
set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor )
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library(paddle_fluid_api
......
......@@ -54,4 +54,5 @@ It can be used as a helper class that draws the modified graph after each pass.
There is some helper legacy/function/class for analysis.
- [dot.h](./dot.h) give a easy to use interface for generating `DOT` codes,
- [graph_traits.h](./graph_traits.h) contains the graph traversal algorithms, it uses `iterator` to make the algorithms easy to share across different passes.
- [graph_traits.h](./graph_traits.h) contains the interfaces of the graph traversal algorithms, it uses `iterator`to make the algorithms easy to share across different passes,
there are some implementations in [data_flow_graph.cc](./data_flow_graph.cc) , such as BFS and DFS..
......@@ -32,19 +32,6 @@ class Pass {
public:
Pass() = default;
virtual ~Pass() = default;
// Virtual method overridden by subclasses to do only necessary initialization
// before any pass is run.
// virtual bool Initialize() { return false; }
// There is some passes such as FlowToDataFlowGraphPass that needs a
// ProgramDesc. Here use the native ProgramDesc ProtoBuf message, so that it
// only couple with the proto file.
// virtual bool Initialize(const framework::proto::ProgramDesc &desc) { return
// false; }
// There are some Passes such as DataFlowGraphToFluidPass that will output a
// ProgramDesc.
// virtual bool Initialize(framework::proto::ProgramDesc *desc) { return
// false; }
// Mutable Pass.
virtual bool Initialize(Argument *argument) { return false; }
// Readonly Pass.
......
......@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/pybind/pybind.h"
DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
......@@ -33,7 +33,7 @@ namespace inference {
void Init(const std::vector<std::string> argv) {
framework::InitGflags(argv);
operators::math::SetNumThreads(FLAGS_math_num_threads);
platform::SetNumThreads(FLAGS_math_num_threads);
// init devices
std::vector<int> devices;
std::string token;
......
......@@ -18,9 +18,9 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/init.h"
namespace paddle {
namespace inference {
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/cpu_helper.h"
#ifdef PADDLE_WITH_MKLML
#include <omp.h>
#endif
......@@ -164,7 +164,7 @@ TEST(inference, nlp) {
// only use 1 thread number per std::thread
omp_set_dynamic(0);
omp_set_num_threads(1);
paddle::operators::math::SetNumThreads(1);
paddle::platform::SetNumThreads(1);
#endif
double start_ms = 0, stop_ms = 0;
......
......@@ -5,7 +5,7 @@ if(WITH_GRPC)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
cares zlib protobuf sendrecvop_grpc SERIAL)
cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc
grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
proto_desc lookup_table_op SERIAL)
......
......@@ -54,13 +54,13 @@ math_library(softmax DEPS math_function)
math_library(unpooling)
math_library(vol2col)
cc_test(math_function_test SRCS math_function_test.cc)
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function)
cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
cc_test(im2col_test SRCS im2col_test.cc DEPS im2col)
cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col)
cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
if(WITH_GPU)
nv_test(math_function_gpu_test SRCS math_function_test.cu)
nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function)
nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor math_function)
endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat)
......@@ -23,41 +23,12 @@
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#ifdef LAPACK_FOUND
#include <lapacke.h>
#endif
#endif
#ifndef LAPACK_FOUND
extern "C" {
#include <cblas.h> // NOLINT
int LAPACKE_sgetrf(int matrix_layout, int m, int n, float* a, int lda,
int* ipiv);
int LAPACKE_dgetrf(int matrix_layout, int m, int n, double* a, int lda,
int* ipiv);
int LAPACKE_sgetri(int matrix_layout, int n, float* a, int lda,
const int* ipiv);
int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
const int* ipiv);
}
#endif
namespace paddle {
namespace operators {
namespace math {
static void SetNumThreads(int num_threads) {
#ifdef PADDLE_USE_OPENBLAS
int real_num_threads = num_threads > 1 ? num_threads : 1;
openblas_set_num_threads(real_num_threads);
#elif defined(PADDLE_WITH_MKLML)
int real_num_threads = num_threads > 1 ? num_threads : 1;
platform::dynload::MKL_Set_Num_Threads(real_num_threads);
#else
PADDLE_ENFORCE(false, "To be implemented.");
#endif
}
/**
* Matrix Descriptor of a memory buffer.
*
......
......@@ -19,23 +19,6 @@ limitations under the License. */
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#ifdef LAPACK_FOUND
#include <lapacke.h>
#endif
#endif
#ifndef LAPACK_FOUND
extern "C" {
#include <cblas.h> // NOLINT
int LAPACKE_sgetrf(int matrix_layout, int m, int n, float* a, int lda,
int* ipiv);
int LAPACKE_dgetrf(int matrix_layout, int m, int n, double* a, int lda,
int* ipiv);
int LAPACKE_sgetri(int matrix_layout, int n, float* a, int lda,
const int* ipiv);
int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
const int* ipiv);
}
#endif
#include <cmath>
......
......@@ -19,7 +19,6 @@ limitations under the License. */
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
......@@ -27,6 +26,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/place.h"
USE_NO_KERNEL_OP(ncclInit);
......
......@@ -28,6 +28,9 @@ cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
add_subdirectory(dynload)
cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce)
cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper)
IF(WITH_GPU)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
ELSE()
......@@ -42,10 +45,12 @@ ENDIF()
# memcpy depends on device_context, here add deps individually for
# avoiding cycle dependencies
cc_library(device_context SRCS device_context.cc DEPS malloc
place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
cc_library(device_context SRCS device_context.cc init.cc DEPS malloc
place eigen3 stringpiece cpu_helper ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
cc_test(init_test SRCS init_test.cc DEPS device_context)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
......@@ -53,5 +58,5 @@ cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framewo
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
nv_test(float16_gpu_test SRCS float16_test.cu)
cc_test(float16_test SRCS float16_test.cc)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#endif
namespace paddle {
namespace platform {
void SetNumThreads(int num_threads) {
#ifdef PADDLE_USE_OPENBLAS
int real_num_threads = num_threads > 1 ? num_threads : 1;
openblas_set_num_threads(real_num_threads);
#elif defined(PADDLE_WITH_MKLML)
int real_num_threads = num_threads > 1 ? num_threads : 1;
platform::dynload::MKL_Set_Num_Threads(real_num_threads);
#else
PADDLE_ENFORCE(false, "To be implemented.");
#endif
}
} // namespace platform
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stddef.h>
namespace paddle {
namespace platform {
//! Set the number of threads in use.
void SetNumThreads(int num_threads);
} // namespace platform
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/cpu_helper.h"
#include "gtest/gtest.h"
TEST(CpuHelper, SetNumThread) {
paddle::platform::SetNumThreads(1);
paddle::platform::SetNumThreads(4);
}
......@@ -69,19 +69,3 @@ TEST(Device, DeviceContextPool) {
ASSERT_NE(dev_ctx, nullptr);
}
}
int main(int argc, char** argv) {
std::vector<paddle::platform::Place> places;
places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::CUDAPlace(i));
}
VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Init(places);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -36,8 +36,6 @@ DEFINE_string(cuda_dir, "",
DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
DEFINE_string(nccl_dir, "",
"Specify path for loading nccl library, such as libcublas, "
"libcurand. For instance, /usr/local/cuda/lib64. If default, "
......@@ -189,14 +187,6 @@ void* GetWarpCTCDsoHandle() {
#endif
}
void* GetLapackDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib");
#else
return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so");
#endif
}
void* GetNCCLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib");
......
......@@ -23,7 +23,6 @@ void* GetCUDNNDsoHandle();
void* GetCUPTIDsoHandle();
void* GetCurandDsoHandle();
void* GetWarpCTCDsoHandle();
void* GetLapackDsoHandle();
void* GetNCCLDsoHandle();
void* GetTensorRtDsoHandle();
void* GetMKLMLDsoHandle();
......
......@@ -13,8 +13,8 @@ limitations under the License. */
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/init.h"
namespace paddle {
namespace platform {
......
......@@ -16,10 +16,10 @@ limitations under the License. */
#include <stdexcept>
#include <string>
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/piece.h"
......@@ -115,7 +115,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
places.emplace_back(platform::CPUPlace());
platform::DeviceContextPool::Init(places);
#ifndef PADDLE_WITH_MKLDNN
operators::math::SetNumThreads(1);
platform::SetNumThreads(1);
#endif
}
......
......@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
TEST(InitDevices, CPU) {
using paddle::framework::InitDevices;
......
......@@ -2,13 +2,13 @@ if(WITH_PYTHON)
if(WITH_AMD_GPU)
hip_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method
DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method
parallel_executor
${GLOB_OP_LIB})
else()
cc_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method
DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method
parallel_executor
${GLOB_OP_LIB})
if(NOT APPLE AND NOT ANDROID)
......
......@@ -24,7 +24,6 @@ limitations under the License. */
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
......@@ -36,6 +35,7 @@ limitations under the License. */
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/pybind/const_value.h"
......
......@@ -83,6 +83,13 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) {
tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...));
}
template <typename... Args>
std::string Sprintf(const Args&... args) {
std::ostringstream oss;
Fprintf(oss, "");
return oss.str();
}
template <typename... Args>
std::string Sprintf(const char* fmt, const Args&... args) {
std::ostringstream oss;
......
......@@ -27,4 +27,5 @@ TEST(StringPrintf, StringPrintf) {
EXPECT_EQ(std::string("Wednesday, July 27, 14:44"),
paddle::string::Sprintf("%s, %s %d, %.2d:%.2d", weekday, month, day,
hour, min));
EXPECT_EQ(std::string(""), paddle::string::Sprintf());
}
......@@ -15,11 +15,11 @@
#include <fstream>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
......
......@@ -318,7 +318,7 @@ function assert_api_not_changed() {
virtualenv .env
source .env/bin/activate
pip install ${PADDLE_ROOT}/build/python/dist/*whl
curl ${PADDLE_API_SPEC_URL:-https://raw.githubusercontent.com/reyoung/FluidAPISpec/master/API.spec} \
curl ${PADDLE_API_SPEC_URL:-https://raw.githubusercontent.com/PaddlePaddle/FluidAPISpec/master/API.spec} \
> origin.spec
python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
python ${PADDLE_ROOT}/tools/diff_api.py origin.spec new.spec
......
......@@ -6,6 +6,6 @@ if(WITH_TESTING)
add_library(paddle_test_util STATIC TestUtil.cpp)
add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
if(NOT MOBILE_INFERENCE)
cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init memory gtest gflags)
cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags)
endif()
endif()
......@@ -16,8 +16,8 @@ limitations under the License. */
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/init.h"
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
......
......@@ -552,10 +552,10 @@ class DistributeTranspiler(object):
op_on_pserver = True
new_outputs[key] = pserver_vars[op.output(key)[0]]
if op_on_pserver:
# most startup program ops have no inputs
new_inputs = self._get_input_map_from_op(pserver_vars, op)
if op_on_pserver:
if op.type in [
"gaussian_random", "fill_constant", "uniform_random"
]:
......
......@@ -19,7 +19,7 @@ from ..framework import Program
from ..executor import global_scope
class InferenceTranspiler:
class InferenceTranspiler(object):
'''
Convert the fluid program to optimized inference program.
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# used for setup.py.in to store the thirdparty shared libraries
from setuptools import setup, Distribution, Extension
import subprocess
import shutil
import os
class BinaryDistribution(Distribution):
def has_ext_modules(foo):
return True
......@@ -62,6 +64,7 @@ write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py')
packages=['paddle',
'paddle.libs',
'paddle.utils',
'paddle.dataset',
'paddle.reader',
......@@ -114,11 +117,34 @@ package_dir={
if '${WITH_FLUID_ONLY}'== 'OFF':
package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle'
paddle_rt_lib_dir = 'lib'
paddle_rt_libs = ['${WARPCTC_LIBRARIES}']
if '${MKL_SHARED_LIBS}'!= '':
paddle_rt_libs += '${MKL_SHARED_LIBS}'.split(';')
# put all thirdparty libraries in paddle.libs
package_data['paddle.libs']=['libwarpctc.so']
libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_LIB}', libs_path)
shutil.copy('${MKLML_IOMP_LIB}', libs_path)
package_data['paddle.libs']+=['libmklml_intel.so','libiomp5.so']
if '${WITH_MKLDNN}' == 'ON':
# change rpath of libmkldnn.so.0, add $ORIGIN/ to it.
# The reason is that all thirdparty libraries in the same directory,
# thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so.
command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
if os.system(command) != 0:
raise Exception("patchelf --set-rpath for libmkldnn.so.0 fails")
package_data['paddle.libs']+=['libmkldnn.so.0']
shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
# remove unused paddle/libs/__init__.py
os.remove(libs_path+'/__init__.py')
package_dir['paddle.libs']=libs_path
# change rpath of core.so, add $ORIGIN/../libs/ to it.
# The reason is that libwarpctc.so, libiomp5.so etc are in paddle.libs, and
# core.so is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries.
# This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213
command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
if os.system(command) != 0:
raise Exception("patchelf --set-rpath for core.so fails")
setup(name='${PACKAGE_NAME}',
version='${PADDLE_VERSION}',
......@@ -128,6 +154,5 @@ setup(name='${PACKAGE_NAME}',
ext_modules=[Extension('_foo', ['stub.cc'])],
package_data=package_data,
package_dir=package_dir,
scripts=paddle_bins,
data_files=[(paddle_rt_lib_dir, paddle_rt_libs)]
scripts=paddle_bins
)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册