Commit 1f12ba61 authored by peizhilin

GPU support, fix build issues:

1. Non-UTF-8 characters in the comments of some OPs may cause protobuf to fail in parse_from_string.
2. Comment out some ops that are not supported on Windows.
3. CUDA libraries may not be correctly linked to the target on Windows.
Parent 71d7980f
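All of the Python-side changes below follow one pattern: imports, layer definitions, and environment flags that rely on ops not yet available on Windows are wrapped in an os.name != 'nt' guard instead of being registered unconditionally. A minimal sketch of that guard follows, using simplified placeholder names rather than the exact Paddle symbols:

import os

# Ops that build on every platform are always exported.
__all__ = ['fc', 'embedding']

if os.name != 'nt':
    # Ops whose kernels do not yet build on Windows are defined and
    # exported only on non-Windows platforms.
    __all__.append('dynamic_lstm')

    def dynamic_lstm(input, size):
        """Placeholder body for the sketch; the real layer wraps a C++ op."""
        raise NotImplementedError

The CMake-side changes apply the same idea at link time: when WITH_GPU is on and WITH_DSO is off, the CUDA static libraries are recorded in a global CUDA_MODULES property and explicitly linked into the Windows targets (paddle_fluid_origin, paddle_fluid, paddle_pybind).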
@@ -157,6 +157,9 @@ list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
  # TODO(panyx0718): CUPTI only allows DSO?
  list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
+  if(WIN32)
+    set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
+  endif(WIN32)
endif(NOT WITH_DSO)
# setting nvcc arch flags
...
@@ -13,10 +13,14 @@ cc_library(paddle_fluid_api
    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
# paddle_fluid_origin exclude inference api interface
if(WIN32)
  sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
+  if(WITH_GPU AND NOT WITH_DSO)
+    target_link_libraries(paddle_fluid_origin ${cuda_modules})
+  endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32)
  cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
endif(WIN32)
@@ -36,6 +40,9 @@ endif()
# Create static library
if(WIN32)
  sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
+  if(WITH_GPU AND NOT WITH_DSO)
+    target_link_libraries(paddle_fluid ${cuda_modules})
+  endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32)
  cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
endif(WIN32)
@@ -50,6 +57,9 @@ endif()
if(WIN32)
  sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
      DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array)
+  if(WITH_GPU AND NOT WITH_DSO)
+    target_link_libraries(paddle_fluid_origin ${cuda_modules})
+  endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32)
  cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
      DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array)
...
@@ -74,7 +74,7 @@ PadConstantLikeOp Operator.
Pad input(Y) with a pad_value, the number of values padded to the edges of each
axis is specified by the difference of the shape of X and Y.
-((0, shape_x_0 - shape_y_0), (0, shape_x_n - shape_y_n)) unique pad widths for
+((0, shape_x_0 - shape_y_0), ... (0, shape_x_n - shape_y_n)) unique pad widths for
each axis.
The input should be a k-D tensor(k > 0 and k < 7). As an example:
...
@@ -122,7 +122,7 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor), "
              "Argmaxes corresponding to indices in X used "
              "for gradient computation. Only output "
-             "if arg “is_test” is false.")
+             "if arg \"is_test\" is false.")
        .AsIntermediate();
    AddAttr<float>("spatial_scale",
                   "(float, default 1.0), "
...
@@ -57,8 +57,8 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is:
$(N, C_{out}, H_{out}, W_{out})$, where
$$
-H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\
-W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1]
+H_{out} = (H_{in}-1) * strides[0] - 2 * paddings[0] + ksize[0] \\
+W_{out} = (W_{in}-1) * strides[1] - 2 * paddings[1] + ksize[1]
$$
Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf
)DOC");
...
@@ -22,6 +22,10 @@ if(WITH_PYTHON)
endif(WITH_AMD_GPU)
if(WIN32)
+  if(WITH_GPU AND NOT WITH_DSO)
+    get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
+    target_link_libraries(paddle_pybind ${cuda_modules})
+  endif(WITH_GPU AND NOT WITH_DSO)
  target_link_libraries(paddle_pybind shlwapi)
endif(WIN32)
...
@@ -61,12 +61,13 @@ IF(WIN32)
add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
  # COMMAND ${CMAKE_COMMAND} -E touch stub.cc
  COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/libs
  COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle
  COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/
  COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
  COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
-  COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-  COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_PYTHON_BUILD_DIR}/libs ${PADDLE_PYTHON_BUILD_DIR}/lib-python
+  # COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/libs
+  # COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_PYTHON_BUILD_DIR}/libs ${PADDLE_PYTHON_BUILD_DIR}/libs
  DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
ELSE(WIN32)
add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
...
@@ -13,6 +13,7 @@
# limitations under the License.
from __future__ import print_function
+import os
# import all class inside framework into fluid module
from . import framework
from .framework import *
@@ -43,16 +44,17 @@ from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
from . import clip
from . import profiler
from . import unique_name
-from . import recordio_writer
-from . import parallel_executor
-from .parallel_executor import *
+if os.name != 'nt':
+    from . import recordio_writer
+    from . import parallel_executor
+    from .parallel_executor import *
from paddle.fluid.layers.math_op_patch import monkey_patch_variable
Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + \
    trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
-    parallel_executor.__all__ + lod_tensor.__all__ + [
+    lod_tensor.__all__ + [
        'io',
        'initializer',
        'layers',
@@ -78,7 +80,8 @@ __all__ = framework.__all__ + executor.__all__ + \
        'recordio_writer',
        'Scope',
    ]
+if os.name != 'nt':
+    __all__ += parallel_executor.__all__
def __bootstrap__():
    """
@@ -110,12 +113,16 @@ def __bootstrap__():
    os.environ['OMP_NUM_THREADS'] = str(num_threads)
    read_env_flags = [
-        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
+        'use_pinned_memory', 'check_nan_inf', 'benchmark',
        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb',
+        'dist_threadpool_size', 'eager_delete_tensor_gb',
        'reader_queue_speed_test_mode'
    ]
+    if os.name != 'nt':
+        read_env_flags.append('warpctc_dir')
+        read_env_flags.append('cpu_deterministic')
    if core.is_compiled_with_dist():
        read_env_flags.append('rpc_deadline')
        read_env_flags.append('rpc_server_profile_period')
...
@@ -15,13 +15,15 @@
from __future__ import print_function
import contextlib
+import os
from .. import core
from .. import executor
from .. import framework
from .. import io
-from .. import parallel_executor
+if os.name != 'nt':
+    from .. import parallel_executor
from .. import unique_name
from .trainer import check_and_get_place
...
@@ -28,7 +28,8 @@ from .. import framework
from .. import io
# optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module
from .. import optimizer as opt_module
-from .. import parallel_executor
+if os.name != 'nt':
+    from .. import parallel_executor
from ..transpiler import distribute_transpiler
__all__ = [
...
@@ -536,7 +536,7 @@ class Operator(object):
    OP_WITHOUT_KERNEL_SET = {
        'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
        'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
-        'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
+        'listen_and_serv', 'parallel_do', 'save_combine', 'loadload_combine',
        'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id'
    }
...
@@ -15,6 +15,7 @@
from __future__ import print_function
import contextlib
import multiprocessing
+import os
import six
import threading
@@ -344,8 +345,9 @@ def _copy_reader_create_op_(block, op):
    return new_op
-@templatedoc(op_type='create_recordio_file_reader')
-def open_recordio_file(filename,
+if os.name != 'nt':
+    @templatedoc(op_type='create_recordio_file_reader')
+    def open_recordio_file(filename,
                           shapes,
                           lod_levels,
                           dtypes,
...
@@ -18,6 +18,7 @@ All layers just related to the neural network.
from __future__ import print_function
import numpy as np
+import os
from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant
from ..framework import Variable, OpProtoHolder
@@ -31,12 +32,10 @@ from functools import reduce
__all__ = [
    'fc',
    'embedding',
-    'dynamic_lstm',
    'dynamic_lstmp',
    'dynamic_gru',
    'gru_unit',
    'linear_chain_crf',
-    'crf_decoding',
    'cos_sim',
    'cross_entropy',
    'square_error_cost',
@@ -95,7 +94,6 @@ __all__ = [
    'pad',
    'pad_constant_like',
    'label_smooth',
-    'roi_pool',
    'roi_align',
    'dice_loss',
    'image_resize',
@@ -160,6 +158,10 @@ __all__ = [
    'log_loss',
    'add_position_encoding',
]
+if os.name != 'nt':
+    __all__.append('dynamic_lstm')
+    __all__.append('crf_decoding')
+    __all__.append('roi_pool')
def fc(input,
@@ -334,8 +336,9 @@ def embedding(input,
    return tmp
-@templatedoc(op_type="lstm")
-def dynamic_lstm(input,
+if os.name != 'nt':
+    @templatedoc(op_type="lstm")
+    def dynamic_lstm(input,
                     size,
                     h_0=None,
                     c_0=None,
@@ -923,8 +926,9 @@ def linear_chain_crf(input, label, param_attr=None):
    return log_likelihood
-@templatedoc()
-def crf_decoding(input, param_attr, label=None):
+if os.name != 'nt':
+    @templatedoc()
+    def crf_decoding(input, param_attr, label=None):
        """
        ${comment}
@@ -5443,8 +5447,9 @@ def label_smooth(label,
    return smooth_label
-@templatedoc()
-def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
+if os.name != 'nt':
+    @templatedoc()
+    def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
        """
        ${comment}
...
@@ -13,6 +13,7 @@
# limitations under the License.
from __future__ import print_function
+import os
from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr
from .. import core
from ..framework import convert_np_dtype_to_dtype_
@@ -99,12 +100,13 @@ Examples:
    >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3)
"""
-__all__ += ['cumsum']
+if os.name != 'nt':
+    __all__ += ['cumsum']
_cum_sum_ = generate_layer_fn('cumsum')
def cumsum(x, axis=None, exclusive=None, reverse=None):
    locals_var = locals().keys()
    kwargs = dict()
    for name in locals_var:
@@ -114,12 +116,12 @@ def cumsum(x, axis=None, exclusive=None, reverse=None):
    return _cum_sum_(**kwargs)
cumsum.__doc__ = _cum_sum_.__doc__ + """
Examples:
    >>> data = fluid.layers.data(name="input", shape=[32, 784])
    >>> result = fluid.layers.cumsum(data, axis=0)
"""
__all__ += ['thresholded_relu']
...
@@ -180,7 +180,8 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
        package_data['paddle.libs']+=['libmkldnn.so.0']
        shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
# remove unused paddle/libs/__init__.py
-os.remove(libs_path+'/__init__.py')
+if os.path.isfile(libs_path+'/__init__.py'):
+    os.remove(libs_path+'/__init__.py')
package_dir['paddle.libs']=libs_path
# change rpath of core.so, add $ORIGIN/../libs/ to it.
...