gpu support, fix build issue:

1. Non-UTF-8 characters in the comments of ops can cause protobuf's parse_from_string to fail. 2. Comment out some ops that are not supported on Windows. 3. CUDA libs may not be linked correctly to targets on Windows.

gpu support, fix build issue:
1. Non-UTF-8 characters in the comments of ops can cause protobuf's parse_from_string to fail. 2. Comment out some ops that are not supported on Windows. 3. CUDA libs may not be linked correctly to targets on Windows.
1f12ba61 · peizhilin · 71d7980f · 1f12ba61 · 1f12ba61 · 1f12ba61
15 changed file
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -157,6 +157,9 @@ list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
 if(NOT WITH_DSO)
    # TODO(panyx0718): CUPTI only allows DSO?
    list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
+    if(WIN32)
+      set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
+    endif(WIN32)
 endif(NOT WITH_DSO)
 # setting nvcc arch flags

--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -13,10 +13,14 @@ cc_library(paddle_fluid_api
    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) 
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
 # paddle_fluid_origin exclude inference api interface
 if(WIN32)
  sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
+  if(WITH_GPU AND NOT WITH_DSO)
+    target_link_libraries(paddle_fluid_origin ${cuda_modules})
+  endif(WITH_GPU AND NOT WITH_DSO)
 else(WIN32)
  cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
 endif(WIN32)
@@ -36,6 +40,9 @@ endif()
 # Create static library
 if(WIN32)
  sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
+  if(WITH_GPU AND NOT WITH_DSO)
+    target_link_libraries(paddle_fluid ${cuda_modules})
+  endif(WITH_GPU AND NOT WITH_DSO)
 else(WIN32)
  cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
 endif(WIN32)
@@ -50,6 +57,9 @@ endif()
 if(WIN32)
  sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
          DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array)
+  if(WITH_GPU AND NOT WITH_DSO)
+    # Link the CUDA_MODULES libraries into the shared library defined just
+    # above (paddle_fluid_shared), not into paddle_fluid_origin again.
+    target_link_libraries(paddle_fluid_shared ${cuda_modules})
+  endif(WITH_GPU AND NOT WITH_DSO)
 else(WIN32)
  cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
      DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array)

--- a/paddle/fluid/operators/pad_constant_like_op.cc
+++ b/paddle/fluid/operators/pad_constant_like_op.cc
@@ -74,7 +74,7 @@ PadConstantLikeOp Operator.
 Pad input(Y) with a pad_value, the number of values padded to the edges of each
 axis is specified by the difference of the shape of X and Y.
-((0, shape_x_0 - shape_y_0), … (0, shape_x_n - shape_y_n)) unique pad widths for
+((0, shape_x_0 - shape_y_0), ... (0, shape_x_n - shape_y_n)) unique pad widths for
 each axis.
 The input should be a k-D tensor(k > 0 and k < 7). As an example:

--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -122,7 +122,7 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor), "
              "Argmaxes corresponding to indices in X used "
              "for gradient computation. Only output "
-              "if arg “is_test” is false.")
+              "if arg \"is_test\" is false.")
        .AsIntermediate();
    AddAttr<float>("spatial_scale",
                   "(float, default 1.0), "

--- a/paddle/fluid/operators/unpool_op.cc
+++ b/paddle/fluid/operators/unpool_op.cc
@@ -57,8 +57,8 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
 Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is:
 $(N, C_{out}, H_{out}, W_{out})$, where
 $$
-H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\
+H_{out} = (H_{in}-1) * strides[0] - 2 * paddings[0] + ksize[0] \\
-W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1]
+W_{out} = (W_{in}-1) * strides[1] - 2 * paddings[1] + ksize[1]
 $$
 Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf
 )DOC");

--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -22,6 +22,10 @@ if(WITH_PYTHON)
  endif(WITH_AMD_GPU)
  if(WIN32)
+    if(WITH_GPU AND NOT WITH_DSO)
+      get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
+      target_link_libraries(paddle_pybind ${cuda_modules})
+    endif(WITH_GPU AND NOT WITH_DSO)
    target_link_libraries(paddle_pybind shlwapi)
  endif(WIN32)

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -61,12 +61,13 @@ IF(WIN32)
 	add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
 #		COMMAND ${CMAKE_COMMAND} -E touch stub.cc
        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/libs
        COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle
 		COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/
 		COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
 		COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
-		COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-		COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_PYTHON_BUILD_DIR}/libs ${PADDLE_PYTHON_BUILD_DIR}/lib-python
 		DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 ELSE(WIN32)
 	add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 from __future__ import print_function
+import os
 # import all class inside framework into fluid module
 from . import framework
 from .framework import *
@@ -43,16 +44,17 @@ from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
 from . import clip
 from . import profiler
 from . import unique_name
-from . import recordio_writer
+if os.name != 'nt':
-from . import parallel_executor
+    from . import recordio_writer
-from .parallel_executor import *
+    from . import parallel_executor
+    from .parallel_executor import *
 from paddle.fluid.layers.math_op_patch import monkey_patch_variable
 Tensor = LoDTensor
 __all__ = framework.__all__ + executor.__all__ + \
    trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
-    parallel_executor.__all__ + lod_tensor.__all__ + [
+    lod_tensor.__all__ + [
        'io',
        'initializer',
        'layers',
@@ -78,7 +80,8 @@ __all__ = framework.__all__ + executor.__all__ + \
        'recordio_writer',
        'Scope',
    ]
+if os.name != 'nt':
+    __all__ += parallel_executor.__all__
 def __bootstrap__():
    """
@@ -110,12 +113,16 @@ def __bootstrap__():
    os.environ['OMP_NUM_THREADS'] = str(num_threads)
    read_env_flags = [
-        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
+        'use_pinned_memory', 'check_nan_inf', 'benchmark',
        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb',
+        'dist_threadpool_size', 'eager_delete_tensor_gb',
        'reader_queue_speed_test_mode'
    ]
+    if os.name != 'nt':
+        read_env_flags.append('warpctc_dir')
+        read_env_flags.append('cpu_deterministic')
    if core.is_compiled_with_dist():
        read_env_flags.append('rpc_deadline')
        read_env_flags.append('rpc_server_profile_period')

--- a/python/paddle/fluid/contrib/inferencer.py
+++ b/python/paddle/fluid/contrib/inferencer.py
@@ -15,13 +15,15 @@
 from __future__ import print_function
 import contextlib
+import os
 from .. import core
 from .. import executor
 from .. import framework
 from .. import io
-from .. import parallel_executor
+if os.name != 'nt':
+    from .. import parallel_executor
 from .. import unique_name
 from .trainer import check_and_get_place

--- a/python/paddle/fluid/contrib/trainer.py
+++ b/python/paddle/fluid/contrib/trainer.py
@@ -28,7 +28,8 @@ from .. import framework
 from .. import io
 # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module
 from .. import optimizer as opt_module
-from .. import parallel_executor
+if os.name != 'nt':
+    from .. import parallel_executor
 from ..transpiler import distribute_transpiler
 __all__ = [

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -536,7 +536,7 @@ class Operator(object):
    OP_WITHOUT_KERNEL_SET = {
        'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
        'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
-        'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
+        'listen_and_serv', 'parallel_do', 'save_combine', 'loadload_combine',
        'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id'
    }

--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 import contextlib
 import multiprocessing
+import os
 import six
 import threading
@@ -344,70 +345,71 @@ def _copy_reader_create_op_(block, op):
    return new_op
-@templatedoc(op_type='create_recordio_file_reader')
+if os.name != 'nt':
-def open_recordio_file(filename,
+    @templatedoc(op_type='create_recordio_file_reader')
-                       shapes,
+    def open_recordio_file(filename,
-                       lod_levels,
+                           shapes,
-                       dtypes,
+                           lod_levels,
-                       pass_num=1,
+                           dtypes,
-                       for_parallel=True):
+                           pass_num=1,
-    """
+                           for_parallel=True):
-    ${comment}
+        """
+        ${comment}
-    Args:
-       filename(${filename_type}): ${filename_comment}.
+        Args:
-       shapes(list): List of tuples which declaring data shapes.
+           filename(${filename_type}): ${filename_comment}.
-       lod_levels(${lod_levels_type}): ${lod_levels_comment}.
+           shapes(list): List of tuples which declaring data shapes.
-       dtypes(list): List of strs which declaring data type.
+           lod_levels(${lod_levels_type}): ${lod_levels_comment}.
-       pass_num(int): Number of passes to run.
+           dtypes(list): List of strs which declaring data type.
-       for_parallel(Bool): Set it as True if you are going to run
+           pass_num(int): Number of passes to run.
-            subsequent operators in parallel.
+           for_parallel(Bool): Set it as True if you are going to run
+                subsequent operators in parallel.
-    Returns:
-       ${out_comment}.
+        Returns:
+           ${out_comment}.
-    Examples:
+        Examples:
-        >>> import paddle.fluid as fluid
-        >>> reader = fluid.layers.io.open_recordio_file(
+            >>> import paddle.fluid as fluid
-        >>>                               filename='./data.recordio',
+            >>> reader = fluid.layers.io.open_recordio_file(
-        >>>                               shapes=[(3,224,224), (1)],
+            >>>                               filename='./data.recordio',
-        >>>                               lod_levels=[0, 0],
+            >>>                               shapes=[(3,224,224), (1)],
-        >>>                               dtypes=['float32', 'int64'])
+            >>>                               lod_levels=[0, 0],
-        >>> # Via the reader, we can use 'read_file' layer to get data:
+            >>>                               dtypes=['float32', 'int64'])
-        >>> image, label = fluid.layers.io.read_file(reader)
+            >>> # Via the reader, we can use 'read_file' layer to get data:
-    """
+            >>> image, label = fluid.layers.io.read_file(reader)
-    dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
+        """
-    shape_concat = []
+        dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
-    ranks = []
+        shape_concat = []
+        ranks = []
-    for shape in shapes:
-        shape_concat.extend(shape)
+        for shape in shapes:
-        ranks.append(len(shape))
+            shape_concat.extend(shape)
+            ranks.append(len(shape))
-    var_name = unique_name('open_recordio_file')
+        var_name = unique_name('open_recordio_file')
-    startup_blk = default_startup_program().current_block()
-    startup_var = startup_blk.create_var(name=var_name)
+        startup_blk = default_startup_program().current_block()
-    startup_blk.append_op(
+        startup_var = startup_blk.create_var(name=var_name)
-        type='create_recordio_file_reader',
+        startup_blk.append_op(
-        outputs={'Out': [startup_var]},
+            type='create_recordio_file_reader',
-        attrs={
+            outputs={'Out': [startup_var]},
-            'shape_concat': shape_concat,
+            attrs={
-            'lod_levels': lod_levels,
+                'shape_concat': shape_concat,
-            'filename': filename,
+                'lod_levels': lod_levels,
-            'ranks': ranks
+                'filename': filename,
-        })
+                'ranks': ranks
+            })
-    startup_var.desc.set_dtypes(dtypes)
+        startup_var.desc.set_dtypes(dtypes)
-    startup_var.persistable = True
+        startup_var.persistable = True
-    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
+        main_prog_var = _copy_reader_var_(default_main_program().current_block(),
-                                      startup_var)
+                                          startup_var)
-    if pass_num > 1:
+        if pass_num > 1:
-        main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)
+            main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)
-    return monkey_patch_reader_methods(main_prog_var)
+        return monkey_patch_reader_methods(main_prog_var)
 def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -18,6 +18,7 @@ All layers just related to the neural network.
 from __future__ import print_function
 import numpy as np
+import os
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable, OpProtoHolder
@@ -31,12 +32,10 @@ from functools import reduce
 __all__ = [
    'fc',
    'embedding',
-    'dynamic_lstm',
    'dynamic_lstmp',
    'dynamic_gru',
    'gru_unit',
    'linear_chain_crf',
-    'crf_decoding',
    'cos_sim',
    'cross_entropy',
    'square_error_cost',
@@ -95,7 +94,6 @@ __all__ = [
    'pad',
    'pad_constant_like',
    'label_smooth',
-    'roi_pool',
    'roi_align',
    'dice_loss',
    'image_resize',
@@ -160,6 +158,10 @@ __all__ = [
    'log_loss',
    'add_position_encoding',
 ]
+if os.name != 'nt':
+    __all__.append('dynamic_lstm')
+    __all__.append('crf_decoding')
+    __all__.append('roi_pool')
 def fc(input,
@@ -334,126 +336,127 @@ def embedding(input,
    return tmp
-@templatedoc(op_type="lstm")
+if os.name != 'nt':
-def dynamic_lstm(input,
+    @templatedoc(op_type="lstm")
-                 size,
+    def dynamic_lstm(input,
-                 h_0=None,
+                     size,
-                 c_0=None,
+                     h_0=None,
-                 param_attr=None,
+                     c_0=None,
-                 bias_attr=None,
+                     param_attr=None,
-                 use_peepholes=True,
+                     bias_attr=None,
-                 is_reverse=False,
+                     use_peepholes=True,
-                 gate_activation='sigmoid',
+                     is_reverse=False,
-                 cell_activation='tanh',
+                     gate_activation='sigmoid',
-                 candidate_activation='tanh',
+                     cell_activation='tanh',
-                 dtype='float32',
+                     candidate_activation='tanh',
-                 name=None):
+                     dtype='float32',
-    """
+                     name=None):
-    ${comment}
+        """
+        ${comment}
-    Args:
-        input (Variable): ${input_comment}
+        Args:
-        size (int): 4 * hidden size.
+            input (Variable): ${input_comment}
-        h_0(Variable): The initial hidden state is an optional input, default is zero.
+            size (int): 4 * hidden size.
-                       This is a tensor with shape (N x D), where N is the
+            h_0(Variable): The initial hidden state is an optional input, default is zero.
-                       batch size and D is the hidden size.
+                           This is a tensor with shape (N x D), where N is the
-        c_0(Variable): The initial cell state is an optional input, default is zero.
+                           batch size and D is the hidden size.
-                       This is a tensor with shape (N x D), where N is the
+            c_0(Variable): The initial cell state is an optional input, default is zero.
-                       batch size. `h_0` and `c_0` can be NULL but only at the same time.
+                           This is a tensor with shape (N x D), where N is the
-        param_attr(ParamAttr|None): The parameter attribute for the learnable
+                           batch size. `h_0` and `c_0` can be NULL but only at the same time.
-                               hidden-hidden weights.
+            param_attr(ParamAttr|None): The parameter attribute for the learnable
+                                   hidden-hidden weights.
-                               - Weights = {:math:`W_{ch}, W_{ih}, \
-                                                W_{fh}, W_{oh}`}
+                                   - Weights = {:math:`W_{ch}, W_{ih}, \
-                               - The shape is (D x 4D), where D is the hidden
+                                                    W_{fh}, W_{oh}`}
-                                 size.
+                                   - The shape is (D x 4D), where D is the hidden
+                                     size.
-                               If it is set to None or one attribute of ParamAttr,
-                               dynamic_lstm will create ParamAttr as param_attr.
+                                   If it is set to None or one attribute of ParamAttr,
-                               If the Initializer of the param_attr is not set, the
+                                   dynamic_lstm will create ParamAttr as param_attr.
-                               parameter is initialized with Xavier. Default: None.
+                                   If the Initializer of the param_attr is not set, the
-        bias_attr (ParamAttr|None): The bias attribute for the learnable bias
+                                   parameter is initialized with Xavier. Default: None.
-                              weights, which contains two parts, input-hidden
+            bias_attr (ParamAttr|None): The bias attribute for the learnable bias
-                              bias weights and peephole connections weights if
+                                  weights, which contains two parts, input-hidden
-                              setting `use_peepholes` to `True`.
+                                  bias weights and peephole connections weights if
+                                  setting `use_peepholes` to `True`.
-                              1. `use_peepholes = False`
-                                 - Biases = {:math:`b_c, b_i, b_f, b_o`}.
+                                  1. `use_peepholes = False`
-                                 - The shape is (1 x 4D).
+                                     - Biases = {:math:`b_c, b_i, b_f, b_o`}.
-                              2. `use_peepholes = True`
+                                     - The shape is (1 x 4D).
-                                 - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
+                                  2. `use_peepholes = True`
-                                                 W_{fc}, W_{oc}`}.
+                                     - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
-                                 - The shape is (1 x 7D).
+                                                     W_{fc}, W_{oc}`}.
+                                     - The shape is (1 x 7D).
-                              If it is set to None or one attribute of ParamAttr,
-                              dynamic_lstm will create ParamAttr as bias_attr.
+                                  If it is set to None or one attribute of ParamAttr,
-                              If the Initializer of the bias_attr is not set,
+                                  dynamic_lstm will create ParamAttr as bias_attr.
-                              the bias is initialized zero. Default: None.
+                                  If the Initializer of the bias_attr is not set,
-        use_peepholes (bool): ${use_peepholes_comment}
+                                  the bias is initialized zero. Default: None.
-        is_reverse (bool): ${is_reverse_comment}
+            use_peepholes (bool): ${use_peepholes_comment}
-        gate_activation (str): ${gate_activation_comment}
+            is_reverse (bool): ${is_reverse_comment}
-        cell_activation (str): ${cell_activation_comment}
+            gate_activation (str): ${gate_activation_comment}
-        candidate_activation (str): ${candidate_activation_comment}
+            cell_activation (str): ${cell_activation_comment}
-        dtype (str): Data type. Choices = ["float32", "float64"], default "float32".
+            candidate_activation (str): ${candidate_activation_comment}
-        name (str|None): A name for this layer(optional). If set None, the layer
+            dtype (str): Data type. Choices = ["float32", "float64"], default "float32".
-                         will be named automatically.
+            name (str|None): A name for this layer(optional). If set None, the layer
+                             will be named automatically.
-    Returns:
-        tuple: The hidden state, and cell state of LSTM. The shape of both \
+        Returns:
-        is (T x D), and lod is the same with the `input`.
+            tuple: The hidden state, and cell state of LSTM. The shape of both \
+            is (T x D), and lod is the same with the `input`.
-    Examples:
-        .. code-block:: python
+        Examples:
+            .. code-block:: python
-            hidden_dim = 512
-            forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
+                hidden_dim = 512
-                                           bias_attr=False)
+                forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
-            forward, _ = fluid.layers.dynamic_lstm(
+                                               bias_attr=False)
-                input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
+                forward, _ = fluid.layers.dynamic_lstm(
-    """
+                    input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
-    assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
+        """
-    helper = LayerHelper('lstm', **locals())
+        assert bias_attr is not False, "bias_attr should not be False in dynamic_lstm."
-    size = size // 4
+        helper = LayerHelper('lstm', **locals())
-    weight = helper.create_parameter(
+        size = size // 4
-        attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
+        weight = helper.create_parameter(
-    bias_size = [1, 7 * size]
+            attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
-    if not use_peepholes:
+        bias_size = [1, 7 * size]
-        bias_size[1] = 4 * size
+        if not use_peepholes:
-    bias = helper.create_parameter(
+            bias_size[1] = 4 * size
-        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+        bias = helper.create_parameter(
+            attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
-    hidden = helper.create_variable_for_type_inference(dtype)
+        hidden = helper.create_variable_for_type_inference(dtype)
-    cell = helper.create_variable_for_type_inference(dtype)
+        cell = helper.create_variable_for_type_inference(dtype)
-    batch_gate = helper.create_variable_for_type_inference(dtype)
+        batch_gate = helper.create_variable_for_type_inference(dtype)
-    batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
+        batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
-    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
+        inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
-    batch_size = input.shape[0]
+        batch_size = input.shape[0]
-    if h_0:
+        if h_0:
-        assert h_0.shape == (batch_size, size), \
+            assert h_0.shape == (batch_size, size), \
-            'The shape of h0 should be (batch_size, %d)' % size
+                'The shape of h0 should be (batch_size, %d)' % size
-        inputs['H0'] = h_0
+            inputs['H0'] = h_0
-    if c_0:
+        if c_0:
-        assert c_0.shape == (batch_size, size), \
+            assert c_0.shape == (batch_size, size), \
-            'The shape of c0 should be (batch_size, %d)' % size
+                'The shape of c0 should be (batch_size, %d)' % size
-        inputs['C0'] = c_0
+            inputs['C0'] = c_0
-    helper.append_op(
+        helper.append_op(
-        type='lstm',
+            type='lstm',
-        inputs=inputs,
+            inputs=inputs,
-        outputs={
+            outputs={
-            'Hidden': hidden,
+                'Hidden': hidden,
-            'Cell': cell,
+                'Cell': cell,
-            'BatchGate': batch_gate,
+                'BatchGate': batch_gate,
-            'BatchCellPreAct': batch_cell_pre_act
+                'BatchCellPreAct': batch_cell_pre_act
-        },
+            },
-        attrs={
+            attrs={
-            'use_peepholes': use_peepholes,
+                'use_peepholes': use_peepholes,
-            'is_reverse': is_reverse,
+                'is_reverse': is_reverse,
-            'gate_activation': gate_activation,
+                'gate_activation': gate_activation,
-            'cell_activation': cell_activation,
+                'cell_activation': cell_activation,
-            'candidate_activation': candidate_activation
+                'candidate_activation': candidate_activation
-        })
+            })
-    return hidden, cell
+        return hidden, cell
 def dynamic_lstmp(input,
@@ -923,39 +926,40 @@ def linear_chain_crf(input, label, param_attr=None):
    return log_likelihood
-@templatedoc()
+if os.name != 'nt':
-def crf_decoding(input, param_attr, label=None):
+    @templatedoc()
-    """
+    def crf_decoding(input, param_attr, label=None):
-    ${comment}
+        """
+        ${comment}
-    Args:
+        Args:
-        input(${emission_type}): ${emission_comment}
+            input(${emission_type}): ${emission_comment}
-        param_attr(ParamAttr): The parameter attribute for training.
+            param_attr(ParamAttr): The parameter attribute for training.
-        label(${label_type}): ${label_comment}
+            label(${label_type}): ${label_comment}
-    Returns:
+        Returns:
-        Variable: ${viterbi_path_comment}
+            Variable: ${viterbi_path_comment}
-    Examples:
+        Examples:
-        .. code-block:: python
+            .. code-block:: python
-           crf_decode = layers.crf_decoding(
+               crf_decode = layers.crf_decoding(
-                input=hidden, param_attr=ParamAttr(name="crfw"))
+                    input=hidden, param_attr=ParamAttr(name="crfw"))
-    """
+        """
-    helper = LayerHelper('crf_decoding', **locals())
+        helper = LayerHelper('crf_decoding', **locals())
-    transition = helper.get_parameter(param_attr.name)
+        transition = helper.get_parameter(param_attr.name)
-    viterbi_path = helper.create_variable_for_type_inference(
+        viterbi_path = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype())
+            dtype=helper.input_dtype())
-    helper.append_op(
+        helper.append_op(
-        type='crf_decoding',
+            type='crf_decoding',
-        inputs={"Emission": [input],
+            inputs={"Emission": [input],
-                "Transition": transition,
+                    "Transition": transition,
-                "Label": label},
+                    "Label": label},
-        outputs={"ViterbiPath": [viterbi_path]})
+            outputs={"ViterbiPath": [viterbi_path]})
-    return viterbi_path
+        return viterbi_path
 @templatedoc()
@@ -5443,42 +5447,43 @@ def label_smooth(label,
    return smooth_label
-@templatedoc()
+if os.name != 'nt':
-def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
+    @templatedoc()
-    """
+    def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
-    ${comment}
+        """
+        ${comment}
-    Args:
+        Args:
-        input (Variable): ${x_comment}
+            input (Variable): ${x_comment}
-        rois (Variable): ROIs (Regions of Interest) to pool over.
+            rois (Variable): ROIs (Regions of Interest) to pool over.
-        pooled_height (integer): ${pooled_height_comment} Default: 1
+            pooled_height (integer): ${pooled_height_comment} Default: 1
-        pooled_width (integer): ${pooled_width_comment} Default: 1
+            pooled_width (integer): ${pooled_width_comment} Default: 1
-        spatial_scale (float): ${spatial_scale_comment} Default: 1.0
+            spatial_scale (float): ${spatial_scale_comment} Default: 1.0
-    Returns:
+        Returns:
-        Variable: ${out_comment}.
+            Variable: ${out_comment}.
-    Examples:
+        Examples:
-        .. code-block:: python
+            .. code-block:: python
-            pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0)
+                pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0)
-    """
+        """
-    helper = LayerHelper('roi_pool', **locals())
+        helper = LayerHelper('roi_pool', **locals())
-    dtype = helper.input_dtype()
+        dtype = helper.input_dtype()
-    pool_out = helper.create_variable_for_type_inference(dtype)
+        pool_out = helper.create_variable_for_type_inference(dtype)
-    argmaxes = helper.create_variable_for_type_inference(dtype='int32')
+        argmaxes = helper.create_variable_for_type_inference(dtype='int32')
-    helper.append_op(
+        helper.append_op(
-        type="roi_pool",
+            type="roi_pool",
-        inputs={"X": input,
+            inputs={"X": input,
-                "ROIs": rois},
+                    "ROIs": rois},
-        outputs={"Out": pool_out,
+            outputs={"Out": pool_out,
-                 "Argmax": argmaxes},
+                     "Argmax": argmaxes},
-        attrs={
+            attrs={
-            "pooled_height": pooled_height,
+                "pooled_height": pooled_height,
-            "pooled_width": pooled_width,
+                "pooled_width": pooled_width,
-            "spatial_scale": spatial_scale
+                "spatial_scale": spatial_scale
-        })
+            })
-    return pool_out
+        return pool_out
 @templatedoc()

--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 from __future__ import print_function
+import os
 from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr
 from .. import core
 from ..framework import convert_np_dtype_to_dtype_
@@ -99,27 +100,28 @@ Examples:
    >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3)
 """
-__all__ += ['cumsum']
+if os.name != 'nt':
+    __all__ += ['cumsum']
-_cum_sum_ = generate_layer_fn('cumsum')
+    _cum_sum_ = generate_layer_fn('cumsum')
-def cumsum(x, axis=None, exclusive=None, reverse=None):
+    def cumsum(x, axis=None, exclusive=None, reverse=None):
-    locals_var = locals().keys()
+        locals_var = locals().keys()
-    kwargs = dict()
+        kwargs = dict()
-    for name in locals_var:
+        for name in locals_var:
-        val = locals()[name]
+            val = locals()[name]
-        if val is not None:
+            if val is not None:
-            kwargs[name] = val
+                kwargs[name] = val
-    return _cum_sum_(**kwargs)
+        return _cum_sum_(**kwargs)
-cumsum.__doc__ = _cum_sum_.__doc__ + """
-Examples:
-    >>> data = fluid.layers.data(name="input", shape=[32, 784])
+    cumsum.__doc__ = _cum_sum_.__doc__ + """
-    >>> result = fluid.layers.cumsum(data, axis=0)
+    Examples:
-"""
+        >>> data = fluid.layers.data(name="input", shape=[32, 784])
+        >>> result = fluid.layers.cumsum(data, axis=0)
+    """
 __all__ += ['thresholded_relu']

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -180,7 +180,8 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
        package_data['paddle.libs']+=['libmkldnn.so.0']
        shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
 # remove unused paddle/libs/__init__.py
-os.remove(libs_path+'/__init__.py')
+if os.path.isfile(libs_path+'/__init__.py'):
+    os.remove(libs_path+'/__init__.py')
 package_dir['paddle.libs']=libs_path
 # change rpath of core.so, add $ORIGIN/../libs/ to it.