Commit fd3e32ea authored by S sneaxiy

Merge develop

@@ -69,6 +69,7 @@ option(WITH_ANAKIN "Compile with Anakin library" OFF)
 option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
 option(WITH_INFERENCE "Compile fluid inference library" ON)
+option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF)
 option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
 option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
......
@@ -18,7 +18,7 @@ find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
 if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
   if(WITH_DSO)
     set(TENSORRT_FOUND ON)
-  endif(WITH DSO)
+  endif(WITH_DSO)
 else()
   set(TENSORRT_FOUND OFF)
 endif()
......
# Versioning (Work In Progress)
PaddlePaddle framework follows Semantic Versioning 2.0 (semver).
Each release has a version of the following format: MAJOR.MINOR.PATCH
(e.g. 1.2.0). Some key points:
* A major version number change can introduce backward-incompatible changes. Code that works in the old version does not necessarily work in the new version. In addition, data generated by the previous major version, such as program models and checkpointed parameters, might not work in the new version. We will attempt to build tools to help with release migration.
* A minor version number change always maintains backward compatibility. It normally contains compatible improvements and bug fixes.
* A patch number change is for bug fixes.
* Violations of this policy are considered bugs and should be fixed.
### What is Covered
* All public documented Python APIs, excluding those that live in the contrib namespace.
### What is Not Covered
* If an API's implementation has bugs, we reserve the right to fix the bugs and change the behavior.
* The Python APIs in the contrib namespace.
* The Python functions and classes whose names start with '_'.
* The offline tools.
* The data generated by the framework, such as serialized Program model files and checkpointed variables, which are subject to the different versioning scheme described below.
* C++ Inference APIs. (To be covered)
## Data
Data refers to the artifacts generated by the framework. Here, we specifically mean the model Program file and the checkpointed variables.
* Backward Compatibility: A user sometimes generates Data with PaddlePaddle version 1.1 and expects it to be consumed by PaddlePaddle version 1.2.
This can happen when a new online system wants to serve an old model trained previously.
* Forward Compatibility: A user sometimes generates Data with PaddlePaddle version 1.2 and expects it to be consumed by PaddlePaddle version 1.1.
This can happen when a successful new research model needs to be served by an old online system that is not frequently upgraded.
### Versioning
Data is assigned an integer version number. The version number is increased whenever an incompatible change is introduced.
The PaddlePaddle framework supports an interval of Data versions. Within the same major version (semver), the framework cannot drop support for lower Data versions; hence, a minor version change cannot drop support for a Data version.
For example, PaddlePaddle version 1.1 supports Program versions 3 to 5. Later, the Program version is increased from 5 to 6 due to the addition of an attribute; as a result, PaddlePaddle version 1.1 cannot consume Program version 6, so PaddlePaddle 1.2 should support Program versions 3 to 6. PaddlePaddle can drop support for Program version 3 only at PaddlePaddle version 2.0.
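A minimal sketch of this support-interval rule follows; the table and helper below are illustrative only, not a PaddlePaddle API:

# Hypothetical sketch of the Data-version support interval described above.
SUPPORTED_PROGRAM_VERSIONS = {
    "1.1": (3, 5),  # PaddlePaddle 1.1 supports Program versions 3..5
    "1.2": (3, 6),  # 1.2 keeps 3..5 and adds 6; dropping 3 must wait for 2.0
}

def can_load(framework_version, program_version):
    low, high = SUPPORTED_PROGRAM_VERSIONS[framework_version]
    return low <= program_version <= high

assert can_load("1.2", 6)
assert not can_load("1.1", 6)  # forward compatibility is best-effort only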
### Known Issues
Currently, forward compatibility for new Data versions is best-effort.
@@ -116,6 +116,7 @@ paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size
 paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
 paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
 paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
+paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
 paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
......
@@ -127,6 +127,9 @@ static const char kLocalScopes[] = "local_scopes";
 static const char kStrategy[] = "strategy";
 
 void MultiDevSSAGraphBuilder::Init() const {
+  all_vars_.clear();
+  balance_vars_.clear();
+
   loss_var_name_ = Get<const std::string>(kLossVarName);
   places_ = Get<const std::vector<platform::Place>>(kPlaces);
   local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes);
......
@@ -40,12 +40,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
                  size_t device_id) const;
   void Init() const;
 
- private:
-  mutable std::string loss_var_name_;
-  mutable std::vector<platform::Place> places_;
-  mutable std::vector<Scope *> local_scopes_;
-  mutable std::unordered_set<std::string> grad_names_;
-
 #ifdef PADDLE_WITH_CUDA
   mutable platform::NCCLContextMap *nccl_ctxs_;
 #endif
@@ -95,13 +89,17 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
   size_t GetAppropriateDeviceID(
       const std::vector<std::string> &var_names) const;
 
- private:
+  void SetCommunicationContext(OpHandleBase *op_handle,
+                               const platform::Place &p) const;
+
+ private:
+  mutable std::string loss_var_name_;
+  mutable std::vector<platform::Place> places_;
+  mutable std::vector<Scope *> local_scopes_;
+  mutable std::unordered_set<std::string> grad_names_;
   mutable BuildStrategy strategy_;
   mutable std::unordered_map<std::string, VarDesc *> all_vars_;
   mutable std::vector<int64_t> balance_vars_;
-
-  void SetCommunicationContext(OpHandleBase *op_handle,
-                               const platform::Place &p) const;
 };
 
 }  // namespace details
 }  // namespace framework
......
@@ -233,30 +233,9 @@ ParallelExecutor::ParallelExecutor(
 void ParallelExecutor::BCastParamsToDevices(
     const std::unordered_set<std::string> &vars) const {
-  // the initializing bcast, all vars would be bcast from device(0),
-  // otherwise
-  // bcast from the specified device.
-  bool initializing = member_->executor_ ? false : true;
+  // the initializing bcast, all vars would be bcast from device(0).
   for (auto &var : vars) {
-    int var_dev_id = -1;
-    if (member_->executor_) {
-      auto &sharded_var_device =
-          member_->executor_->Graph().Get<details::ShardedVarDevice>(
-              details::kShardedVarDevice);
-      if (sharded_var_device.find(var) != sharded_var_device.end()) {
-        var_dev_id = sharded_var_device.at(var);
-      }
-    }
-
-    if (!initializing && var_dev_id == -1) continue;
-
-    framework::Variable *main_var = nullptr;
-    if (initializing) {
-      main_var = member_->local_scopes_[0]->FindVar(var);
-    } else {
-      main_var = member_->local_scopes_[var_dev_id]->FindVar(var);
-    }
-
+    framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var);
     if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
       continue;
     }
@@ -272,8 +251,7 @@ void ParallelExecutor::BCastParamsToDevices(
       auto place = member_->places_[i];
       void *buffer;
 
-      if ((initializing && i == 0) ||
-          (!initializing && static_cast<int>(i) == var_dev_id)) {
+      if (i == 0) {
         buffer = const_cast<void *>(main_tensor.data<void>());
       } else {
         auto local_scope = member_->local_scopes_[i];
@@ -290,29 +268,18 @@ void ParallelExecutor::BCastParamsToDevices(
       platform::NCCLGroupGuard guard;
       for (size_t i = 0; i < member_->places_.size(); ++i) {
         auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
-        if (initializing) {
-          platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
-                                       nccl_ctx.comm_, nccl_ctx.stream());
-        } else {
-          if (var_dev_id >= 0) {
-            platform::dynload::ncclBcast(buffers[i], numel, data_type,
-                                         var_dev_id, nccl_ctx.comm_,
-                                         nccl_ctx.stream());
-          }
-        }
+        platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
+                                     nccl_ctx.comm_, nccl_ctx.stream());
       }
       member_->nccl_ctxs_->WaitAll();
     }
 #else
     PADDLE_THROW("Not compiled with CUDA");
 #endif
   } else {
     platform::CPUPlace cpu;
     for (size_t i = 0; i < member_->places_.size(); ++i) {
-      if ((initializing && i == 0) ||
-          (!initializing && static_cast<int>(i) == var_dev_id))
-        continue;
+      if (i == 0) continue;
 
       auto local_scope = member_->local_scopes_[i];
       auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
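The net effect of this simplification is that parameters are now always broadcast from device 0's scope to every other device's scope. A minimal Python sketch of that behavior (the function and scope layout are illustrative, not the PaddlePaddle implementation):

# Illustrative sketch only: each "scope" is a dict mapping variable name
# to an array; device 0 is always the broadcast source after this change.
import copy

def bcast_params_to_devices(local_scopes, var_names):
    main_scope = local_scopes[0]  # device 0 holds the authoritative copy
    for name in var_names:
        if name not in main_scope:
            continue
        for scope in local_scopes[1:]:
            scope[name] = copy.deepcopy(main_scope[name])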
......
@@ -72,9 +72,9 @@ class ParallelExecutor {
   void Run(const std::vector<std::string> &fetch_tensors,
            const std::string &fetched_var_name);
 
- private:
   void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
 
+ private:
   ParallelExecutorPrivate *member_;
 
 #ifdef PADDLE_WITH_CUDA
......
@@ -17,9 +17,7 @@ get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
 # paddle_fluid_origin exclude inference api interface
 cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
 
-#if(APPLE)
 add_subdirectory(api)
-#endif()
 
 # Create static library
 cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor)
@@ -57,5 +55,7 @@ endif()
 if(WITH_TESTING)
   # tests/book depends the models that generated by python/paddle/fluid/tests/book
   add_subdirectory(tests/book)
-  add_subdirectory(tests/api)
+  if(WITH_INFERENCE_API_TEST)
+    add_subdirectory(tests/api)
+  endif()
 endif()
@@ -69,25 +69,4 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
   endfunction()
   anakin_target(inference_anakin_api)
   anakin_target(inference_anakin_api_shared)
-  if (WITH_TESTING)
-    # TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
-    set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
-    set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
-    set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
-    execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
-    execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
-    execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
-    if(WITH_GPU)
-      set(anakin_test_extra_deps dynload_cuda)
-      set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
-      execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
-      cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc
-              ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
-              DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
-    endif()
-    cc_test(api_anakin_engine_rnn_tester SRCS api_anakin_engine_rnn_tester.cc
-            ARGS --model=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
-                 --datapath=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn_data.txt
-            DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
-  endif(WITH_TESTING)
 endif()
-set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
-set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo")
+set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com")
+set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
+    "A path setting inference demo download directories.")
 set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
 
-function (inference_download_and_uncompress install_dir filename)
-  message(STATUS "Download inference test stuff from ${INFERENCE_URL}/${filename}")
+function (inference_download install_dir url filename)
+  message(STATUS "Download inference test stuff from ${url}/${filename}")
   execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
-  execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${INFERENCE_URL}/${filename}")
-  execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
+  execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}")
   message(STATUS "finish downloading ${filename}")
-endfunction(inference_download_and_uncompress)
+endfunction()
+
+function (inference_download_and_uncompress install_dir url filename)
+  inference_download(${install_dir} ${url} ${filename})
+  execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
+endfunction()
 
 function(download_model_and_data install_dir model_name data_name)
-  if (NOT EXISTS ${install_dir} AND WITH_INFERENCE)
-    inference_download_and_uncompress(${install_dir} ${model_name})
-    inference_download_and_uncompress(${install_dir} ${data_name})
+  if (NOT EXISTS ${install_dir})
+    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name})
+    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_name})
   endif()
 endfunction()
 
+function(inference_analysis_api_test target install_dir filename)
+  inference_analysis_test(${target} SRCS ${filename}
+    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+    ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
+endfunction()
+
 # RNN1
-# TODO: fix this test on MACOS
-message(WARNING "These tests has been disabled in OSX before being fixed: \n test_analyzer_rnn1")
 if(NOT APPLE)
   set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
   download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
-  inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc
-    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-    ARGS --infer_model=${RNN1_INSTALL_DIR}/model
-         --infer_data=${RNN1_INSTALL_DIR}/data.txt)
-endif(NOT APPLE)
+  inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc)
+else()
+  # TODO: fix this test on MACOS, the reason is that
+  # fusion_seqexpand_concat_fc_op is not supported on MACOS
+  message(WARNING "These tests has been disabled in OSX before being fixed: \n test_analyzer_rnn1")
+endif()
 
 # RNN2
 set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
 download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
-inference_analysis_test(test_analyzer_rnn2 SRCS analyzer_rnn2_tester.cc
-  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-  ARGS --infer_model=${RNN2_INSTALL_DIR}/model
-       --infer_data=${RNN2_INSTALL_DIR}/data.txt)
+inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)
 
 # chinese_ner
 set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
 download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz")
-inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
-  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-  ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
-       --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
+inference_analysis_api_test(test_analyzer_ner ${CHINESE_NER_INSTALL_DIR} analyzer_ner_tester.cc)
 
 # lac
 set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac")
 download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz")
-inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
-  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-  ARGS --infer_model=${LAC_INSTALL_DIR}/model
-       --infer_data=${LAC_INSTALL_DIR}/data.txt)
+inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc)
 
 # text_classification
 set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
 download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
-inference_analysis_test(test_analyzer_text_classification SRCS analyzer_text_classification_tester.cc
-  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-  ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/model
-       --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt)
+inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc)
 
 # ocr
-set(OCR_MODEL_URL "http://paddlemodels.cdn.bcebos.com/inference-vis-demos%2Focr.tar.gz")
-set(OCR_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ocr")
-if (NOT EXISTS ${OCR_INSTALL_DIR} AND WITH_INFERENCE)
-  get_filename_component(filename ${OCR_MODEL_URL} NAME)
-  message(STATUS "Download inference test stuff ${filename} from ${OCR_MODEL_URL}")
-  execute_process(COMMAND bash -c "mkdir -p ${OCR_INSTALL_DIR}")
-  execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && wget -q ${OCR_MODEL_URL}")
-  execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && tar xzf ${filename}")
-  message(STATUS "finish downloading ${filename}")
+set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
+if (NOT EXISTS ${OCR_INSTALL_DIR})
+  inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
 endif()
-inference_analysis_test(test_analyzer_ocr SRCS analyzer_vis_tester.cc
-  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-  ARGS --infer_model=${OCR_INSTALL_DIR}/model
-       --infer_data=${OCR_INSTALL_DIR}/data.txt)
+inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
+
+# anakin
+if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
+  # anakin rnn1
+  set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
+  set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
+  inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
+  inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
+  cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
+          ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
+               --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
+          DEPS inference_anakin_api_shared SERIAL)
+  # anakin mobilenet
+  if(WITH_GPU)
+    set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
+    inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
+    cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
+            ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
+            DEPS inference_anakin_api_shared dynload_cuda SERIAL)
+  endif()
+endif()
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_expand_as_op.h"
namespace paddle {
namespace operators {
using framework::LoDTensor;
class SequenceExpandAsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SequenceExpandAsOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Y"),
"Input(Y) of SequenceExpandAsOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SequenceExpandAsOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto out_dims = x_dims;
PADDLE_ENFORCE_GE(x_dims.size(), 2,
"Dimension number of Input(X) should be at least 2.");
if (ctx->IsRuntime()) {
framework::Variable* x_var =
boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
framework::Variable* y_var =
boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Y")[0]);
auto& x_dim = x_var->Get<LoDTensor>().dims();
auto& y_lod = y_var->Get<LoDTensor>().lod();
PADDLE_ENFORCE_EQ(y_lod.size(), 1,
"Level number of Input(Y)'s lod should be 1.");
PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dim[0]), y_lod[0].size() - 1,
"The first dimension of Input(X) should be equal "
"to the size of Input(Y)'s 0 level lod.");
int64_t out_first_dim = 0;
if (y_lod[0].size() <= 1) {
out_first_dim = x_dims[0];
} else {
// The output's first dimension is the total number of expanded rows,
// i.e. the sum of the sequence lengths recorded in Y's lod offsets.
for (size_t i = 1; i < y_lod[0].size(); ++i) {
out_first_dim += (y_lod[0][i] - y_lod[0][i - 1]);
}
}
out_dims[0] = out_first_dim;
} else {
out_dims[0] = -1;
}
ctx->SetOutputDim("Out", out_dims);
ctx->ShareLoD("Y", /*->*/ "Out");
}
};
class SequenceExpandAsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor whose lod "
"level is at most 1.");
AddInput("Y",
"(LoDTensor, default LoDTensor<float>) Referred LoDTensor whose "
"lod (specified level) is referred by Input(X).");
AddOutput("Out",
"(LodTensor, default LoDTensor<float>) Output LoDTensor which is "
"generated from Input(X) by referring lod of Input(Y).");
AddComment(R"DOC(
Sequence Expand As Operator.
This operator expands `X` according to the zeroth-level lod of `Y`. The current
implementation requires that the level number of Input(Y)'s lod be 1, and that
the first dimension of Input(X) be equal to the size of Input(Y)'s zeroth-level
lod; the lod of Input(X) is not considered.
Following are cases to better explain how this works:
Case 1:
Given a 1-level LoDTensor input(X)
X.data = [[a], [b], [c], [d]]
X.dims = [4, 1]
and input(Y)
Y.lod = [[0, 3, 6, 7, 8]]
ref_level: 0
then we get 1-level LoDTensor
Out.lod = [[0, 3, 6, 7, 8]]
Out.data = [[a], [a], [a], [b], [b], [b], [c], [d]]
Out.dims = [8, 1]
Case 2:
Given a common Tensor input(X)
X.data = [[a, b], [c, d], [e, f]]
X.dims = [3, 2]
and input(Y)
Y.lod = [[0, 2, 3, 6]]
ref_level: 0
then we get a common LoDTensor
Out.lod = [[0, 2, 3, 6]]
Out.data = [[a, b], [a, b], [c, d], [e, f], [e, f], [e, f]]
Out.dims = [6, 2]
)DOC");
}
};
class SequenceExpandAsOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto x_grad_name = framework::GradVarName("X");
if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims);
ctx->ShareLoD("X", x_grad_name);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(sequence_expand_as, ops::SequenceExpandAsOp,
ops::SequenceExpandAsOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(sequence_expand_as_grad, ops::SequenceExpandAsOpGrad);
REGISTER_OP_CPU_KERNEL(
sequence_expand_as,
ops::SequenceExpandAsKernel<paddle::platform::CPUDeviceContext, float>,
ops::SequenceExpandAsKernel<paddle::platform::CPUDeviceContext, double>,
ops::SequenceExpandAsKernel<paddle::platform::CPUDeviceContext, int>,
ops::SequenceExpandAsKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
sequence_expand_as_grad,
ops::SequenceExpandAsGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::SequenceExpandAsGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::SequenceExpandAsGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::SequenceExpandAsGradKernel<paddle::platform::CPUDeviceContext,
int64_t>);
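The expansion rule in the DOC above can be reproduced in a few lines of NumPy. This hedged sketch (the function name and offset-style lod convention are illustrative, not part of the operator API) shows the forward computation: row i of X is repeated lod[i+1] - lod[i] times.

# Illustrative NumPy sketch of the forward pass, using offset-style lod.
import numpy as np

def sequence_expand_as_ref(x, y_lod0):
    # y_lod0 is an offset-style lod such as [0, 3, 6, 7, 8].
    repeats = [y_lod0[i + 1] - y_lod0[i] for i in range(len(y_lod0) - 1)]
    return np.repeat(x, repeats, axis=0)

x = np.array([[1.0], [2.0], [3.0], [4.0]])  # dims [4, 1], as in Case 1
out = sequence_expand_as_ref(x, [0, 3, 6, 7, 8])
print(out.ravel())  # [1. 1. 1. 2. 2. 2. 3. 4.] -> dims [8, 1]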
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/operators/sequence_expand_as_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
template <typename T>
static __global__ void sequence_expand_as_kernel(const T *in_data,
const size_t *expand_offset,
const size_t src_height,
const size_t src_width,
T *out_data) {
for (int h_id = blockIdx.x; h_id < src_height; h_id += gridDim.x) {
int span = expand_offset[h_id + 1] - expand_offset[h_id];
if (span == 0) continue;
const T *src = in_data + h_id * src_width;
for (int w_id = threadIdx.x; w_id < src_width; w_id += blockDim.x) {
T ele = src[w_id];
int offset = expand_offset[h_id] * src_width;
// Write this element into each of the span expanded copies of row h_id.
for (int k = 0; k < span; ++k) {
out_data[offset + k * src_width + w_id] = ele;
}
}
}
}
template <typename T>
static __global__ void sequence_expand_as_grad_kernel(
const T *dout_data, const size_t *expand_offset, const size_t dst_height,
const size_t dst_width, T *dx_data) {
for (int h_id = blockIdx.x; h_id < dst_height; h_id += gridDim.x) {
T *dst = dx_data + h_id * dst_width;
int span = expand_offset[h_id + 1] - expand_offset[h_id];
// Each row of Grad(X) is the sum of the Grad(Out) rows it was expanded to.
for (int w_id = threadIdx.x; w_id < dst_width; w_id += blockDim.x) {
T result = 0;
for (int k = 0; k < span; ++k) {
int offset = (expand_offset[h_id] + k) * dst_width;
const T *src = dout_data + offset;
result += src[w_id];
}
dst[w_id] = result;
}
}
}
template <typename T>
struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
void operator()(
const platform::CUDADeviceContext &context, const LoDTensor &x,
const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
LoDTensor *out) {
int height = x.dims()[0];
int width = framework::product(x.dims()) / height;
const int kThreadsPerBlock = 1024;
int thread_x = kThreadsPerBlock;
if (width < kThreadsPerBlock) { // block_cols is aligned by 32.
thread_x = ((width + 31) >> 5) << 5;
}
int max_threads = context.GetMaxPhysicalThreadCount();
int block_x = std::max(max_threads / thread_x, 1);
dim3 block_size(thread_x);
dim3 grid_size(block_x);
sequence_expand_as_kernel<<<grid_size, block_size, 0, context.stream()>>>(
x.data<T>(), ref_lod.CUDAData(context.GetPlace()), height, width,
out->mutable_data<T>(context.GetPlace()));
}
};
template <typename T>
struct SequenceExpandAsGradFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext &context,
const LoDTensor &dout,
const framework::Vector<size_t> &ref_lod, /*expand based lod*/
LoDTensor *dx) {
int height = dx->dims()[0];
int width = framework::product(dx->dims()) / height;
const int kThreadsPerBlock = 1024;
int thread_x = kThreadsPerBlock;
if (width < kThreadsPerBlock) { // block_cols is aligned by 32.
thread_x = ((width + 31) >> 5) << 5;
}
int max_threads = context.GetMaxPhysicalThreadCount();
int block_x = std::max(max_threads / thread_x, 1);
dim3 block_size(thread_x);
dim3 grid_size(block_x);
sequence_expand_as_grad_kernel<<<grid_size, block_size, 0,
context.stream()>>>(
dout.data<T>(), ref_lod.CUDAData(context.GetPlace()), height, width,
dx->mutable_data<T>(context.GetPlace()));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
sequence_expand_as,
ops::SequenceExpandAsKernel<paddle::platform::CUDADeviceContext, float>,
ops::SequenceExpandAsKernel<paddle::platform::CUDADeviceContext, double>,
ops::SequenceExpandAsKernel<paddle::platform::CUDADeviceContext, int>,
ops::SequenceExpandAsKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
sequence_expand_as_grad,
ops::SequenceExpandAsGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::SequenceExpandAsGradKernel<paddle::platform::CUDADeviceContext,
double>,
ops::SequenceExpandAsGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::SequenceExpandAsGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <numeric> // std::iota
#include <sstream>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
struct SequenceExpandFunctor {
void operator()(
const DeviceContext &ctx, const framework::LoDTensor &x,
const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
framework::LoDTensor *out);
};
template <typename DeviceContext, typename T>
struct SequenceExpandAsGradFunctor {
void operator()(
const DeviceContext &ctx, const framework::LoDTensor &dout,
const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
framework::LoDTensor *dx);
};
template <typename T>
struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
void operator()(
const platform::CPUDeviceContext &context, const framework::LoDTensor &x,
const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
framework::LoDTensor *out) {
int64_t height = x.dims()[0];
int64_t width = framework::product(x.dims()) / height;
const T *in_data = x.data<T>();
T *out_data = out->mutable_data<T>(context.GetPlace());
for (int h_id = 0; h_id < height; ++h_id) {
size_t span = ref_lod[h_id + 1] - ref_lod[h_id];
if (span == 0) continue;
const T *src = in_data + h_id * width;
for (int64_t w_id = 0; w_id < width; ++w_id) {
T ele = src[w_id];
size_t offset = ref_lod[h_id] * width;
for (size_t k = 0; k < span; ++k) {
out_data[offset + k * width + w_id] = ele;
}
}
}
}
};
template <typename DeviceContext, typename T>
class SequenceExpandAsKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *x = context.Input<framework::LoDTensor>("X");
auto *y = context.Input<framework::LoDTensor>("Y");
auto *out = context.Output<framework::LoDTensor>("Out");
auto &y_lod = y->lod();
PADDLE_ENFORCE_EQ(y_lod.size(), 1, "Level number of Input(Y)'s lod should be 1.");
PADDLE_ENFORCE_GT(y_lod[0].size(), 1,
"The size of Input(Y)'s lod[0] should be greater than 1.");
out->mutable_data<T>(context.GetPlace());
auto &dev_ctx = context.template device_context<DeviceContext>();
SequenceExpandFunctor<DeviceContext, T> seq_expand_functor;
seq_expand_functor(dev_ctx, *x, y_lod[0], out);
}
};
/*
*Given Grad(Out)
*
* Grad(Out).lod = [[0, 3, 6]]
* Grad(Out).data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
* Then
* Grad(X).data = [(0.1 + 0.2 + 0.3), (0.4 + 0.5 + 0.6)]
* = [0.6, 1.5]
* Grad(X).lod = Input(X).lod
*
* */
template <typename T>
struct SequenceExpandAsGradFunctor<platform::CPUDeviceContext, T> {
void operator()(
const platform::CPUDeviceContext &context,
const framework::LoDTensor &dout,
const framework::Vector<size_t> &ref_lod, /*expand referenced lod*/
framework::LoDTensor *dx) {
int64_t height = dx->dims()[0];
int64_t width = framework::product(dx->dims()) / height;
const T *dout_data = dout.data<T>();
T *dx_data = dx->mutable_data<T>(context.GetPlace());
for (int64_t h_id = 0; h_id < height; ++h_id) {
T *dst = dx_data + h_id * width;
size_t span = ref_lod[h_id + 1] - ref_lod[h_id];
for (int64_t w_id = 0; w_id < width; ++w_id) {
T result = 0;
for (size_t k = 0; k < span; ++k) {
size_t offset = (ref_lod[h_id] + k) * width;
result += dout_data[offset + w_id];
}
dst[w_id] = result;
}
}
}
};
template <typename DeviceContext, typename T>
class SequenceExpandAsGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *g_out =
context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
auto *y = context.Input<framework::LoDTensor>("Y");
auto *g_x =
context.Output<framework::LoDTensor>(framework::GradVarName("X"));
g_x->mutable_data<T>(context.GetPlace());
SequenceExpandAsGradFunctor<DeviceContext, T> functor;
functor(context.template device_context<DeviceContext>(), *g_out,
y->lod()[0], g_x);
}
};
} // namespace operators
} // namespace paddle
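The gradient example in the comment above (Grad(X) = [0.6, 1.5]) can likewise be checked with a short NumPy sketch; the helper name and offset-style lod convention here are illustrative, not part of the operator:

# Illustrative NumPy sketch of the backward pass: Grad(X)[i] is the
# segment sum of the Grad(Out) rows that row i was expanded into.
import numpy as np

def sequence_expand_as_grad_ref(dout, ref_lod0):
    n = len(ref_lod0) - 1
    dx = np.zeros((n,) + dout.shape[1:], dtype=dout.dtype)
    for i in range(n):
        dx[i] = dout[ref_lod0[i]:ref_lod0[i + 1]].sum(axis=0)
    return dx

dout = np.array([[0.1], [0.2], [0.3], [0.4], [0.5], [0.6]])
print(sequence_expand_as_grad_ref(dout, [0, 3, 6]).ravel())  # [0.6 1.5]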
@@ -135,6 +135,8 @@ function cmake_gen() {
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
         -DWITH_INFERENCE=${WITH_INFERENCE:-ON}
+        -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
+        -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo}
         -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
         -DPY_VERSION=${PY_VERSION:-2.7}
     ========================================
@@ -165,6 +167,8 @@ EOF
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
         -DWITH_INFERENCE=${WITH_INFERENCE:-ON} \
+        -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
+        -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo} \
         -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
         -DPY_VERSION=${PY_VERSION:-2.7}
 }
......
@@ -54,6 +54,7 @@ __all__ = [
     'conv2d_transpose',
     'conv3d_transpose',
     'sequence_expand',
+    'sequence_expand_as',
     'sequence_pad',
     'lstm_unit',
     'reduce_sum',
@@ -2666,6 +2667,71 @@ def sequence_expand(x, y, ref_level=-1, name=None):
     return tmp
def sequence_expand_as(x, y, name=None):
"""Sequence Expand As Layer. This layer will expand the input variable **x**
according to the zeroth-level lod of **y**. The current implementation
requires that the level number of Input(Y)'s lod be 1, and that the first
dimension of Input(X) be equal to the size of Input(Y)'s zeroth-level lod;
the lod of Input(X) is not considered.
The following examples explain how sequence_expand_as works:
.. code-block:: text
* Case 1:
Given a 1-level LoDTensor input(X)
X.data = [[a], [b], [c], [d]]
X.dims = [4, 1]
and input(Y)
Y.lod = [[0, 3, 6, 7, 8]]
ref_level: 0
then we get 1-level LoDTensor
Out.lod = [[0, 3, 6, 7, 8]]
Out.data = [[a], [a], [a], [b], [b], [b], [c], [d]]
Out.dims = [8, 1]
* Case 2:
Given a common Tensor input(X)
X.data = [[a, b], [c, d], [e, f]]
X.dims = [3, 2]
and input(Y)
Y.lod = [[0, 2, 3, 6]]
ref_level: 0
then we get a common LoDTensor
Out.lod = [[0, 2, 3, 6]]
Out.data = [[a, b], [a, b], [c, d], [e, f], [e, f], [e, f]]
Out.dims = [6, 2]
Args:
x (Variable): The input variable which is a Tensor or LoDTensor.
y (Variable): The input variable which is a LoDTensor.
name(str|None): A name for this layer (optional). If set to None, the layer
will be named automatically.
Returns:
Variable: The expanded variable which is a LoDTensor.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[10], dtype='float32')
y = fluid.layers.data(name='y', shape=[10, 20],
dtype='float32', lod_level=1)
out = fluid.layers.sequence_expand_as(x=x, y=y)
"""
helper = LayerHelper('sequence_expand_as', input=x, **locals())
dtype = helper.input_dtype()
tmp = helper.create_tmp_variable(dtype)
helper.append_op(
type='sequence_expand_as',
inputs={'X': x,
'Y': y},
outputs={'Out': tmp})
return tmp
@templatedoc()
def sequence_pad(x, pad_value, maxlen=None):
"""
......
@@ -29,8 +29,8 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl
 list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test
 list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test
 
-message(WARNING "These tests has been disabled in OSX before being fixed: \n test_detection_map_op \n test_desc_clone \n test_debugger \n test_program_code \n test_dist_transformer \n test_dist_se_resnext")
 if(APPLE)
+    message(WARNING "These tests has been disabled in OSX before being fixed: \n test_detection_map_op \n test_desc_clone \n test_debugger \n test_program_code \n test_dist_transformer \n test_dist_se_resnext")
     # this op is not support on mac
     list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
     # TODO: add the unitest back when it fixed
......
@@ -61,9 +61,7 @@ class TestDistTransformer2x2Sync(TestDistBase):
     def test_transformer(self):
         download_files()
-        #Note: loss on test dataset of the first 5 batch are:
-        # 10.518872, 10.518871, 10.518868, 10.518862, 10.518855
-        self.check_with_place("dist_transformer.py", delta=1e-7)
+        self.check_with_place("dist_transformer.py", delta=1e-5)
 
 class TestDistTransformer2x2Async(TestDistBase):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
class TestSequenceExpandAs(OpTest):
def setUp(self):
self.op_type = 'sequence_expand_as'
self.set_data()
self.compute()
def set_data(self):
x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
y_lod = [[1, 3, 4]]
self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
def compute(self):
x = self.inputs['X']
x_data, x_lod = x if type(x) == tuple else (x, None)
y_data, y_lod = self.inputs['Y']
assert len(y_lod) == 1 and len(y_lod[0]) == x_data.shape[0]
repeats = []
for i in range(len(y_lod[0])):
repeat_num = y_lod[0][i]
if repeat_num == 0:
continue
repeats.extend([i for _ in range(repeat_num)])
out_data = x_data[repeats]
self.outputs = {'Out': (out_data, y_lod)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestSequenceExpandAsCase1(TestSequenceExpandAs):
def set_data(self):
x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
x_lod = [[2, 3]]
y_data = np.random.uniform(0.1, 1, [10, 1]).astype('float32')
y_lod = [[2, 2, 0, 3, 3]]
self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
class TestSequenceExpandAsCase2(TestSequenceExpandAs):
def set_data(self):
x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32')
x_lod = [[1]]
y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32')
y_lod = [[2]]
self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
if __name__ == '__main__':
unittest.main()
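Note that these tests describe lod in length style (e.g. y_lod = [[1, 3, 4]] means sequences of lengths 1, 3 and 4), while the C++ operator documentation above uses offset-style lod (e.g. [[0, 1, 4, 8]]). A small sketch of the conversion between the two conventions (the helper name is illustrative):

# Convert a length-style lod level to the offset style used in the C++ docs.
import itertools

def lengths_to_offsets(lengths):
    return [0] + list(itertools.accumulate(lengths))

print(lengths_to_offsets([1, 3, 4]))  # [0, 1, 4, 8]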