Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
3eef149e
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
3eef149e
编写于
10月 15, 2018
作者:
R
Ray Liu
提交者:
GitHub
10月 15, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1084 from codeWorm2015/opencl
Opencl
上级
761cc7f1
2fa2871d
变更
21
显示空白变更内容
内联
并排
Showing
21 changed file
with
167 addition
and
281 deletion
+167
-281
CMakeLists.txt
CMakeLists.txt
+1
-1
src/framework/cl/cl_engine.h
src/framework/cl/cl_engine.h
+1
-1
src/framework/cl/cl_half.cpp
src/framework/cl/cl_half.cpp
+14
-2
src/framework/cl/cl_half.h
src/framework/cl/cl_half.h
+5
-0
src/framework/cl/cl_helper.h
src/framework/cl/cl_helper.h
+7
-1
src/framework/cl/cl_image.h
src/framework/cl/cl_image.h
+33
-24
src/framework/cl/cl_scope.h
src/framework/cl/cl_scope.h
+6
-2
src/framework/executor.cpp
src/framework/executor.cpp
+20
-8
src/framework/operator.cpp
src/framework/operator.cpp
+5
-1
src/operators/kernel/cl/cl_kernel/conv_kernel.cl
src/operators/kernel/cl/cl_kernel/conv_kernel.cl
+3
-1
src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+11
-4
src/operators/kernel/cl/conv_add_kernel.cpp
src/operators/kernel/cl/conv_add_kernel.cpp
+3
-0
src/operators/kernel/cl/conv_kernel.cpp
src/operators/kernel/cl/conv_kernel.cpp
+38
-3
src/operators/kernel/cl/depthwise_conv_kernel.cpp
src/operators/kernel/cl/depthwise_conv_kernel.cpp
+1
-0
src/operators/op_param.h
src/operators/op_param.h
+1
-0
test/net/test_mobilenet_GPU.cpp
test/net/test_mobilenet_GPU.cpp
+18
-17
tools/web-exporter/CMakeLists.txt
tools/web-exporter/CMakeLists.txt
+0
-20
tools/web-exporter/export-nodejs.cpp
tools/web-exporter/export-nodejs.cpp
+0
-49
tools/web-exporter/export-scope.cpp
tools/web-exporter/export-scope.cpp
+0
-34
tools/web-exporter/export.cpp
tools/web-exporter/export.cpp
+0
-52
tools/web-exporter/export.h
tools/web-exporter/export.h
+0
-61
未找到文件。
CMakeLists.txt
浏览文件 @
3eef149e
...
@@ -4,7 +4,7 @@ option(USE_OPENMP "openmp support" OFF)
...
@@ -4,7 +4,7 @@ option(USE_OPENMP "openmp support" OFF)
project
(
paddle-mobile
)
project
(
paddle-mobile
)
option
(
DEBUGING
"enable debug mode"
ON
)
option
(
DEBUGING
"enable debug mode"
ON
)
option
(
USE_EXCEPTION
"use std exception"
O
FF
)
option
(
USE_EXCEPTION
"use std exception"
O
N
)
option
(
LOG_PROFILE
"log profile"
OFF
)
option
(
LOG_PROFILE
"log profile"
OFF
)
# select the platform to build
# select the platform to build
option
(
CPU
"armv7 with neon"
OFF
)
option
(
CPU
"armv7 with neon"
OFF
)
...
...
src/framework/cl/cl_engine.h
浏览文件 @
3eef149e
...
@@ -52,7 +52,7 @@ class CLEngine {
...
@@ -52,7 +52,7 @@ class CLEngine {
cl_context
context
,
std
::
string
file_name
)
{
cl_context
context
,
std
::
string
file_name
)
{
FILE
*
file
=
fopen
(
file_name
.
c_str
(),
"rb"
);
FILE
*
file
=
fopen
(
file_name
.
c_str
(),
"rb"
);
PADDLE_MOBILE_ENFORCE
(
file
!=
nullptr
,
"can't open file: %s "
,
PADDLE_MOBILE_ENFORCE
(
file
!=
nullptr
,
"can't open file: %s "
,
filename
.
c_str
());
file
_
name
.
c_str
());
fseek
(
file
,
0
,
SEEK_END
);
fseek
(
file
,
0
,
SEEK_END
);
int64_t
size
=
ftell
(
file
);
int64_t
size
=
ftell
(
file
);
PADDLE_MOBILE_ENFORCE
(
size
>
0
,
"size is too small"
);
PADDLE_MOBILE_ENFORCE
(
size
>
0
,
"size is too small"
);
...
...
src/framework/cl/cl_half.cpp
浏览文件 @
3eef149e
...
@@ -488,7 +488,7 @@ static const uint8_t shifttable[512] = {
...
@@ -488,7 +488,7 @@ static const uint8_t shifttable[512] = {
0x18
,
0x18
,
0x18
,
0x18
,
0x18
,
0x18
,
0x18
,
0x0d
};
0x18
,
0x18
,
0x18
,
0x18
,
0x18
,
0x18
,
0x18
,
0x0d
};
half_t
float2half
(
float
f
)
{
half_t
float2half
(
float
f
)
{
uint32_t
v
=
*
reinterpret_cast
<
uint32_t
*>
(
&
f
);
uint32_t
v
=
*
reinterpret_cast
<
uint32_t
*>
(
&
f
);
return
basetable
[(
v
>>
23
)
&
0x1ff
]
+
return
basetable
[(
v
>>
23
)
&
0x1ff
]
+
((
v
&
0x007fffff
)
>>
shifttable
[(
v
>>
23
)
&
0x1ff
]);
((
v
&
0x007fffff
)
>>
shifttable
[(
v
>>
23
)
&
0x1ff
]);
}
}
...
@@ -496,5 +496,17 @@ half_t float2half(float f) {
...
@@ -496,5 +496,17 @@ half_t float2half(float f) {
float
half2float
(
half_t
h
)
{
float
half2float
(
half_t
h
)
{
uint32_t
v
=
mantissatable
[
offsettable
[
h
>>
10
]
+
(
h
&
0x3ff
)]
+
uint32_t
v
=
mantissatable
[
offsettable
[
h
>>
10
]
+
(
h
&
0x3ff
)]
+
exponenttable
[
h
>>
10
];
exponenttable
[
h
>>
10
];
return
*
reinterpret_cast
<
float
*>
(
&
v
);
return
*
reinterpret_cast
<
float
*>
(
&
v
);
}
void
FloatArray2HalfArray
(
float
*
f_array
,
half_t
*
h_array
,
int
count
)
{
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
h_array
[
i
]
=
float2half
(
f_array
[
i
]);
}
}
void
HalfArray2FloatArray
(
half_t
*
h_array
,
float
*
f_array
,
int
count
)
{
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
f_array
[
i
]
=
float2half
(
h_array
[
i
]);
}
}
}
src/framework/cl/cl_half.h
浏览文件 @
3eef149e
...
@@ -18,4 +18,9 @@ limitations under the License. */
...
@@ -18,4 +18,9 @@ limitations under the License. */
typedef
uint16_t
half_t
;
typedef
uint16_t
half_t
;
half_t
float2half
(
float
f
);
half_t
float2half
(
float
f
);
float
half2float
(
half_t
h
);
float
half2float
(
half_t
h
);
void
FloatArray2HalfArray
(
float
*
f_array
,
half_t
*
h_array
,
int
count
);
void
HalfArray2FloatArray
(
half_t
*
h_array
,
float
*
f_array
,
int
count
);
src/framework/cl/cl_helper.h
浏览文件 @
3eef149e
...
@@ -18,6 +18,7 @@ limitations under the License. */
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <type_traits>
#include <type_traits>
#include <vector>
#include <vector>
#include "common/log.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_image.h"
#include "framework/cl/cl_image.h"
#include "framework/cl/cl_scope.h"
#include "framework/cl/cl_scope.h"
...
@@ -32,11 +33,16 @@ class CLHelper {
...
@@ -32,11 +33,16 @@ class CLHelper {
explicit
CLHelper
(
CLScope
*
scope
)
:
scope_
(
scope
)
{}
explicit
CLHelper
(
CLScope
*
scope
)
:
scope_
(
scope
)
{}
void
AddKernel
(
const
std
::
string
&
kernel_name
,
const
std
::
string
&
file_name
)
{
void
AddKernel
(
const
std
::
string
&
kernel_name
,
const
std
::
string
&
file_name
)
{
DLOG
<<
" begin add kernel "
;
auto
kernel
=
scope_
->
GetKernel
(
kernel_name
,
file_name
);
auto
kernel
=
scope_
->
GetKernel
(
kernel_name
,
file_name
);
DLOG
<<
" add kernel ing "
;
kernels
.
emplace_back
(
std
::
move
(
kernel
));
kernels
.
emplace_back
(
std
::
move
(
kernel
));
}
}
cl_kernel
KernelAt
(
const
int
index
)
{
return
kernels
[
index
].
get
();
}
cl_kernel
KernelAt
(
const
int
index
)
{
DLOG
<<
" kernel count: "
<<
kernels
.
size
();
return
kernels
[
index
].
get
();
}
cl_command_queue
CLCommandQueue
()
{
return
scope_
->
CommandQueue
();
}
cl_command_queue
CLCommandQueue
()
{
return
scope_
->
CommandQueue
();
}
...
...
src/framework/cl/cl_image.h
浏览文件 @
3eef149e
...
@@ -17,7 +17,9 @@ limitations under the License. */
...
@@ -17,7 +17,9 @@ limitations under the License. */
#include <vector>
#include <vector>
#include "CL/cl.h"
#include "CL/cl.h"
#include "framework/cl/cl_half.h"
#include "framework/cl/cl_half.h"
#include "framework/cl/cl_tool.h"
#include "framework/ddim.h"
#include "framework/ddim.h"
#include "framework/tensor.h"
#include "framework/tensor.h"
...
@@ -59,6 +61,7 @@ class CLImage {
...
@@ -59,6 +61,7 @@ class CLImage {
PADDLE_MOBILE_THROW_EXCEPTION
(
PADDLE_MOBILE_THROW_EXCEPTION
(
" empty image tensor data shouldn't have value"
);
" empty image tensor data shouldn't have value"
);
}
}
DLOG
<<
" init empty image "
;
InitCLImage
(
context
,
nullptr
,
dim
);
InitCLImage
(
context
,
nullptr
,
dim
);
initialized_
=
true
;
initialized_
=
true
;
}
}
...
@@ -98,7 +101,8 @@ class CLImage {
...
@@ -98,7 +101,8 @@ class CLImage {
T
*
data
()
const
{
T
*
data
()
const
{
if
(
initialized_
)
{
if
(
initialized_
)
{
PADDLE_MOBILE_THROW_EXCEPTION
(
PADDLE_MOBILE_THROW_EXCEPTION
(
" cl image has initialized, tensor data has been deleted "
);
" cl image has initialized, tensor data has been deleted, can't use "
"tensor data"
);
}
}
return
reinterpret_cast
<
T
*>
(
tensor_data_
);
return
reinterpret_cast
<
T
*>
(
tensor_data_
);
}
}
...
@@ -115,6 +119,7 @@ class CLImage {
...
@@ -115,6 +119,7 @@ class CLImage {
private:
private:
void
InitCLImage
(
cl_context
context
,
float
*
tensor_data
,
const
DDim
&
dim
)
{
void
InitCLImage
(
cl_context
context
,
float
*
tensor_data
,
const
DDim
&
dim
)
{
DLOG
<<
" tensor dim: "
<<
dim
;
cl_image_format
cf
=
{.
image_channel_order
=
CL_RGBA
,
cl_image_format
cf
=
{.
image_channel_order
=
CL_RGBA
,
.
image_channel_data_type
=
CL_HALF_FLOAT
};
.
image_channel_data_type
=
CL_HALF_FLOAT
};
// NCHW -> [W * (C+3)/4, H * N]
// NCHW -> [W * (C+3)/4, H * N]
...
@@ -132,28 +137,22 @@ class CLImage {
...
@@ -132,28 +137,22 @@ class CLImage {
tensor_data_
[
i
]
=
0
;
tensor_data_
[
i
]
=
0
;
}
}
}
}
size_t
N
,
C
,
H
,
W
;
if
(
tensor_dims_
.
size
()
==
4
)
{
size_t
new_dims
[]
=
{
1
,
1
,
1
,
1
};
N
=
tensor_dims_
[
0
];
if
(
N
<
0
)
{
for
(
int
j
=
0
;
j
<
dim
.
size
();
++
j
)
{
N
=
1
;
new_dims
[
4
-
dim
.
size
()
+
j
]
=
dim
[
j
]
;
}
}
C
=
tensor_dims_
[
1
];
H
=
tensor_dims_
[
2
];
W
=
tensor_dims_
[
3
];
width_of_one_block_
=
W
;
size_t
N
,
C
,
H
,
W
;
height_of_one_block_
=
H
;
}
else
if
(
tensor_dims_
.
size
()
==
1
)
{
N
=
new_dims
[
0
];
N
=
1
;
C
=
new_dims
[
1
];
C
=
tensor_dims_
[
0
];
H
=
new_dims
[
2
];
H
=
1
;
W
=
new_dims
[
3
];
W
=
1
;
width_of_one_block_
=
W
;
width_of_one_block_
=
W
;
height_of_one_block_
=
H
;
height_of_one_block_
=
H
;
}
size_t
width
=
W
*
((
C
+
3
)
/
4
);
size_t
width
=
W
*
((
C
+
3
)
/
4
);
size_t
height
=
H
*
N
;
size_t
height
=
H
*
N
;
...
@@ -193,9 +192,12 @@ class CLImage {
...
@@ -193,9 +192,12 @@ class CLImage {
}
}
}
}
cl_int
err
;
cl_int
err
;
DLOG
<<
" image width: "
<<
width
;
DLOG
<<
" image height: "
<<
height
;
cl_image_
=
clCreateImage2D
(
cl_image_
=
clCreateImage2D
(
context
,
// cl_context context
context
,
// cl_context context
CL_MEM_READ_WRITE
|
CL_MEM_COPY_HOST_PTR
,
// cl_mem_flags flags
CL_MEM_READ_WRITE
|
(
imageData
?
CL_MEM_COPY_HOST_PTR
:
0
),
// cl_mem_flags flags
&
cf
,
// const cl_image_format *image_format
&
cf
,
// const cl_image_format *image_format
width
,
// size_t image_width
width
,
// size_t image_width
height
,
// size_t image_height
height
,
// size_t image_height
...
@@ -205,6 +207,7 @@ class CLImage {
...
@@ -205,6 +207,7 @@ class CLImage {
if
(
err
!=
CL_SUCCESS
)
{
if
(
err
!=
CL_SUCCESS
)
{
// TODO(HaiPeng): error handling
// TODO(HaiPeng): error handling
CL_CHECK_ERRORS
(
err
);
PADDLE_MOBILE_THROW_EXCEPTION
(
" create image 2d error "
);
PADDLE_MOBILE_THROW_EXCEPTION
(
" create image 2d error "
);
}
}
}
}
...
@@ -222,9 +225,15 @@ class CLImage {
...
@@ -222,9 +225,15 @@ class CLImage {
cl_context
context_
;
cl_context
context_
;
};
};
void
TensorToCLImage
(
Tensor
*
tensor
,
CLImage
*
image
,
cl_command_queue
commandQueue
);
void
TensorToCLImage
(
Tensor
*
tensor
,
CLImage
*
image
,
cl_command_queue
commandQueue
);
void
CLImageToTensor
(
CLImage
*
image
,
Tensor
*
tensor
,
cl_command_queue
commandQueue
);
void
CLImageToTensor
(
CLImage
*
image
,
Tensor
*
tensor
,
cl_command_queue
commandQueue
);
#ifdef PADDLE_MOBILE_DEBUG
Print
&
operator
<<
(
Print
&
printer
,
const
CLImage
&
image
);
#endif
}
// namespace framework
}
// namespace framework
}
// namespace paddle_mobile
}
// namespace paddle_mobile
src/framework/cl/cl_scope.h
浏览文件 @
3eef149e
...
@@ -40,8 +40,11 @@ class CLScope {
...
@@ -40,8 +40,11 @@ class CLScope {
std
::
unique_ptr
<
_cl_kernel
,
CLKernelDeleter
>
GetKernel
(
std
::
unique_ptr
<
_cl_kernel
,
CLKernelDeleter
>
GetKernel
(
const
std
::
string
&
kernel_name
,
const
std
::
string
&
file_name
)
{
const
std
::
string
&
kernel_name
,
const
std
::
string
&
file_name
)
{
auto
program
=
Program
(
file_name
);
auto
program
=
Program
(
file_name
);
DLOG
<<
" get program ~ "
;
std
::
unique_ptr
<
_cl_kernel
,
CLKernelDeleter
>
kernel
(
std
::
unique_ptr
<
_cl_kernel
,
CLKernelDeleter
>
kernel
(
clCreateKernel
(
program
,
kernel_name
.
c_str
(),
NULL
));
clCreateKernel
(
program
,
kernel_name
.
c_str
(),
&
status_
));
CL_CHECK_ERRORS
(
status_
);
DLOG
<<
" create kernel ~ "
;
return
std
::
move
(
kernel
);
return
std
::
move
(
kernel
);
}
}
...
@@ -58,11 +61,12 @@ class CLScope {
...
@@ -58,11 +61,12 @@ class CLScope {
status_
=
status_
=
clBuildProgram
(
program
.
get
(),
0
,
0
,
"-cl-fast-relaxed-math"
,
0
,
0
);
clBuildProgram
(
program
.
get
(),
0
,
0
,
"-cl-fast-relaxed-math"
,
0
,
0
);
CL_CHECK_ERRORS
(
status_
);
CL_CHECK_ERRORS
(
status_
);
programs_
[
file_name
]
=
std
::
move
(
program
);
programs_
[
file_name
]
=
std
::
move
(
program
);
return
program
.
get
();
return
program
s_
[
file_name
]
.
get
();
}
}
private:
private:
...
...
src/framework/executor.cpp
浏览文件 @
3eef149e
...
@@ -37,6 +37,8 @@ limitations under the License. */
...
@@ -37,6 +37,8 @@ limitations under the License. */
#include "framework/cl/cl_image.h"
#include "framework/cl/cl_image.h"
#endif
#endif
int
debug_to
=
2
;
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
framework
{
namespace
framework
{
...
@@ -85,7 +87,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
...
@@ -85,7 +87,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
for
(
int
i
=
0
;
i
<
blocks
.
size
();
++
i
)
{
for
(
int
i
=
0
;
i
<
blocks
.
size
();
++
i
)
{
std
::
shared_ptr
<
framework
::
BlockDesc
>
block_desc
=
blocks
[
i
];
std
::
shared_ptr
<
framework
::
BlockDesc
>
block_desc
=
blocks
[
i
];
std
::
vector
<
std
::
shared_ptr
<
framework
::
OpDesc
>>
ops
=
block_desc
->
Ops
();
std
::
vector
<
std
::
shared_ptr
<
framework
::
OpDesc
>>
ops
=
block_desc
->
Ops
();
for
(
int
j
=
0
;
j
<
ops
.
size
()
;
++
j
)
{
for
(
int
j
=
0
;
j
<
debug_to
;
++
j
)
{
std
::
shared_ptr
<
framework
::
OpDesc
>
op
=
ops
[
j
];
std
::
shared_ptr
<
framework
::
OpDesc
>
op
=
ops
[
j
];
DLOG
<<
"create op: "
<<
j
<<
" "
<<
op
->
Type
();
DLOG
<<
"create op: "
<<
j
<<
" "
<<
op
->
Type
();
auto
op_base
=
framework
::
OpRegistry
<
Dtype
>::
CreateOp
(
auto
op_base
=
framework
::
OpRegistry
<
Dtype
>::
CreateOp
(
...
@@ -414,7 +416,7 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
...
@@ -414,7 +416,7 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
}
}
}
}
#else
#else
for
(
int
i
=
0
;
i
<
ops
.
size
()
;
i
++
)
{
for
(
int
i
=
0
;
i
<
debug_to
;
i
++
)
{
#ifdef PADDLE_MOBILE_PROFILE
#ifdef PADDLE_MOBILE_PROFILE
struct
timespec
ts
;
struct
timespec
ts
;
clock_gettime
(
CLOCK_MONOTONIC
,
&
ts
);
clock_gettime
(
CLOCK_MONOTONIC
,
&
ts
);
...
@@ -428,6 +430,11 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
...
@@ -428,6 +430,11 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
#endif
#endif
}
}
#endif
#endif
DLOG
<<
" predict return nullptr"
;
return
nullptr
;
auto
last_op
=
ops
.
rbegin
();
auto
last_op
=
ops
.
rbegin
();
auto
output_map
=
(
*
last_op
)
->
Outputs
();
auto
output_map
=
(
*
last_op
)
->
Outputs
();
std
::
vector
<
std
::
string
>
out_keys
=
(
*
last_op
)
->
GetOutKeys
();
std
::
vector
<
std
::
string
>
out_keys
=
(
*
last_op
)
->
GetOutKeys
();
...
@@ -647,6 +654,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
...
@@ -647,6 +654,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
const
std
::
vector
<
Ptype
>
&
input
,
const
std
::
vector
<
int64_t
>
&
dims
)
{
const
std
::
vector
<
Ptype
>
&
input
,
const
std
::
vector
<
int64_t
>
&
dims
)
{
framework
::
Tensor
tensor
(
input
,
framework
::
make_ddim
(
dims
));
framework
::
Tensor
tensor
(
input
,
framework
::
make_ddim
(
dims
));
std
::
shared_ptr
<
framework
::
Tensor
>
output_tensor
=
Predict
(
tensor
,
0
);
std
::
shared_ptr
<
framework
::
Tensor
>
output_tensor
=
Predict
(
tensor
,
0
);
if
(
output_tensor
!=
nullptr
)
{
Executor
<
Dtype
,
P
>::
Ptype
*
output_ptr
=
Executor
<
Dtype
,
P
>::
Ptype
*
output_ptr
=
output_tensor
->
data
<
typename
Executor
<
Dtype
,
P
>::
Ptype
>
();
output_tensor
->
data
<
typename
Executor
<
Dtype
,
P
>::
Ptype
>
();
std
::
vector
<
typename
Executor
<
Dtype
,
P
>::
Ptype
>
result_vector
;
std
::
vector
<
typename
Executor
<
Dtype
,
P
>::
Ptype
>
result_vector
;
...
@@ -654,6 +662,10 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
...
@@ -654,6 +662,10 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
result_vector
.
push_back
(
output_ptr
[
j
]);
result_vector
.
push_back
(
output_ptr
[
j
]);
}
}
return
result_vector
;
return
result_vector
;
}
else
{
DLOG
<<
"return empty vector"
;
return
{};
}
}
}
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA
...
...
src/framework/operator.cpp
浏览文件 @
3eef149e
...
@@ -57,7 +57,10 @@ void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}
...
@@ -57,7 +57,10 @@ void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}
template
<
typename
Dtype
>
template
<
typename
Dtype
>
void
OperatorBase
<
Dtype
>::
Run
()
{
void
OperatorBase
<
Dtype
>::
Run
()
{
DLOG
<<
" begin run "
<<
type_
;
RunImpl
();
RunImpl
();
DLOG
<<
" end run "
<<
type_
;
return
;
#ifdef PADDLE_MOBILE_DEBUG
#ifdef PADDLE_MOBILE_DEBUG
DLOG
<<
"-------------"
<<
type_
<<
"----------------------------"
;
DLOG
<<
"-------------"
<<
type_
<<
"----------------------------"
;
vector
<
string
>
input_keys
=
GetInputKeys
();
vector
<
string
>
input_keys
=
GetInputKeys
();
...
@@ -100,8 +103,9 @@ void OperatorBase<Dtype>::Run() {
...
@@ -100,8 +103,9 @@ void OperatorBase<Dtype>::Run() {
#ifdef PADDLE_MOBILE_CL
#ifdef PADDLE_MOBILE_CL
if
(
type_
==
"fetch"
)
{
if
(
type_
==
"fetch"
)
{
Tensor
*
tensor
=
vari
->
template
GetMutable
<
framework
::
LoDTensor
>();
Tensor
*
tensor
=
vari
->
template
GetMutable
<
framework
::
LoDTensor
>();
if
(
tensor
)
if
(
tensor
)
{
DLOG
<<
type_
<<
" output- "
<<
key
<<
"="
<<
tensor
->
dims
();
DLOG
<<
type_
<<
" output- "
<<
key
<<
"="
<<
tensor
->
dims
();
}
}
else
{
}
else
{
CLImage
*
cl_image
=
vari
->
template
GetMutable
<
framework
::
CLImage
>();
CLImage
*
cl_image
=
vari
->
template
GetMutable
<
framework
::
CLImage
>();
// cl_command_queue commandQueue =
// cl_command_queue commandQueue =
...
...
src/operators/kernel/cl/cl_kernel/conv_kernel.cl
浏览文件 @
3eef149e
...
@@ -12,4 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,4 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See
the
License
for
the
specific
language
governing
permissions
and
See
the
License
for
the
specific
language
governing
permissions
and
limitations
under
the
License.
*/
limitations
under
the
License.
*/
#
include
"conv_kernel.inc.cl"
//#include
"conv_kernel.inc.cl"
__kernel
void
conv_3x3
()
{}
\ No newline at end of file
src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
浏览文件 @
3eef149e
...
@@ -24,9 +24,16 @@ namespace operators {
...
@@ -24,9 +24,16 @@ namespace operators {
template
<
>
template
<
>
bool
ConvAddBNReluKernel
<
GPU_CL
,
float
>::
Init
(
bool
ConvAddBNReluKernel
<
GPU_CL
,
float
>::
Init
(
FusionConvAddBNReluParam
<
GPU_CL
>
*
param
)
{
FusionConvAddBNReluParam
<
GPU_CL
>
*
param
)
{
PADDLE_MOBILE_ENFORCE
(
param
->
Filter
()
->
dims
()[
2
]
==
param
->
Filter
()
->
dims
()[
3
]
&&
param
->
Paddings
()[
0
]
==
param
->
Paddings
()[
1
],
"need equal"
);
param
->
Filter
()
->
InitCLImage
(
cl_helper_
.
CLContext
());
param
->
Bias
()
->
InitCLImage
(
cl_helper_
.
CLContext
());
// const CL *mean = param->InputMean();
// const CL *mean = param->InputMean();
const
framework
::
CLImage
*
mean
=
param
->
InputMean
();
const
framework
::
CLImage
*
mean
=
param
->
InputMean
();
const
framework
::
CLImage
*
variance
=
param
->
InputVariance
();
const
framework
::
CLImage
*
variance
=
param
->
InputVariance
();
const
framework
::
CLImage
*
scale
=
param
->
InputScale
();
const
framework
::
CLImage
*
scale
=
param
->
InputScale
();
const
framework
::
CLImage
*
bias
=
param
->
InputBias
();
const
framework
::
CLImage
*
bias
=
param
->
InputBias
();
...
@@ -52,9 +59,6 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
...
@@ -52,9 +59,6 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
new_bias_ptr
[
i
]
=
bias_ptr
[
i
]
-
mean_ptr
[
i
]
*
inv_std_ptr
[
i
]
*
scale_ptr
[
i
];
new_bias_ptr
[
i
]
=
bias_ptr
[
i
]
-
mean_ptr
[
i
]
*
inv_std_ptr
[
i
]
*
scale_ptr
[
i
];
}
}
delete
[](
new_scale_ptr
);
delete
[](
new_bias_ptr
);
framework
::
CLImage
*
new_scale
=
new
framework
::
CLImage
();
framework
::
CLImage
*
new_scale
=
new
framework
::
CLImage
();
new_scale
->
SetTensorData
(
new_scale_ptr
,
variance
->
dims
());
new_scale
->
SetTensorData
(
new_scale_ptr
,
variance
->
dims
());
...
@@ -68,6 +72,9 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
...
@@ -68,6 +72,9 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
param
->
SetNewScale
(
new_scale
);
param
->
SetNewScale
(
new_scale
);
param
->
SetNewBias
(
new_bias
);
param
->
SetNewBias
(
new_bias
);
delete
[](
new_scale_ptr
);
delete
[](
new_bias_ptr
);
PADDLE_MOBILE_ENFORCE
(
PADDLE_MOBILE_ENFORCE
(
param
->
Filter
()
->
dims
()[
2
]
==
param
->
Filter
()
->
dims
()[
3
]
&&
param
->
Filter
()
->
dims
()[
2
]
==
param
->
Filter
()
->
dims
()[
3
]
&&
param
->
Paddings
()[
0
]
==
param
->
Paddings
()[
1
],
param
->
Paddings
()[
0
]
==
param
->
Paddings
()[
1
],
...
...
src/operators/kernel/cl/conv_add_kernel.cpp
浏览文件 @
3eef149e
...
@@ -25,6 +25,9 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
...
@@ -25,6 +25,9 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
param
->
Filter
()
->
dims
()[
2
]
==
param
->
Filter
()
->
dims
()[
3
]
&&
param
->
Filter
()
->
dims
()[
2
]
==
param
->
Filter
()
->
dims
()[
3
]
&&
param
->
Paddings
()[
0
]
==
param
->
Paddings
()[
1
],
param
->
Paddings
()[
0
]
==
param
->
Paddings
()[
1
],
"need equal"
);
"need equal"
);
param
->
Filter
()
->
InitCLImage
(
cl_helper_
.
CLContext
());
param
->
Bias
()
->
InitCLImage
(
cl_helper_
.
CLContext
());
int
offset
=
static_cast
<
int
>
(
param
->
Filter
()
->
dims
()[
2
])
/
2
-
int
offset
=
static_cast
<
int
>
(
param
->
Filter
()
->
dims
()[
2
])
/
2
-
static_cast
<
int
>
(
param
->
Paddings
()[
1
]);
static_cast
<
int
>
(
param
->
Paddings
()[
1
]);
param
->
SetOffset
(
offset
);
param
->
SetOffset
(
offset
);
...
...
src/operators/kernel/cl/conv_kernel.cpp
浏览文件 @
3eef149e
...
@@ -26,18 +26,32 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
...
@@ -26,18 +26,32 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
param
->
Paddings
()[
0
]
==
param
->
Paddings
()[
1
],
param
->
Paddings
()[
0
]
==
param
->
Paddings
()[
1
],
"need equal"
);
"need equal"
);
param
->
Filter
()
->
InitCLImage
(
cl_helper_
.
CLContext
());
int
offset
=
static_cast
<
int
>
(
param
->
Filter
()
->
dims
()[
2
])
/
2
-
int
offset
=
static_cast
<
int
>
(
param
->
Filter
()
->
dims
()[
2
])
/
2
-
static_cast
<
int
>
(
param
->
Paddings
()[
1
]);
static_cast
<
int
>
(
param
->
Paddings
()[
1
]);
param
->
SetOffset
(
offset
);
param
->
SetOffset
(
offset
);
DLOG
<<
" init helper: "
<<
&
cl_helper_
;
DLOG
<<
" conv kernel add kernel ~ "
;
DLOG
<<
" width of one block: "
<<
param
->
Filter
()
->
WidthOfOneBlock
();
DLOG
<<
" height of one block: "
<<
param
->
Filter
()
->
HeightOfOneBlock
();
DLOG
<<
" filter dims: "
<<
param
->
Filter
()
->
dims
();
if
(
param
->
Filter
()
->
WidthOfOneBlock
()
==
1
&&
if
(
param
->
Filter
()
->
WidthOfOneBlock
()
==
1
&&
param
->
Filter
()
->
HeightOfOneBlock
()
==
1
)
{
param
->
Filter
()
->
HeightOfOneBlock
()
==
1
)
{
this
->
cl_helper_
.
AddKernel
(
"conv_1x1"
,
"conv_add_bn_relu_kernel.cl"
);
DLOG
<<
" here1 "
;
this
->
cl_helper_
.
AddKernel
(
"conv_1x1"
,
"conv_kernel.cl"
);
}
else
if
(
param
->
Filter
()
->
dims
()[
1
]
==
1
)
{
}
else
if
(
param
->
Filter
()
->
dims
()[
1
]
==
1
)
{
this
->
cl_helper_
.
AddKernel
(
"depth_conv_3x3"
,
"conv_add_bn_relu_kernel.cl"
);
DLOG
<<
" here2 "
;
this
->
cl_helper_
.
AddKernel
(
"depth_conv_3x3"
,
"conv_kernel.cl"
);
}
else
if
(
param
->
Filter
()
->
WidthOfOneBlock
()
==
3
&&
}
else
if
(
param
->
Filter
()
->
WidthOfOneBlock
()
==
3
&&
param
->
Filter
()
->
HeightOfOneBlock
()
==
3
)
{
param
->
Filter
()
->
HeightOfOneBlock
()
==
3
)
{
this
->
cl_helper_
.
AddKernel
(
"conv_3x3"
,
"conv_add_bn_relu_kernel.cl"
);
DLOG
<<
" here3 "
;
this
->
cl_helper_
.
AddKernel
(
"conv_3x3"
,
"conv_kernel.cl"
);
}
else
{
}
else
{
PADDLE_MOBILE_THROW_EXCEPTION
(
" not support "
);
PADDLE_MOBILE_THROW_EXCEPTION
(
" not support "
);
}
}
...
@@ -47,14 +61,27 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
...
@@ -47,14 +61,27 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
template
<
>
template
<
>
void
ConvKernel
<
GPU_CL
,
float
>::
Compute
(
const
ConvParam
<
GPU_CL
>
&
param
)
{
void
ConvKernel
<
GPU_CL
,
float
>::
Compute
(
const
ConvParam
<
GPU_CL
>
&
param
)
{
DLOG
<<
" Compute helper: "
<<
&
cl_helper_
;
DLOG
<<
" begin compute "
;
auto
kernel
=
this
->
cl_helper_
.
KernelAt
(
0
);
auto
kernel
=
this
->
cl_helper_
.
KernelAt
(
0
);
DLOG
<<
" get work size "
;
auto
default_work_size
=
this
->
cl_helper_
.
DefaultWorkSize
(
*
param
.
Output
());
auto
default_work_size
=
this
->
cl_helper_
.
DefaultWorkSize
(
*
param
.
Output
());
DLOG
<<
" end work size "
;
int
c_block
=
default_work_size
[
0
];
int
c_block
=
default_work_size
[
0
];
int
w
=
default_work_size
[
1
];
int
w
=
default_work_size
[
1
];
int
nh
=
default_work_size
[
2
];
int
nh
=
default_work_size
[
2
];
auto
input
=
param
.
Input
()
->
GetCLImage
();
auto
input
=
param
.
Input
()
->
GetCLImage
();
DLOG
<<
" get Input "
;
auto
filter
=
param
.
Filter
()
->
GetCLImage
();
auto
filter
=
param
.
Filter
()
->
GetCLImage
();
DLOG
<<
" get Filter "
;
auto
output
=
param
.
Output
();
auto
output
=
param
.
Output
();
DLOG
<<
" get Output "
;
int
stride
=
param
.
Strides
()[
0
];
int
stride
=
param
.
Strides
()[
0
];
int
offset
=
param
.
Offset
();
int
offset
=
param
.
Offset
();
int
input_c
=
param
.
Input
()
->
CBlock
();
int
input_c
=
param
.
Input
()
->
CBlock
();
...
@@ -64,6 +91,8 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> ¶m) {
...
@@ -64,6 +91,8 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> ¶m) {
cl_int
status
;
cl_int
status
;
DLOG
<<
" begin set kernel arg "
;
status
=
clSetKernelArg
(
kernel
,
0
,
sizeof
(
int
),
&
c_block
);
status
=
clSetKernelArg
(
kernel
,
0
,
sizeof
(
int
),
&
c_block
);
status
=
clSetKernelArg
(
kernel
,
1
,
sizeof
(
int
),
&
w
);
status
=
clSetKernelArg
(
kernel
,
1
,
sizeof
(
int
),
&
w
);
status
=
clSetKernelArg
(
kernel
,
2
,
sizeof
(
int
),
&
nh
);
status
=
clSetKernelArg
(
kernel
,
2
,
sizeof
(
int
),
&
nh
);
...
@@ -77,12 +106,18 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> ¶m) {
...
@@ -77,12 +106,18 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> ¶m) {
status
=
clSetKernelArg
(
kernel
,
10
,
sizeof
(
int
),
&
input_width
);
status
=
clSetKernelArg
(
kernel
,
10
,
sizeof
(
int
),
&
input_width
);
status
=
clSetKernelArg
(
kernel
,
11
,
sizeof
(
int
),
&
input_height
);
status
=
clSetKernelArg
(
kernel
,
11
,
sizeof
(
int
),
&
input_height
);
DLOG
<<
" end set kernel arg "
;
CL_CHECK_ERRORS
(
status
);
CL_CHECK_ERRORS
(
status
);
DLOG
<<
" begin enqueue "
;
status
=
status
=
clEnqueueNDRangeKernel
(
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
3
,
NULL
,
clEnqueueNDRangeKernel
(
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
3
,
NULL
,
default_work_size
.
data
(),
NULL
,
0
,
NULL
,
NULL
);
default_work_size
.
data
(),
NULL
,
0
,
NULL
,
NULL
);
DLOG
<<
" end enqueue "
;
CL_CHECK_ERRORS
(
status
);
CL_CHECK_ERRORS
(
status
);
}
}
...
...
src/operators/kernel/cl/depthwise_conv_kernel.cpp
浏览文件 @
3eef149e
...
@@ -27,6 +27,7 @@ bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
...
@@ -27,6 +27,7 @@ bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
param
->
Filter
()
->
dims
()[
2
]
==
param
->
Filter
()
->
dims
()[
3
]
&&
param
->
Filter
()
->
dims
()[
2
]
==
param
->
Filter
()
->
dims
()[
3
]
&&
param
->
Paddings
()[
0
]
==
param
->
Paddings
()[
1
],
param
->
Paddings
()[
0
]
==
param
->
Paddings
()[
1
],
"need equal"
);
"need equal"
);
param
->
Filter
()
->
InitCLImage
(
cl_helper_
.
CLContext
());
int
offset
=
static_cast
<
int
>
(
param
->
Filter
()
->
dims
()[
2
])
/
2
-
int
offset
=
static_cast
<
int
>
(
param
->
Filter
()
->
dims
()[
2
])
/
2
-
static_cast
<
int
>
(
param
->
Paddings
()[
1
]);
static_cast
<
int
>
(
param
->
Paddings
()[
1
]);
param
->
SetOffset
(
offset
);
param
->
SetOffset
(
offset
);
...
...
src/operators/op_param.h
浏览文件 @
3eef149e
...
@@ -948,6 +948,7 @@ class FetchParam : public OpParam {
...
@@ -948,6 +948,7 @@ class FetchParam : public OpParam {
input_x_
=
InputXFrom
<
GType
>
(
inputs
,
scope
);
input_x_
=
InputXFrom
<
GType
>
(
inputs
,
scope
);
out_
=
OutFrom
(
outputs
,
scope
);
out_
=
OutFrom
(
outputs
,
scope
);
}
}
const
RType
*
InputX
()
const
{
return
input_x_
;
}
const
RType
*
InputX
()
const
{
return
input_x_
;
}
Tensor
*
Out
()
const
{
return
out_
;
}
Tensor
*
Out
()
const
{
return
out_
;
}
...
...
test/net/test_mobilenet_GPU.cpp
浏览文件 @
3eef149e
...
@@ -34,23 +34,24 @@ int main() {
...
@@ -34,23 +34,24 @@ int main() {
GetInput
<
float
>
(
g_test_image_1x3x224x224_banana
,
&
input
,
dims
);
GetInput
<
float
>
(
g_test_image_1x3x224x224_banana
,
&
input
,
dims
);
auto
vec_result
=
paddle_mobile
.
Predict
(
input
,
dims
);
auto
vec_result
=
paddle_mobile
.
Predict
(
input
,
dims
);
std
::
vector
<
float
>::
iterator
biggest
=
// std::vector<float>::iterator biggest =
std
::
max_element
(
std
::
begin
(
vec_result
),
std
::
end
(
vec_result
));
// std::max_element(std::begin(vec_result), std::end(vec_result));
std
::
cout
<<
" Max element is "
<<
*
biggest
<<
" at position "
// std::cout << " Max element is " << *biggest << " at position "
<<
std
::
distance
(
std
::
begin
(
vec_result
),
biggest
)
<<
std
::
endl
;
// << std::distance(std::begin(vec_result), biggest) <<
// std::endl;
// 预热十次
for
(
int
i
=
0
;
i
<
10
;
++
i
)
{
// for (int i = 0; i < 10; ++i) {
auto
vec_result
=
paddle_mobile
.
Predict
(
input
,
dims
);
// auto vec_result = paddle_mobile.Predict(input, dims);
}
// }
auto
time3
=
paddle_mobile
::
time
();
// auto time3 = paddle_mobile::time();
for
(
int
i
=
0
;
i
<
10
;
++
i
)
{
// for (int i = 0; i < 10; ++i) {
auto
vec_result
=
paddle_mobile
.
Predict
(
input
,
dims
);
// auto vec_result = paddle_mobile.Predict(input, dims);
}
// }
DLOG
<<
vec_result
;
// DLOG << vec_result;
auto
time4
=
paddle_mobile
::
time
();
// auto time4 = paddle_mobile::time();
std
::
cout
<<
"predict cost :"
<<
paddle_mobile
::
time_diff
(
time3
,
time4
)
/
10
// std::cout << "predict cost :" << paddle_mobile::time_diff(time3,
<<
"ms"
<<
std
::
endl
;
// time4) / 10 << "ms"
// << std::endl;
}
}
std
::
cout
<<
"如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
std
::
cout
<<
"如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
...
...
tools/web-exporter/CMakeLists.txt
已删除
100644 → 0
浏览文件 @
761cc7f1
cmake_minimum_required
(
VERSION 3.6
)
project
(
web-exporter
)
set
(
CMAKE_CXX_STANDARD 11
)
file
(
GLOB PADDLE_MOBILE_CPP_FILES
"../../src/common/*.c"
"../../src/common/*.cpp"
"../../src/memory/*.cpp"
"../../src/framework/*.c"
"../../src/framework/*.cpp"
"../../src/framework/program/*.cpp"
"../../src/framework/program/program-optimize/*.cpp"
)
file
(
GLOB EXPORT_CPP_FILES
"*.cpp"
)
add_executable
(
web-exporter
${
PADDLE_MOBILE_CPP_FILES
}
${
EXPORT_CPP_FILES
}
)
target_include_directories
(
web-exporter PRIVATE
"../../src"
)
target_link_libraries
(
web-exporter
)
\ No newline at end of file
tools/web-exporter/export-nodejs.cpp
已删除
100644 → 0
浏览文件 @
761cc7f1
#include "export.h"
// Return a string of `i` spaces, used to pretty-print the generated JS.
// FIX: a negative width previously flowed through the int -> size_t
// conversion in std::string's fill constructor, requesting an enormous
// allocation (throws std::length_error / std::bad_alloc). Clamp to zero.
inline std::string indent(int i) {
  return std::string(static_cast<std::string::size_type>(i < 0 ? 0 : i), ' ');
}
void
export_nodejs
(
ProgramPtr
program
,
ScopePtr
scope
,
std
::
ostream
&
os
)
{
os
<<
"module.exports.program = {
\n
"
;
os
<<
indent
(
2
)
<<
var2str
(
"blocks"
)
<<
": [
\n
"
;
for
(
const
auto
&
block
:
program
->
Blocks
())
{
os
<<
indent
(
4
)
<<
"{
\n
"
;
os
<<
indent
(
6
)
<<
var2str
(
"vars"
)
<<
": {
\n
"
;
for
(
const
auto
&
var
:
block
->
Vars
())
{
const
auto
&
dim
=
var
->
Tensor_desc
().
Dims
();
os
<<
indent
(
8
)
<<
var2str
(
var
->
Name
())
<<
": {
\n
"
;
os
<<
indent
(
10
)
<<
var2str
(
"dim"
)
<<
": "
<<
var2str
(
dim
)
<<
",
\n
"
;
os
<<
indent
(
10
)
<<
var2str
(
"persistable"
)
<<
": "
<<
var2str
(
var
->
Persistable
())
<<
"
\n
"
;
os
<<
indent
(
8
)
<<
"},
\n
"
;
}
os
<<
indent
(
6
)
<<
"},
\n
"
;
os
<<
indent
(
6
)
<<
var2str
(
"ops"
)
<<
": [
\n
"
;
for
(
const
auto
&
op
:
block
->
Ops
())
{
os
<<
indent
(
8
)
<<
"{
\n
"
;
os
<<
indent
(
10
)
<<
var2str
(
"type"
)
<<
": "
<<
var2str
(
op
->
Type
())
<<
",
\n
"
;
os
<<
indent
(
10
)
<<
var2str
(
"inputs"
)
<<
": {
\n
"
;
for
(
const
auto
&
kv
:
op
->
GetInputs
())
{
os
<<
indent
(
12
)
<<
var2str
(
kv
.
first
)
<<
": "
<<
var2str
(
kv
.
second
)
<<
",
\n
"
;
}
os
<<
indent
(
10
)
<<
"},
\n
"
;
os
<<
indent
(
10
)
<<
var2str
(
"outputs"
)
<<
": {
\n
"
;
for
(
const
auto
&
kv
:
op
->
GetInputs
())
{
os
<<
indent
(
12
)
<<
var2str
(
kv
.
first
)
<<
": "
<<
var2str
(
kv
.
second
)
<<
",
\n
"
;
}
os
<<
indent
(
10
)
<<
"},
\n
"
;
os
<<
indent
(
10
)
<<
var2str
(
"attrs"
)
<<
": {
\n
"
;
for
(
const
auto
&
kv
:
op
->
GetAttrMap
())
{
os
<<
indent
(
12
)
<<
var2str
(
kv
.
first
)
<<
": "
;
os
<<
decltype
(
kv
.
second
)
::
ApplyVistor
(
VarVisitor
(),
kv
.
second
)
<<
",
\n
"
;
}
os
<<
indent
(
10
)
<<
"},
\n
"
;
os
<<
indent
(
8
)
<<
"},
\n
"
;
}
os
<<
indent
(
6
)
<<
"],
\n
"
;
os
<<
indent
(
4
)
<<
"},
\n
"
;
}
os
<<
indent
(
2
)
<<
"]
\n
"
;
os
<<
"}
\n
"
;
}
tools/web-exporter/export-scope.cpp
已删除
100644 → 0
浏览文件 @
761cc7f1
#include <cstdio>
#include "export.h"
// Dump every persistable variable in `program` (except the special
// "feed"/"fetch" vars) as a raw float32 blob, one file per variable,
// into directory `dirname`.
void export_scope(ProgramPtr program, ScopePtr scope,
                  const std::string& dirname) {
  for (const auto& block : program->Blocks()) {
    for (const auto& var : block->Vars()) {
      if (var->Name() == "feed" || var->Name() == "fetch") {
        continue;
      }
      if (var->Persistable()) {
        auto* v = scope->FindVar(var->Name());
        assert(v != nullptr);

        // Element count = product of the declared dims.
        int count = 1;
        for (auto n : var->Tensor_desc().Dims()) {
          count *= n;
        }

        auto* tensor = v->GetMutable<paddle_mobile::framework::LoDTensor>();
        // NOTE(review): mutable_data<float>() is used read-only here;
        // presumably the tensor is already materialized by the executor's
        // init path — confirm a const accessor isn't available.
        const float* p = tensor->mutable_data<float>();
        assert(p != nullptr);

        std::string para_file_name = dirname + '/' + var->Name();
        // BUG FIX: open in binary mode ("wb"); text mode ("w") would
        // mangle the raw float bytes on platforms that translate newlines.
        FILE* para_file = fopen(para_file_name.c_str(), "wb");
        assert(para_file != nullptr);  // was previously unchecked
        size_t written = fwrite(p, sizeof(float), count, para_file);
        assert(written == static_cast<size_t>(count));  // detect short writes
        fclose(para_file);
      }
    }
  }
}
tools/web-exporter/export.cpp
已删除
100644 → 0
浏览文件 @
761cc7f1
#include "export.h"
#include <sys/stat.h>
#include <sys/types.h>
// Minimal Executor subclass whose only purpose is to drive the base
// class's parameter-loading path (InitMemory / InitCombineMemory) so the
// program's scope gets populated with weights; it never runs Predict.
class FakeExecutor
    : public paddle_mobile::framework::Executor<
          paddle_mobile::CPU, paddle_mobile::Precision::FP32> {
 public:
  // FIX: take the Program by const reference — the original signature
  // took it by value, forcing an extra copy before the member assignment.
  // `explicit` prevents accidental implicit conversion from Program.
  explicit FakeExecutor(
      const paddle_mobile::framework::Program<paddle_mobile::CPU>& p) {
    program_ = p;
    batch_size_ = 1;
    use_optimize_ = true;
    loddable_ = false;
    // use_optimize_ is hard-coded true above; the else branch is retained
    // only for parity with the real Executor's constructor.
    if (use_optimize_) {
      to_predict_program_ = program_.optimizeProgram;
    } else {
      to_predict_program_ = program_.originProgram;
    }
    auto* variable_ptr = program_.scope->Var("batch_size");
    variable_ptr[0].SetValue<int>(1);
    if (program_.combined) {
      InitCombineMemory();
    } else {
      InitMemory();
    }
  }
};
// Entry point: load a combined paddle-mobile model and emit
//   <output-dir>/model.js  — program structure as a nodejs module
//   <output-dir>/paras/*   — raw float32 parameter blobs
int main(int argc, char** argv) {
  if (argc != 3) {
    // FIX: typo "modle" -> "model" in the usage string.
    std::cout << "Usage: " << argv[0] << " <combined-model-dir> <output-dir>\n";
    return -1;
  }
  std::string model_dir = argv[1];
  std::string model_path = model_dir + "/model";
  std::string para_path = model_dir + "/params";

  std::string out_dir = argv[2];
  std::string out_model_js = out_dir + "/model.js";
  std::string out_para_dir = out_dir + "/paras";
  // NOTE(review): mkdir failures (other than "already exists") are not
  // fatal here; export_scope will assert when it cannot write.
  mkdir(out_dir.c_str(), S_IRWXU | S_IRWXG | S_IRWXO);
  mkdir(out_para_dir.c_str(), S_IRWXU | S_IRWXG | S_IRWXO);

  std::cout << "loading " << model_path << " & " << para_path << "\n";
  paddle_mobile::framework::Loader<> loader;
  auto program = loader.Load(model_path, para_path, true);
  // Runs the base Executor's init path to populate program.scope.
  FakeExecutor executor(program);
  auto optimizedProgram = program.optimizeProgram;
  export_scope(optimizedProgram, program.scope, out_para_dir);

  std::ofstream fs(out_model_js.c_str());
  if (!fs) {
    // FIX: previously an unopenable output file silently produced no model.js.
    std::cout << "cannot open " << out_model_js << " for writing\n";
    return -1;
  }
  export_nodejs(optimizedProgram, program.scope, fs);
  fs.close();
  return 0;
}
tools/web-exporter/export.h
已删除
100644 → 0
浏览文件 @
761cc7f1
#pragma once
#include <iostream>
#include <vector>
#include <memory>
#include <string>
#include <ostream>
#include <fstream>
#include "framework/loader.h"
#include "framework/executor.h"
#include "framework/scope.h"
#include "framework/program/program_desc.h"
// using paddle_mobile::framework::ProgramDesc;
// using paddle_mobile::framework::Scope;
// Shared aliases for the exporter tools: both export-nodejs.cpp and
// export-scope.cpp operate on a loaded ProgramDesc plus its Scope.
using ProgramPtr = std::shared_ptr<paddle_mobile::framework::ProgramDesc>;
using ScopePtr = std::shared_ptr<paddle_mobile::framework::Scope>;

// Write the program structure as a nodejs module to `os` (defaults to stdout).
void export_nodejs(ProgramPtr program, ScopePtr scope,
                   std::ostream& os = std::cout);
// Dump persistable parameters as binary files under `dirname`
// (defaults to the current directory).
void export_scope(ProgramPtr program, ScopePtr scope,
                  const std::string& dirname = ".");
// Fallback conversion: anything std::to_string understands (arithmetic
// types) is rendered verbatim, without quotes.
template <typename T>
inline std::string var2str(const T& value) {
  return std::to_string(value);
}
// Quote a string for embedding in the generated JS source.
// FIX: escape backslashes and double quotes inside the value; the
// previous version emitted them raw, producing syntactically broken
// output for any name containing '"' or '\'.
template <>
inline std::string var2str(const std::string& v) {
  std::string out;
  out.reserve(v.size() + 2);
  out += '"';
  for (char c : v) {
    if (c == '"' || c == '\\') {
      out += '\\';
    }
    out += c;
  }
  out += '"';
  return out;
}
inline
std
::
string
var2str
(
const
char
*
v
)
{
return
var2str
<
std
::
string
>
(
v
);
}
// Booleans become the JS literals "true"/"false" (unquoted).
inline std::string var2str(const bool v) {
  if (v) {
    return "true";
  }
  return "false";
}
// Render a vector as a JS array literal, e.g. [1, 2, 3], recursing into
// var2str for each element.
// FIX: the loop compared a signed `int` index against v.size() (size_t),
// a signed/unsigned mismatch that overflows for huge vectors; use the
// container's size_type throughout.
template <typename T>
std::string var2str(const std::vector<T>& v) {
  std::string r = "[";
  for (typename std::vector<T>::size_type i = 0; i < v.size(); i++) {
    if (i) r += ", ";
    r += var2str(v[i]);
  }
  return r + "]";
}
// Visitor handed to Attribute's ApplyVistor [sic]: converts whichever
// concrete attribute type the variant currently holds into its string
// form via the var2str overload set.
struct VarVisitor {
  // Result type of every conversion (std::string, deduced from var2str(0)).
  using type_t = decltype(var2str(0));

  template <typename T>
  type_t operator()(const T& v) {
    return var2str(v);
  }
};
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录