Unverified commit 9d3e4c88, authored by BUG1989, committed by GitHub

Initial Vulkan implementation, ported from ncnn (#424)

Parent bf6d9617
......@@ -71,6 +71,7 @@ option(TENGINE_ARCH_ARM_82 "build armv8.2 for arm" OFF)
# some plugin options
option(TENGINE_ENABLE_ACL "Build with Arm Compute Library(ACL) support" OFF)
option(TENGINE_ENABLE_VULKAN "Build with Vulkan GPU compute support" OFF)
# add_definitions(-DCONFIG_DISABLE_PARAM_ACCESS)
# add_definitions(-DCONFIG_INTERN_ALLOCATOR)
......
This diff is collapsed.
......@@ -12,10 +12,17 @@ macro (tengine_example name file)
install (TARGETS ${name} DESTINATION bin)
endmacro()
# add examples
# add c++ api examples
if (TENGINE_BUILD_CPP_API)
tengine_example(cpp_tm_classification cpp_tm_classification.cpp)
tengine_example(cpp_tm_mobilenet_ssd cpp_tm_mobilenet_ssd.cpp)
endif()
# add c api examples
tengine_example(tm_classification tm_classification.c)
tengine_example(tm_classification_fp16 tm_classification_fp16.c)
tengine_example(tm_classification_uint8 tm_classification_uint8.c)
tengine_example(tm_classification_vulkan tm_classification_vulkan.c)
tengine_example(tm_mobilenet_ssd tm_mobilenet_ssd.c)
tengine_example(tm_mobilenet_ssd_uint8 tm_mobilenet_ssd_uint8.cpp)
tengine_example(tm_retinaface tm_retinaface.cpp)
......@@ -40,6 +47,7 @@ if (${TENGINE_TARGET_PROCESSOR} MATCHES "X86")
"${CMAKE_CURRENT_SOURCE_DIR}/${file}"
"${CMAKE_CURRENT_SOURCE_DIR}/common/tengine_operations.c")
target_link_libraries(${name} ${CMAKE_PROJECT_NAME} ${OpenCV_LIBS})
install (TARGETS ${name} DESTINATION bin)
endmacro()
tengine_example_cv(tm_openpose tm_openpose.cpp)
tengine_example_cv(tm_yolact tm_yolact.cpp)
......
......@@ -47,7 +47,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
/* set runtime options */
struct options opt;
opt.num_thread = num_thread;
opt.cluster = TENGINE_CLUSTER_LITTLE;
opt.cluster = TENGINE_CLUSTER_ALL;
opt.precision = TENGINE_MODE_FP32;
/* initialize tengine */
......@@ -67,7 +67,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
return -1;
}
/* set the input shape to initialize the graph, and prerun the graph to infer shapes */
/* set the shape and data buffer of the graph's input tensor */
int img_size = img_h * img_w * 3;
int dims[] = {1, 3, img_h, img_w}; // nchw
float* input_data = ( float* )malloc(img_size * sizeof(float));
......@@ -85,6 +85,13 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
return -1;
}
if (set_tensor_buffer(input_tensor, input_data, img_size * 4) < 0)
{
fprintf(stderr, "Set input tensor buffer failed\n");
return -1;
}
/* prerun graph, set work options(num_thread, cluster, precision) */
if (prerun_graph_multithread(graph, opt) < 0)
{
fprintf(stderr, "Prerun multithread graph failed.\n");
......@@ -93,11 +100,6 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
/* prepare the processed input data and set it as the input tensor's data buffer */
get_input_data(image_file, input_data, img_h, img_w, mean, scale);
if (set_tensor_buffer(input_tensor, input_data, img_size * 4) < 0)
{
fprintf(stderr, "Set input tensor buffer failed\n");
return -1;
}
/* run graph */
double min_time = __DBL_MAX__;
......@@ -137,8 +139,6 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
/* release tengine */
free(input_data);
release_graph_tensor(input_tensor);
release_graph_tensor(output_tensor);
postrun_graph(graph);
destroy_graph(graph);
release_tengine();
......
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2020, OPEN AI LAB
* Author: qtang@openailab.com
*/
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include "common.h"
#include "tengine_c_api.h"
#include "tengine_operations.h"
#define DEFAULT_IMG_H 227
#define DEFAULT_IMG_W 227
#define DEFAULT_SCALE1 1.f
#define DEFAULT_SCALE2 1.f
#define DEFAULT_SCALE3 1.f
#define DEFAULT_MEAN1 104.007
#define DEFAULT_MEAN2 116.669
#define DEFAULT_MEAN3 122.679
#define DEFAULT_LOOP_COUNT 1
#define DEFAULT_THREAD_COUNT 1
int tengine_classify(const char* model_file, const char* image_file, int img_h, int img_w, const float* mean,
const float* scale, int loop_count, int num_thread)
{
/* set runtime options */
struct options opt;
opt.num_thread = num_thread;
opt.cluster = TENGINE_CLUSTER_ALL;
opt.precision = TENGINE_MODE_FP32;
/* initialize tengine */
if (init_tengine() != 0)
{
fprintf(stderr, "Initial tengine failed.\n");
return -1;
}
fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version());
/* create graph, load tengine model xxx.tmfile */
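/* create a Vulkan ("VK") device context and attach the graph to it, so that supported operators run on the Vulkan backend */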
context_t vk_context = create_context("VK", 1);
add_context_device(vk_context, "VK");
graph_t graph = create_graph(vk_context, "tengine", model_file);
set_graph_device(graph, "VK");
if (NULL == graph)
{
fprintf(stderr, "Create graph failed.\n");
fprintf(stderr, "errno: %d \n", get_tengine_errno());
return -1;
}
/* set the input shape to initialize the graph, and prerun the graph to infer shapes */
int img_size = img_h * img_w * 3;
int dims[] = {1, 3, img_h, img_w}; // nchw
float* input_data = ( float* )malloc(img_size * sizeof(float));
tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
if (input_tensor == NULL)
{
fprintf(stderr, "Get input tensor failed\n");
return -1;
}
if (set_tensor_shape(input_tensor, dims, 4) < 0)
{
fprintf(stderr, "Set input tensor shape failed\n");
return -1;
}
if (prerun_graph_multithread(graph, opt) < 0)
{
fprintf(stderr, "Prerun multithread graph failed.\n");
return -1;
}
/* prepare the processed input data and set it as the input tensor's data buffer */
get_input_data(image_file, input_data, img_h, img_w, mean, scale);
if (set_tensor_buffer(input_tensor, input_data, img_size * 4) < 0)
{
fprintf(stderr, "Set input tensor buffer failed\n");
return -1;
}
/* run graph */
double min_time = __DBL_MAX__;
double max_time = -__DBL_MAX__;
double total_time = 0.;
for (int i = 0; i < loop_count; i++)
{
double start = get_current_time();
if (run_graph(graph, 1) < 0)
{
fprintf(stderr, "Run graph failed\n");
return -1;
}
double end = get_current_time();
double cur = end - start;
total_time += cur;
if (min_time > cur)
min_time = cur;
if (max_time < cur)
max_time = cur;
}
fprintf(stderr, "\nmodel file : %s\n", model_file);
fprintf(stderr, "image file : %s\n", image_file);
fprintf(stderr, "img_h, img_w, scale[3], mean[3] : %d %d , %.3f %.3f %.3f, %.1f %.1f %.1f\n", img_h, img_w,
scale[0], scale[1], scale[2], mean[0], mean[1], mean[2]);
fprintf(stderr, "Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", loop_count,
num_thread, total_time / loop_count, max_time, min_time);
fprintf(stderr, "--------------------------------------\n");
/* get the result of classification */
tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
float* output_data = ( float* )get_tensor_buffer(output_tensor);
int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);
print_topk(output_data, output_size, 5);
fprintf(stderr, "--------------------------------------\n");
/* release tengine */
free(input_data);
release_graph_tensor(input_tensor);
release_graph_tensor(output_tensor);
postrun_graph(graph);
destroy_graph(graph);
release_tengine();
return 0;
}
void show_usage()
{
fprintf(
stderr,
"[Usage]: [-h]\n [-m model_file] [-i image_file]\n [-g img_h,img_w] [-s scale[0],scale[1],scale[2]] [-w "
"mean[0],mean[1],mean[2]] [-r loop_count] [-t thread_count]\n");
fprintf(
stderr,
"\nmobilenet example: \n ./classification -m /path/to/mobilenet.tmfile -i /path/to/img.jpg -g 224,224 -s "
"0.017,0.017,0.017 -w 104.007,116.669,122.679\n");
}
int main(int argc, char* argv[])
{
int loop_count = DEFAULT_LOOP_COUNT;
int num_thread = DEFAULT_THREAD_COUNT;
char* model_file = NULL;
char* image_file = NULL;
float img_hw[2] = {0.f};
int img_h = 0;
int img_w = 0;
float mean[3] = {-1.f, -1.f, -1.f};
float scale[3] = {0.f, 0.f, 0.f};
int res;
while ((res = getopt(argc, argv, "m:i:l:g:s:w:r:t:h")) != -1)
{
switch (res)
{
case 'm':
model_file = optarg;
break;
case 'i':
image_file = optarg;
break;
case 'g':
split(img_hw, optarg, ",");
img_h = ( int )img_hw[0];
img_w = ( int )img_hw[1];
break;
case 's':
split(scale, optarg, ",");
break;
case 'w':
split(mean, optarg, ",");
break;
case 'r':
loop_count = atoi(optarg);
break;
case 't':
num_thread = atoi(optarg);
break;
case 'h':
show_usage();
return 0;
default:
break;
}
}
/* check files */
if (model_file == NULL)
{
fprintf(stderr, "Error: Tengine model file not specified!\n");
show_usage();
return -1;
}
if (image_file == NULL)
{
fprintf(stderr, "Error: Image file not specified!\n");
show_usage();
return -1;
}
if (!check_file_exist(model_file) || !check_file_exist(image_file))
return -1;
if (img_h == 0)
{
img_h = DEFAULT_IMG_H;
fprintf(stderr, "Image height not specified, use default %d\n", img_h);
}
if (img_w == 0)
{
img_w = DEFAULT_IMG_W;
fprintf(stderr, "Image width not specified, use default %d\n", img_w);
}
if (scale[0] == 0.f || scale[1] == 0.f || scale[2] == 0.f)
{
scale[0] = DEFAULT_SCALE1;
scale[1] = DEFAULT_SCALE2;
scale[2] = DEFAULT_SCALE3;
fprintf(stderr, "Scale value not specified, use default %.1f, %.1f, %.1f\n", scale[0], scale[1], scale[2]);
}
if (mean[0] == -1.0 || mean[1] == -1.0 || mean[2] == -1.0)
{
mean[0] = DEFAULT_MEAN1;
mean[1] = DEFAULT_MEAN2;
mean[2] = DEFAULT_MEAN3;
fprintf(stderr, "Mean value not specified, use default %.1f, %.1f, %.1f\n", mean[0], mean[1], mean[2]);
}
if (tengine_classify(model_file, image_file, img_h, img_w, mean, scale, loop_count, num_thread) < 0)
return -1;
return 0;
}
......@@ -42,7 +42,7 @@
#define OP_CONV_NAME "Convolution"
#define OP_CONST_NAME "Const"
#define OP_CROP_NAME "Crop"
#define OP_DECONV_NAME "DeConv"
#define OP_DECONV_NAME "Deconvolution"
#define OP_DEPTHTOSPACE_NAME "Depthtospace"
#define OP_DETECTION_OUTPUT_NAME "DetectionOutput"
#define OP_DETECTION_POSTPROCESS_NAME "DetectionPostProcess"
......@@ -100,9 +100,9 @@
#define OP_SOFTMAX_NAME "Softmax"
#define OP_SPACETOBATCHND_NAME "Spacetobatchnd"
#define OP_SPACETODEPTH_NAME "Spacetodepth"
#define OP_SPARSETODENSE_NAME "Sparsetodense"
#define OP_SPARSETODENSE_NAME "SparseToDense"
#define OP_SPLIT_NAME "Split"
#define OP_SQUAREDDIFFERENCE_NAME "Squareddifference"
#define OP_SQUAREDDIFFERENCE_NAME "SquaredDifference"
#define OP_SQUEEZE_NAME "Squeeze"
#define OP_STRIDEDSLICE_NAME "StridedSlice"
#define OP_SWAP_AXIS_NAME "SwapAxis"
......
......@@ -83,8 +83,40 @@ if (TENGINE_ENABLE_ACL)
endif ()
# add vulkan support
if (TENGINE_ENABLE_VULKAN)
add_subdirectory(dev/vulkan)
list(APPEND TENGINE_PRIVATE_INC_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/dev/vulkan)
list(APPEND TENGINE_PRIVATE_INC_DIRS ${CMAKE_CURRENT_BINARY_DIR}/dev/vulkan)
list(APPEND TENGINE_PRIVATE_INC_DIRS ${Vulkan_INCLUDE_DIR})
list(APPEND TENGINE_VULKAN_LIB_DIRS $ENV{VULKAN_SDK}/lib)
link_directories(${TENGINE_VULKAN_LIB_DIRS})
if(TENGINE_VERBOSE)
message (STATUS "TENGINE: 'TENGINE_VULKAN_LIB_DIRS' is ${TENGINE_VULKAN_LIB_DIRS}.")
endif()
endif()
# add VeriSilicon VIP8000 NPU support
if (TENGINE_ENABLE_VIPNPU)
list(APPEND TENGINE_PRIVATE_INC_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/dev/vxnpu)
list(APPEND TENGINE_PRIVATE_INC_DIRS ${CMAKE_SOURCE_DIR}/3rdparty/vsi_sdk/include)
list(APPEND TENGINE_PRIVATE_INC_DIRS ${CMAKE_SOURCE_DIR}/3rdparty/acuity-ovxlib-dev/include)
list(APPEND TENGINE_VIPNPU_LIB_DIRS ${CMAKE_SOURCE_DIR}/3rdparty/acuity-ovxlib-dev/lib)
link_directories(${TENGINE_VIPNPU_LIB_DIRS})
if(TENGINE_VERBOSE)
message (STATUS "TENGINE: 'TENGINE_VIPNPU_LIB_DIRS' is ${TENGINE_VIPNPU_LIB_DIRS}.")
endif()
file(GLOB_RECURSE TENGINE_BACKEND_VXNPU_BASE "${CMAKE_CURRENT_SOURCE_DIR}/dev/vxnpu/*.c")
file(GLOB_RECURSE TENGINE_BACKEND_VXNPU_OPS "${CMAKE_CURRENT_SOURCE_DIR}/dev/vxnpu/op/*.c")
endif ()
# add libraries path
list(APPEND TENGINE_PRIVATE_LIB_DIRS ${TENGINE_ACL_LIB_DIRS})
list(APPEND TENGINE_PRIVATE_LIB_DIRS ${TENGINE_VULKAN_LIB_DIRS})
set(TENGINE_COMMON_LIB_DIRS ${TENGINE_PRIVATE_LIB_DIRS} CACHE INTERNAL "" FORCE)
link_directories(${TENGINE_PRIVATE_LIB_DIRS})
......@@ -98,7 +130,9 @@ if (${TENGINE_TARGET_PROCESSOR} MATCHES "ARM")
${TENGINE_BACKEND_REF_OPS}
${TENGINE_BACKEND_HCL_OPS}
${TENGINE_BACKEND_HCL_ASM_OPS}
${TENGINE_BACKEND_ACL_BASE})
${TENGINE_BACKEND_ACL_BASE}
${TENGINE_BACKEND_VULKAN_BASE}
${TENGINE_BACKEND_VULKAN_OPS})
elseif (${TENGINE_TARGET_PROCESSOR} MATCHES "X86")
add_library(${CMAKE_PROJECT_NAME} SHARED
${TENGINE_LIB_SRCS} ${TENGINE_FRONT_END_SRCS}
......@@ -106,7 +140,9 @@ elseif (${TENGINE_TARGET_PROCESSOR} MATCHES "X86")
${TENGINE_TINY_SERIALIZER_SRCS}
${TENGINE_BACKEND_COMMON}
${TENGINE_BACKEND_REF_OPS}
${TENGINE_BACKEND_HCL_OPS})
${TENGINE_BACKEND_HCL_OPS}
${TENGINE_BACKEND_VULKAN_BASE}
${TENGINE_BACKEND_VULKAN_OPS})
else()
add_library(${CMAKE_PROJECT_NAME} SHARED
${TENGINE_LIB_SRCS}
......@@ -123,7 +159,6 @@ if(TENGINE_VERBOSE)
message (STATUS "TENGINE: 'TENGINE_LINKING_LIBS' is ${TENGINE_LINKING_LIBS}.")
endif()
# add include path
target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE ${TENGINE_PRIVATE_INC_DIRS})
......@@ -136,7 +171,12 @@ endif()
if (TENGINE_ENABLE_ACL)
target_link_libraries(${CMAKE_PROJECT_NAME} arm_compute arm_compute_core)
message("${CMAKE_PROJECT_NAME}")
endif ()
if (TENGINE_ENABLE_VULKAN)
target_link_libraries(${CMAKE_PROJECT_NAME} ${Vulkan_LIBRARY})
message("===== vulkan library === ${Vulkan_LIBRARY}")
add_dependencies(${CMAKE_PROJECT_NAME} generate-spirv)
endif ()
install (TARGETS ${CMAKE_PROJECT_NAME} DESTINATION lib)
install (FILES ${CMAKE_CURRENT_SOURCE_DIR}/../include/tengine_c_api.h DESTINATION include)
find_package(Vulkan REQUIRED)
# TODO: move to check.cmake
find_program(GLSLANGVALIDATOR_EXECUTABLE NAMES glslangValidator PATHS $ENV{VULKAN_SDK}/bin NO_CMAKE_FIND_ROOT_PATH REQUIRED)
message(STATUS "Tengine: found glslangValidator: ${GLSLANGVALIDATOR_EXECUTABLE}")
# add shader spv header generate macro
include(${CMAKE_SOURCE_DIR}/cmake/generate_shader_spv_header.cmake)
macro(add_shader SHADER_SRC)
message(STATUS "SHADER_SRC: ${SHADER_SRC}")
generate_shader_spv_header(SHADER_SPV_HEADER SHADER_SPV_HEX_HEADERS ${SHADER_SRC})
get_filename_component(SHADER_SPV_HEADER_NAME ${SHADER_SPV_HEADER} NAME)
string(APPEND layer_shader_spv_data "#include \"${SHADER_SPV_HEADER_NAME}\"\n")
get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE)
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_spv_data,sizeof(${SHADER_SRC_NAME_WE}_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16p_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16pa_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16s_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16sa_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16p_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16s_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data)},\n")
list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEADER})
list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEX_HEADERS})
# generate layer_shader_type_enum file
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE} = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
endmacro()
macro(add_layer class)
string(TOLOWER ${class} name)
file(GLOB_RECURSE SHADER_SRCS "shaders/${name}.comp")
file(GLOB_RECURSE SHADER_SUBSRCS "shaders/${name}_*.comp")
list(APPEND SHADER_SRCS ${SHADER_SUBSRCS})
foreach(SHADER_SRC ${SHADER_SRCS})
add_shader(${SHADER_SRC})
endforeach()
# generate layer_type_enum file
set(layer_type_enum "${layer_type_enum}${class} = ${__LAYER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_TYPE_ENUM_INDEX "${__LAYER_TYPE_ENUM_INDEX}+1")
endmacro()
set(SHADER_SPV_HEX_FILES)
set(__LAYER_TYPE_ENUM_INDEX 0)
set(__LAYER_SHADER_TYPE_ENUM_INDEX 0)
add_layer(Convolution)
add_layer(ConvolutionDepthWise)
add_layer(Pooling)
add_layer(Padding)
add_layer(Packing)
add_layer(InnerProduct)
add_layer(Flatten)
add_layer(Relu)
add_layer(Eltwise)
add_layer(Softmax)
add_layer(Dropout)
add_layer(PriorBox)
add_layer(Permute)
add_layer(Reshape)
add_layer(Concat)
add_layer(Interp)
add_layer(Crop)
add_custom_target(generate-spirv DEPENDS ${SHADER_SPV_HEX_FILES})
# create new registry file
configure_file(layer_shader_registry.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_shader_registry.h)
configure_file(layer_shader_spv_data.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_shader_spv_data.h)
configure_file(layer_type_enum.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_type_enum.h)
configure_file(layer_shader_type_enum.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_shader_type_enum.h)
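# For reference only: given the fragments accumulated by add_shader()/add_layer() above,
# the configured headers roughly contain entries of the form
#   layer_type_enum.h:          Convolution = 0, ConvolutionDepthWise = 1, ...
#   layer_shader_type_enum.h:   convolution = 0, convolution_fp16p = 1, convolution_fp16pa = 2, ...
#   layer_shader_registry.h:    {convolution_spv_data,sizeof(convolution_spv_data)}, ...
# (one registry entry and one enum value per shader variant; the surrounding enum/array
# declarations come from the corresponding *.h.in templates, which are not shown here)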
# message(STATUS "Tengine: add vulkan layer ${SHADER_SPV_HEX_FILES}")
set(CMAKE_SHARED_LINKER_FLAGS "-Bsymbolic -Bsymbolic-functions")
file(GLOB TENGINE_BACKEND_VULKAN_BASE "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp")
file(GLOB TENGINE_BACKEND_VULKAN_OPS "${CMAKE_CURRENT_SOURCE_DIR}/layer/*.cpp")
set(TENGINE_BACKEND_VULKAN_BASE ${TENGINE_BACKEND_VULKAN_BASE} CACHE INTERNAL " " FORCE)
set(TENGINE_BACKEND_VULKAN_OPS ${TENGINE_BACKEND_VULKAN_OPS} CACHE INTERNAL " " FORCE)
This diff is collapsed.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Parts of the following code in this file refs to
* https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/
* Tencent is pleased to support the open source community by making ncnn
* available.
*
* Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
*
* Licensed under the BSD 3-Clause License (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the
* License at
*
* https://opensource.org/licenses/BSD-3-Clause
*/
/*
* Copyright (c) 2020, Open AI Lab
* Author: ddzhao@openailab.com
*/
#ifndef LAYER_CONCAT_HPP
#define LAYER_CONCAT_HPP
#include "../vulkan_layer.hpp"
#include "../vulkan_command.hpp"
#include "concat_param.h"
namespace TEngine{
class Concat_vulkan : public Layer
{
public:
Concat_vulkan();
Concat_vulkan(ir_graph* ir_graph, ir_node* ir_node);
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
virtual int record_pipeline(const std::vector<VkTensor>& bottom_blobs, std::vector<VkTensor>& top_blobs, VkCompute& cmd, const Option& opt) const;
public:
Pipeline* pipeline_concat[2];
Pipeline* pipeline_concat_pack4[2];
Pipeline* pipeline_concat_pack4to1[2];
Pipeline* pipeline_concat_pack8[2];
Pipeline* pipeline_concat_pack8to4[2];
Pipeline* pipeline_concat_pack8to1[2];
public:
int input_c;
int input_h;
int input_w;
int output_c;
int output_h;
int output_w;
int axis;
};
} // namespace TEngine
#endif
\ No newline at end of file
This diff is collapsed.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Parts of the following code in this file refs to
* https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/
* Tencent is pleased to support the open source community by making ncnn
* available.
*
* Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
*
* Licensed under the BSD 3-Clause License (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the
* License at
*
* https://opensource.org/licenses/BSD-3-Clause
*/
/*
* Copyright (c) 2020, Open AI Lab
* Author: ddzhao@openailab.com
*/
#ifndef LAYER_CONVOLUTION_HPP
#define LAYER_CONVOLUTION_HPP
#include "padding_vulkan.hpp"
#include "innerproduct_vulkan.hpp"
#include "../vulkan_layer.hpp"
#include "../vulkan_command.hpp"
#include "convolution_param.h"
namespace TEngine {
class Convolution_vulkan : public Layer
{
public:
Convolution_vulkan();
// Convolution_vulkan(ir_node* node);
Convolution_vulkan(ir_graph* graph, ir_node* node);
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
virtual int upload_model(VkTransfer& cmd, const Option& opt);
// virtual int record_pipeline(VkCompute& cmd, const Option& opt) const;
virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const;
public:
int group;
int input_c;
int input_h;
int input_w;
int pad_w0; // left padding columns
int pad_w1; // right padding columns
int pad_h0; // top padding rows
int pad_h1; // bottom padding rows
int stride_h;
int stride_w;
int dilation_h;
int dilation_w;
int kernel_h;
int kernel_w;
int activation;
int output_c;
int output_h;
int output_w;
int weight_data_size;
public:
Padding_vulkan* padding;
InnerProduct_vulkan* innerproduct;
VkTensor weight_data_gpu;
VkImageTensor weight_data_gpu_image;
VkTensor bias_data_gpu;
Pipeline* pipeline_convolution;
Pipeline* pipeline_convolution_pack4;
Pipeline* pipeline_convolution_pack8;
Pipeline* pipeline_convolution_pack1to4;
Pipeline* pipeline_convolution_pack4to1;
Pipeline* pipeline_convolution_pack1to8;
Pipeline* pipeline_convolution_pack4to8;
Pipeline* pipeline_convolution_pack8to1;
Pipeline* pipeline_convolution_pack8to4;
Pipeline* pipeline_convolution_1x1s1d1;
Pipeline* pipeline_convolution_pack4_1x1s1d1;
Pipeline* pipeline_convolution_pack8_1x1s1d1;
};
} // namespace TEngine
#endif
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Parts of the following code in this file refs to
* https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/
* Tencent is pleased to support the open source community by making ncnn
* available.
*
* Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
*
* Licensed under the BSD 3-Clause License (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the
* License at
*
* https://opensource.org/licenses/BSD-3-Clause
*/
/*
* Copyright (c) 2020, Open AI Lab
* Author: ddzhao@openailab.com
*/
#include "convolutiondepthwise_vulkan.hpp"
#include "../layer_shader_type.h"
namespace TEngine {
ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan()
{
support_vulkan = true;
pipeline_convolutiondepthwise = 0;
}
ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan(ir_graph* ir_graph, ir_node* ir_node)
{
support_vulkan = true;
padding = 0;
pipeline_convolutiondepthwise = 0;
pipeline_convolutiondepthwise_pack4 = 0;
pipeline_convolutiondepthwise_pack8 = 0;
graph = ir_graph;
node = ir_node;
struct ir_tensor *input = get_ir_graph_tensor(graph, ir_node->input_tensors[0]);
std::string name = input->name;
bottoms.push_back(name);
struct ir_tensor *output = get_ir_graph_tensor(graph, ir_node->output_tensors[0]);
name = output->name;
tops.push_back(name);
struct conv_param *param = (struct conv_param *)ir_node->op.param_mem;
group = param->group;
input_c = input->dims[1]; // param->input_channel;
input_h = input->dims[2];
input_w = input->dims[3];
pad_w0 = param->pad_w0; // left padding columns
pad_w1 = param->pad_w1; // right padding columns
pad_h0 = param->pad_h0; // top padding rows
pad_h1 = param->pad_h1; // bottom padding rows
stride_w = param->stride_w;
stride_h = param->stride_h;
dilation_w = param->dilation_w;
dilation_h = param->dilation_h;
kernel_w = param->kernel_w;
kernel_h = param->kernel_h;
output_c = output->dims[1]; // param->output_channel;
output_h = output->dims[2];
output_w = output->dims[3];
}
int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
{
Option opt = _opt;
{
padding = new Padding_vulkan();
padding->vkdev = vkdev;
padding->top = pad_h0;
padding->bottom = pad_h1;
padding->left = pad_w0;
padding->right = pad_w1;
padding->type = 0;
padding->value = 0;
padding->input_w = input_w;
padding->input_h = input_h;
padding->input_c = input_c;
padding->output_w = input_w + pad_w0 + pad_w1;
padding->output_h = input_h + pad_h0 + pad_h1;
padding->output_c = input_c;
padding->create_pipeline(opt);
}
// const int maxk = kernel_w * kernel_h;
int channels = input_c; // (weight_data_size / group) / maxk / (num_output / group) * group;
int num_output = output_c;
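// packing rule: use pack8 when the channel count is a multiple of 8 and pack8 shaders
// are enabled, pack4 when a multiple of 4, otherwise pack1; the per-lane element size
// below is 2 bytes with fp16 storage (or fp16 packed and elempack > 1), otherwise 4 bytes (fp32)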
int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
size_t elemsize;
size_t out_elemsize;
if (opt.use_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
}
else if (opt.use_fp16_packed)
{
elemsize = elempack == 1 ? 4u : elempack * 2u;
out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u;
}
else
{
elemsize = elempack * 4u;
out_elemsize = out_elempack * 4u;
}
std::vector<vk_specialization_type> specializations(11 + 10);
specializations[0].i = kernel_w; // kernel_w;
specializations[1].i = kernel_h; // kernel_h
specializations[2].i = dilation_w; // dilation_w;
specializations[3].i = dilation_h; // dilation_h;
specializations[4].i = stride_w; // stride_w;
specializations[5].i = stride_h; // stride_h;
specializations[6].i = node->input_num >2 ? 1 : 0; // bias_term;
specializations[7].i = group;
specializations[8].i = 1;//param->activation; // activation_type;
specializations[9].f = 0;//param->activation; // activation_params.w >= 1 ? activation_params[0] : 0.f;
specializations[10].f = 0;//param->activation; // activation_params.w == 2 ? activation_params[1] : 0.f;
specializations[11 + 0].i = 0; // 3; // shape_bordered_packed.dims;
specializations[11 + 1].i = 0; // input_w + pad_w0 + pad_w1; // shape_bordered_packed.w;
specializations[11 + 2].i = 0; // input_h + pad_h0 + pad_h1; // shape_bordered_packed.h;
specializations[11 + 3].i = 0; // input_c; // shape_bordered_packed.c;
specializations[11 + 4].i = 0; // (input_w + pad_w0 + pad_w1) * (input_h + pad_h0 + pad_h1); // shape_bordered_packed.cstep;
specializations[11 + 5].i = 0; // 3; // out_shape_packed.dims;
specializations[11 + 6].i = 0; // output_w; // out_shape_packed.w;
specializations[11 + 7].i = 0; // output_h; // out_shape_packed.h;
specializations[11 + 8].i = 0; // output_c; // out_shape_packed.c;
specializations[11 + 9].i = 0; // output_w * output_h; // out_shape_packed.cstep;
VkTensor local_size_xyz;
local_size_xyz.w = std::min(4, output_w);
local_size_xyz.h = std::min(4, output_h);
local_size_xyz.c = std::min(4, output_c);
// pack1
if (elempack == 1)
{
pipeline_convolutiondepthwise = new Pipeline(vkdev);
pipeline_convolutiondepthwise->set_optimal_local_size_xyz(local_size_xyz);
pipeline_convolutiondepthwise->create(LayerShaderType::convolutiondepthwise, opt, specializations);
}
// pack4
if (elempack == 4)
{
pipeline_convolutiondepthwise_pack4 = new Pipeline(vkdev);
pipeline_convolutiondepthwise_pack4->set_optimal_local_size_xyz(local_size_xyz);
pipeline_convolutiondepthwise_pack4->create(LayerShaderType::convolutiondepthwise_pack4, opt, specializations);
}
// pack8
if (elempack == 8)
{
pipeline_convolutiondepthwise_pack8 = new Pipeline(vkdev);
pipeline_convolutiondepthwise_pack8->set_optimal_local_size_xyz(local_size_xyz);
pipeline_convolutiondepthwise_pack8->create(LayerShaderType::convolutiondepthwise_pack8, opt, specializations);
}
return 0;
}
int ConvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt)
{
if (padding)
{
padding->destroy_pipeline(opt);
delete padding;
padding = 0;
}
delete pipeline_convolutiondepthwise;
pipeline_convolutiondepthwise = 0;
delete pipeline_convolutiondepthwise_pack4;
pipeline_convolutiondepthwise_pack4 = 0;
delete pipeline_convolutiondepthwise_pack8;
pipeline_convolutiondepthwise_pack8 = 0;
return 0;
}
int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
{
// upload kernel data
const int maxk = kernel_w * kernel_h;
int channels = input_c; // (weight_data_size / group) / maxk / (num_output / group) * group;
int num_output = output_c;
int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
ir_tensor* weight_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]);
Tensor weight_data = Tensor(weight_tensor->elem_num, weight_tensor->data);
Tensor weight_data_packed;
Tensor weight_data_r2 = weight_data.reshape(maxk, group);
TEngine::convert_packing(weight_data_r2, weight_data_packed, elempack);
cmd.record_upload(weight_data_packed, weight_data_gpu, opt);
// upload bias data
if(node->input_num > 2)
{
ir_tensor* bias_tensor = get_ir_graph_tensor(graph, node->input_tensors[2]);
Tensor bias_data = Tensor(bias_tensor->elem_num, bias_tensor->data);
Tensor bias_data_packed;
convert_packing(bias_data, bias_data_packed, out_elempack);
cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
}
return 0;
}
int ConvolutionDepthWise_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;
VkTensor bottom_blob_bordered = bottom_blob;
if (pad_h0 > 0 || pad_h1 > 0 || pad_w0 > 0 || pad_w1 > 0)
{
// bottom_blob_bordered.w = bottom_blob_bordered.w + pad_w0 + pad_w1;
// bottom_blob_bordered.h = bottom_blob_bordered.h + pad_h0 + pad_h1;
// bottom_blob_bordered.cstep = bottom_blob_bordered.w * bottom_blob_bordered.h;
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;
padding->record_pipeline(bottom_blob, bottom_blob_bordered, cmd, opt_pad);
}
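// output channels are stored packed, so the channel dimension is divided by elempack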
top_blob.create(output_w, output_h, output_c/elempack, elemsize, elempack, opt.blob_vkallocator);
std::vector<VkTensor> bindings(4);
bindings[0] = bottom_blob_bordered;
bindings[1] = top_blob;
bindings[2] = weight_data_gpu;
bindings[3] = bias_data_gpu;
std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob_bordered.dims;
constants[1].i = bottom_blob_bordered.w;
constants[2].i = bottom_blob_bordered.h;
constants[3].i = bottom_blob_bordered.c;
constants[4].i = bottom_blob_bordered.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = top_blob.cstep;
// printf("top shape:%d %d %d\n", top_blob.c, top_blob.h, top_blob.w);
const Pipeline* pipeline = elempack == 8 ? pipeline_convolutiondepthwise_pack8
: elempack == 4 ? pipeline_convolutiondepthwise_pack4
: pipeline_convolutiondepthwise;
cmd.record_pipeline(pipeline, bindings, constants, top_blob);
return 0;
}
}
\ No newline at end of file
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Parts of the following code in this file refs to
* https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/
* Tencent is pleased to support the open source community by making ncnn
* available.
*
* Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
*
* Licensed under the BSD 3-Clause License (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the
* License at
*
* https://opensource.org/licenses/BSD-3-Clause
*/
/*
* Copyright (c) 2020, Open AI Lab
* Author: ddzhao@openailab.com
*/
#ifndef LAYER_CONVOLUTIONDEPTHWISE_HPP
#define LAYER_CONVOLUTIONDEPTHWISE_HPP
#include "padding_vulkan.hpp"
#include "../vulkan_layer.hpp"
#include "../vulkan_command.hpp"
#include "convolution_param.h"
namespace TEngine {
class ConvolutionDepthWise_vulkan : public Layer
{
public:
ConvolutionDepthWise_vulkan();
ConvolutionDepthWise_vulkan(ir_graph* ir_graph, ir_node* node);
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
virtual int upload_model(VkTransfer& cmd, const Option& opt);
virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const;
public:
int group;
int input_c;
int input_h;
int input_w;
int pad_w0; // left padding columns
int pad_w1; // right padding columns
int pad_h0; // top padding rows
int pad_h1; // bottom padding rows
int stride_h;
int stride_w;
int dilation_h;
int dilation_w;
int kernel_h;
int kernel_w;
int output_c;
int output_h;
int output_w;
public:
Padding_vulkan* padding;
VkTensor weight_data_gpu;
VkTensor bias_data_gpu;
Pipeline* pipeline_convolutiondepthwise;
Pipeline* pipeline_convolutiondepthwise_pack4;
Pipeline* pipeline_convolutiondepthwise_pack8;
};
} // namespace TEngine
#endif
This diff is collapsed.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Parts of the following code in this file refs to
* https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/
* Tencent is pleased to support the open source community by making ncnn
* available.
*
* Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
*
* Licensed under the BSD 3-Clause License (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the
* License at
*
* https://opensource.org/licenses/BSD-3-Clause
*/
/*
* Copyright (c) 2020, Open AI Lab
* Author: ddzhao@openailab.com
*/
#ifndef LAYER_CROP_HPP
#define LAYER_CROP_HPP
#include "../vulkan_layer.hpp"
#include "../vulkan_command.hpp"
#include "crop_param.h"
namespace TEngine{
class Crop_vulkan : public Layer
{
public:
Crop_vulkan();
Crop_vulkan(ir_graph* ir_graph, ir_node* ir_node);
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
void resolve_crop_roi(const Tensor& bottom_blob, int& _woffset, int& _hoffset, int& _coffset, int& _outw, int& _outh, int& _outc) const;
virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const;
virtual int record_pipeline(const std::vector<VkTensor>& bottom_blobs, std::vector<VkTensor>& top_blobs, VkCompute& cmd, const Option& opt) const;
public:
Pipeline* pipeline_crop;
Pipeline* pipeline_crop_pack4;
Pipeline* pipeline_crop_pack1to4;
Pipeline* pipeline_crop_pack4to1;
Pipeline* pipeline_crop_pack8;
Pipeline* pipeline_crop_pack1to8;
Pipeline* pipeline_crop_pack4to8;
Pipeline* pipeline_crop_pack8to4;
Pipeline* pipeline_crop_pack8to1;
public:
int input_c;
int input_h;
int input_w;
int output_c;
int output_h;
int output_w;
int num_args;
int offset_c;
int offset_h;
int offset_w;
int crop_h;
int crop_w;
int center_crop;
int axis;
int flag;
};
} // namespace TEngine
#endif
\ No newline at end of file
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Parts of the following code in this file refs to
* https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/
* Tencent is pleased to support the open source community by making ncnn
* available.
*
* Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
*
* Licensed under the BSD 3-Clause License (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the
* License at
*
* https://opensource.org/licenses/BSD-3-Clause
*/
/*
* Copyright (c) 2020, Open AI Lab
* Author: ddzhao@openailab.com
*/
#include "dropout_vulkan.hpp"
#include "../layer_shader_type.h"
namespace TEngine {
Dropout_vulkan::Dropout_vulkan()
{
support_vulkan = true;
support_image_storage = false;
pipeline_dropout = 0;
pipeline_dropout_pack4 = 0;
pipeline_dropout_pack8 = 0;
}
Dropout_vulkan::Dropout_vulkan(ir_graph* ir_graph, ir_node* ir_node)
{
support_vulkan = true;
support_image_storage = false;
pipeline_dropout = 0;
pipeline_dropout_pack4 = 0;
pipeline_dropout_pack8 = 0;
graph = ir_graph;
node = ir_node;
struct ir_tensor *input = get_ir_graph_tensor(graph, ir_node->input_tensors[0]);
std::string name = input->name;
bottoms.push_back(name);
struct ir_tensor *output = get_ir_graph_tensor(graph, ir_node->output_tensors[0]);
name = output->name;
tops.push_back(name);
// params
input_c = input->dims[1]; // param->input_channel;
input_h = input->dims[2];
input_w = input->dims[3];
output_c = output->dims[1]; // param->output_channel;
output_h = output->dims[2];
output_w = output->dims[3];
if(input->scale != 0)
scale = input->scale;
else
scale = 1.0f;
}
int Dropout_vulkan::create_pipeline(const Option& opt)
{
const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0];
int elempack = 1;
if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
size_t elemsize;
if (opt.use_fp16_storage)
{
elemsize = elempack * 2u;
}
else if (opt.use_fp16_packed)
{
elemsize = elempack == 1 ? 4u : elempack * 2u;
}
else
{
elemsize = elempack * 4u;
}
Tensor shape_packed;
if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack);
if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
std::vector<vk_specialization_type> specializations(1 + 5);
specializations[0].f = scale;
specializations[1 + 0].i = shape_packed.dims;
specializations[1 + 1].i = shape_packed.w;
specializations[1 + 2].i = shape_packed.h;
specializations[1 + 3].i = shape_packed.c;
specializations[1 + 4].i = shape_packed.cstep;
Tensor local_size_xyz;
if (shape_packed.dims == 1)
{
local_size_xyz.w = std::min(64, shape_packed.w);
local_size_xyz.h = 1;
local_size_xyz.c = 1;
}
if (shape_packed.dims == 2)
{
local_size_xyz.w = std::min(8, shape_packed.w);
local_size_xyz.h = std::min(8, shape_packed.h);
local_size_xyz.c = 1;
}
if (shape_packed.dims == 3)
{
local_size_xyz.w = std::min(4, shape_packed.w);
local_size_xyz.h = std::min(4, shape_packed.h);
local_size_xyz.c = std::min(4, shape_packed.c);
}
// pack1
if (shape.dims == 0 || elempack == 1)
{
pipeline_dropout = new Pipeline(vkdev);
pipeline_dropout->set_optimal_local_size_xyz(local_size_xyz);
pipeline_dropout->create(LayerShaderType::dropout, opt, specializations);
}
// pack4
if (shape.dims == 0 || elempack == 4)
{
pipeline_dropout_pack4 = new Pipeline(vkdev);
pipeline_dropout_pack4->set_optimal_local_size_xyz(local_size_xyz);
pipeline_dropout_pack4->create(LayerShaderType::dropout_pack4, opt, specializations);
}
// pack8
if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8)
{
pipeline_dropout_pack8 = new Pipeline(vkdev);
pipeline_dropout_pack8->set_optimal_local_size_xyz(local_size_xyz);
pipeline_dropout_pack8->create(LayerShaderType::dropout_pack8, opt, specializations);
}
return 0;
}
int Dropout_vulkan::destroy_pipeline(const Option& /*opt*/)
{
delete pipeline_dropout;
pipeline_dropout = 0;
delete pipeline_dropout_pack4;
pipeline_dropout_pack4 = 0;
delete pipeline_dropout_pack8;
pipeline_dropout_pack8 = 0;
return 0;
}
int Dropout_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& /*opt*/) const
{
if (scale == 1.f)
{
return 0;
}
int elempack = bottom_top_blob.elempack;
std::vector<VkTensor> bindings(1);
bindings[0] = bottom_top_blob;
std::vector<vk_constant_type> constants(5);
constants[0].i = bottom_top_blob.dims;
constants[1].i = bottom_top_blob.w;
constants[2].i = bottom_top_blob.h;
constants[3].i = bottom_top_blob.c;
constants[4].i = bottom_top_blob.cstep;
const Pipeline* pipeline = elempack == 8 ? pipeline_dropout_pack8
: elempack == 4 ? pipeline_dropout_pack4
: pipeline_dropout;
cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob);
return 0;
}
} // namespace TEngine
\ No newline at end of file
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Parts of the following code in this file refs to
* https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/
* Tencent is pleased to support the open source community by making ncnn
* available.
*
* Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
*
* Licensed under the BSD 3-Clause License (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the
* License at
*
* https://opensource.org/licenses/BSD-3-Clause
*/
/*
* Copyright (c) 2020, Open AI Lab
* Author: ddzhao@openailab.com
*/
#ifndef LAYER_DROPOUT_HPP
#define LAYER_DROPOUT_HPP
#include "../vulkan_layer.hpp"
#include "../vulkan_command.hpp"
namespace TEngine{
class Dropout_vulkan : public Layer
{
public:
Dropout_vulkan();
Dropout_vulkan(ir_graph* ir_graph, ir_node* ir_node);
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
// virtual int upload_model(VkTransfer& cmd, const Option& opt);
virtual int record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
public:
Pipeline* pipeline_dropout;
Pipeline* pipeline_dropout_pack4;
Pipeline* pipeline_dropout_pack8;
public:
int input_c;
int input_h;
int input_w;
int output_c;
int output_h;
int output_w;
float scale;
};
} // namespace TEngine
#endif
\ No newline at end of file
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Parts of the following code in this file refs to
* https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/
* Tencent is pleased to support the open source community by making ncnn
* available.
*
* Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
*
* Licensed under the BSD 3-Clause License (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the
* License at
*
* https://opensource.org/licenses/BSD-3-Clause
*/
/*
* Copyright (c) 2020, Open AI Lab
* Author: ddzhao@openailab.com
*/
#include "eltwise_vulkan.hpp"
#include "../layer_shader_type.h"
namespace TEngine {
Eltwise_vulkan::Eltwise_vulkan()
{
support_vulkan = true;
support_image_storage = false;
pipeline_eltwise[0] = 0;
pipeline_eltwise[1] = 0;
pipeline_eltwise_pack4[0] = 0;
pipeline_eltwise_pack4[1] = 0;
pipeline_eltwise_pack8[0] = 0;
pipeline_eltwise_pack8[1] = 0;
}
Eltwise_vulkan::Eltwise_vulkan(ir_graph* ir_graph, ir_node* ir_node)
{
support_vulkan = true;
support_image_storage = true;
pipeline_eltwise[0] = 0;
pipeline_eltwise[1] = 0;
pipeline_eltwise_pack4[0] = 0;
pipeline_eltwise_pack4[1] = 0;
pipeline_eltwise_pack8[0] = 0;
pipeline_eltwise_pack8[1] = 0;
graph = ir_graph;
node = ir_node;
for(int i = 0; i < ir_node->input_num; i++)
{
struct ir_tensor *input = get_ir_graph_tensor(graph, ir_node->input_tensors[i]);
std::string name = input->name;
bottoms.push_back(name);
}
for(int i = 0; i < ir_node->output_num; i++)
{
struct ir_tensor *output = get_ir_graph_tensor(graph, ir_node->output_tensors[i]);
std::string name = output->name;
tops.push_back(name);
}
struct eltwise_param *param = (struct eltwise_param *)ir_node->op.param_mem;
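// eltwise_param enumerates each operation as an {op, op_scalar} pair (see EltType in the
// header), so dividing the raw type by 2 yields a single index per operation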
op_type = (param -> type) / 2;
}
int Eltwise_vulkan::create_pipeline(const Option& opt)
{
const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0];
int elempack = 1;
if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
size_t elemsize;
if (opt.use_fp16_storage)
{
elemsize = elempack * 2u;
}
else if (opt.use_fp16_packed)
{
elemsize = elempack == 1 ? 4u : elempack * 2u;
}
else
{
elemsize = elempack * 4u;
}
Tensor shape_packed;
if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack);
if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
std::vector<vk_specialization_type> specializations(2 + 5);
specializations[0].i = op_type;
specializations[1].i = 0; // coeffs.w == 0 ? 0 : 1; TODO fix coeffs value
specializations[2 + 0].i = 0; // shape_packed.dims;
specializations[2 + 1].i = 0; // shape_packed.w;
specializations[2 + 2].i = 0; // shape_packed.h;
specializations[2 + 3].i = 0; // shape_packed.c;
specializations[2 + 4].i = 0; // shape_packed.cstep;
Tensor local_size_xyz;
if (shape_packed.dims == 1)
{
local_size_xyz.w = std::min(64, shape_packed.w);
local_size_xyz.h = 1;
local_size_xyz.c = 1;
}
if (shape_packed.dims == 2)
{
local_size_xyz.w = std::min(8, shape_packed.w);
local_size_xyz.h = std::min(8, shape_packed.h);
local_size_xyz.c = 1;
}
if (shape_packed.dims == 3)
{
local_size_xyz.w = std::min(4, shape_packed.w);
local_size_xyz.h = std::min(4, shape_packed.h);
local_size_xyz.c = std::min(4, shape_packed.c);
}
// pack1
if (shape.dims == 0 || elempack == 1)
{
pipeline_eltwise[0] = new Pipeline(vkdev);
pipeline_eltwise[0]->set_optimal_local_size_xyz(local_size_xyz);
pipeline_eltwise[0]->create(LayerShaderType::eltwise, opt, specializations);
pipeline_eltwise[1] = new Pipeline(vkdev);
pipeline_eltwise[1]->set_optimal_local_size_xyz(local_size_xyz);
pipeline_eltwise[1]->create(LayerShaderType::eltwise, opt, specializations);
}
// pack4
if (shape.dims == 0 || elempack == 4)
{
pipeline_eltwise_pack4[0] = new Pipeline(vkdev);
pipeline_eltwise_pack4[0]->set_optimal_local_size_xyz(local_size_xyz);
pipeline_eltwise_pack4[0]->create(LayerShaderType::eltwise_pack4, opt, specializations);
pipeline_eltwise_pack4[1] = new Pipeline(vkdev);
pipeline_eltwise_pack4[1]->set_optimal_local_size_xyz(local_size_xyz);
pipeline_eltwise_pack4[1]->create(LayerShaderType::eltwise_pack4, opt, specializations);
}
// pack8
if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8)
{
pipeline_eltwise_pack8[0] = new Pipeline(vkdev);
pipeline_eltwise_pack8[0]->set_optimal_local_size_xyz(local_size_xyz);
pipeline_eltwise_pack8[0]->create(LayerShaderType::eltwise_pack8, opt, specializations);
pipeline_eltwise_pack8[1] = new Pipeline(vkdev);
pipeline_eltwise_pack8[1]->set_optimal_local_size_xyz(local_size_xyz);
pipeline_eltwise_pack8[1]->create(LayerShaderType::eltwise_pack8, opt, specializations);
}
return 0;
}
int Eltwise_vulkan::destroy_pipeline(const Option& /*opt*/)
{
delete pipeline_eltwise[0];
delete pipeline_eltwise[1];
pipeline_eltwise[0] = 0;
pipeline_eltwise[1] = 0;
delete pipeline_eltwise_pack4[0];
delete pipeline_eltwise_pack4[1];
pipeline_eltwise_pack4[0] = 0;
pipeline_eltwise_pack4[1] = 0;
delete pipeline_eltwise_pack8[0];
delete pipeline_eltwise_pack8[1];
pipeline_eltwise_pack8[0] = 0;
pipeline_eltwise_pack8[1] = 0;
return 0;
}
int Eltwise_vulkan::record_pipeline(const std::vector<VkTensor>& bottom_blobs, std::vector<VkTensor>& top_blobs, VkCompute& cmd, const Option& opt) const
{
const VkTensor& bottom_blob = bottom_blobs[0];
const VkTensor& bottom_blob1 = bottom_blobs[1];
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;
VkTensor& top_blob = top_blobs[0];
top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;
std::vector<VkTensor> bindings(3);
bindings[0] = bottom_blob;
bindings[1] = bottom_blob1;
bindings[2] = top_blob;
std::vector<vk_constant_type> constants(5 + 2);
constants[0].i = top_blob.dims;
constants[1].i = top_blob.w;
constants[2].i = top_blob.h;
constants[3].i = top_blob.c;
constants[4].i = top_blob.cstep;
constants[5].f = 1.0f; // coeffs.w == 0 ? 1.f : coeffs[0]; TODO fix coeffs value
constants[6].f = 1.0f; // coeffs.w == 0 ? 1.f : coeffs[1];
const Pipeline* pipeline = elempack == 8 ? pipeline_eltwise_pack8[1]
: elempack == 4 ? pipeline_eltwise_pack4[1]
: pipeline_eltwise[1];
cmd.record_pipeline(pipeline, bindings, constants, top_blob);
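// fold any remaining inputs into top_blob in place, alternating between the two
// identically-created pipelines (index b % 2)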
for (size_t b = 2; b < bottom_blobs.size(); b++)
{
std::vector<VkTensor> bindings(3);
bindings[0] = top_blob;
bindings[1] = bottom_blobs[b];
bindings[2] = top_blob; // TODO use separated pipeline ?
std::vector<vk_constant_type> constants(5 + 2);
constants[0].i = top_blob.dims;
constants[1].i = top_blob.w;
constants[2].i = top_blob.h;
constants[3].i = top_blob.c;
constants[4].i = top_blob.cstep;
constants[5].f = 1.f;
constants[6].f = 1.0f; // coeffs.w == 0 ? 1 : coeffs[b]; TODO fixcoeffs value
const Pipeline* pipeline = elempack == 8 ? pipeline_eltwise_pack8[b % 2]
: elempack == 4 ? pipeline_eltwise_pack4[b % 2]
: pipeline_eltwise[b % 2];
cmd.record_pipeline(pipeline, bindings, constants, top_blob);
}
return 0;
}
} // namespace TEngine
\ No newline at end of file
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Parts of the following code in this file refer to
* https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/
* Tencent is pleased to support the open source community by making ncnn
* available.
*
* Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
*
* Licensed under the BSD 3-Clause License (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the
* License at
*
* https://opensource.org/licenses/BSD-3-Clause
*/
/*
* Copyright (c) 2020, Open AI Lab
* Author: ddzhao@openailab.com
*/
#ifndef LAYER_ELTWISE_HPP
#define LAYER_ELTWISE_HPP
#include "../vulkan_layer.hpp"
#include "../vulkan_command.hpp"
#include "eltwise_param.h"
namespace TEngine {
class Eltwise_vulkan : public Layer
{
public:
Eltwise_vulkan();
Eltwise_vulkan(ir_graph* ir_graph, ir_node* ir_node);
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
virtual int record_pipeline(const std::vector<VkTensor>& bottom_blobs, std::vector<VkTensor>& top_blobs, VkCompute& cmd, const Option& opt) const;
public:
Pipeline* pipeline_eltwise[2];
Pipeline* pipeline_eltwise_pack4[2];
Pipeline* pipeline_eltwise_pack8[2];
public:
enum EltType
{
ELT_PROD,
ELT_PROD_SCALAR,
ELT_SUM,
ELT_SUM_SCALAR,
ELT_SUB,
ELT_SUB_SCALAR,
ELT_MAX,
ELT_RSQRT,
ELT_MIN_SCALAR,
ELT_LAST,
ELT_DIV,
ELT_LOG,
ELT_EXP,
ELT_SQRT,
ELT_FLOOR,
ELT_SQUARE,
ELT_POW
};
int op_type; // Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2
int input_c;
int input_h;
int input_w;
int output_c;
int output_h;
int output_w;
};
} // namespace TEngine
#endif
\ No newline at end of file
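A rough usage sketch of the Eltwise_vulkan layer declared above: the constructor binds the bottom/top blob names from the ir_node, create_pipeline() builds the pack1/pack4/pack8 variants, and record_pipeline() records one dispatch per additional input. The names vkdev, opt, cmd and the uploaded blobs a_gpu/b_gpu are assumed to come from the surrounding Vulkan backend setup, and attaching vkdev directly as shown is an assumption rather than something this diff specifies.
// Hypothetical driver code; vkdev, opt, cmd and a_gpu/b_gpu are assumptions.
Eltwise_vulkan layer(ir_graph, ir_node);        // records bottom/top tensor names from the node
layer.vkdev = vkdev;                            // assumed: device handle supplied by the backend
layer.create_pipeline(opt);                     // builds only the pipelines the packing requires
std::vector<VkTensor> bottoms = { a_gpu, b_gpu };   // blobs already uploaded to GPU memory
std::vector<VkTensor> tops(1);
layer.record_pipeline(bottoms, tops, cmd, opt);     // output allocated from opt.blob_vkallocator
layer.destroy_pipeline(opt);                    // releases the pipelines created above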
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Parts of the following code in this file refer to
* https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/
* Tencent is pleased to support the open source community by making ncnn
* available.
*
* Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
*
* Licensed under the BSD 3-Clause License (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the
* License at
*
* https://opensource.org/licenses/BSD-3-Clause
*/
/*
* Copyright (c) 2020, Open AI Lab
* Author: ddzhao@openailab.com
*/
#include "flatten_vulkan.hpp"
#include "../layer_shader_type.h"
namespace TEngine {
Flatten_vulkan::Flatten_vulkan()
{
support_vulkan = true;
support_image_storage = false;
pipeline_flatten = 0;
pipeline_flatten_pack4 = 0;
pipeline_flatten_pack1to4 = 0;
pipeline_flatten_pack8 = 0;
pipeline_flatten_pack1to8 = 0;
pipeline_flatten_pack4to8 = 0;
}
Flatten_vulkan::Flatten_vulkan(ir_graph* ir_graph, ir_node* ir_node)
{
support_vulkan = true;
support_image_storage = true;
pipeline_flatten = 0;
pipeline_flatten_pack4 = 0;
pipeline_flatten_pack1to4 = 0;
pipeline_flatten_pack8 = 0;
pipeline_flatten_pack1to8 = 0;
pipeline_flatten_pack4to8 = 0;
graph = ir_graph;
node = ir_node;
struct ir_tensor *input = get_ir_graph_tensor(graph, ir_node->input_tensors[0]);
std::string name = input->name;
bottoms.push_back(name);
struct ir_tensor *output = get_ir_graph_tensor(graph, ir_node->output_tensors[0]);
name = output->name;
tops.push_back(name);
// params
input_c = input->dims[1]; // param->input_channel;
input_h = input->dims[2];
input_w = input->dims[3];
output_c = output->dims[1]; // param->output_channel;
output_h = output->dims[2];
output_w = output->dims[3];
output_size = output->dims[3]*output->dims[2]*output->dims[1];
}
int Flatten_vulkan::create_pipeline(const Option& _opt)
{
Option opt = _opt;
const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Mat() : bottom_shapes[0];
// const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0];
const Tensor& out_shape = Tensor(output_size, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0];
int elempack = 1;
if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
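// Choose the packing from whichever axis gets packed (w for 1-D, h for 2-D, c for 3-D),
// preferring 8-wide when the pack8 shaders are enabled.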
int out_elempack = 1;
if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1;
size_t elemsize;
size_t out_elemsize;
if (opt.use_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
}
else if (opt.use_fp16_packed)
{
elemsize = elempack == 1 ? 4u : elempack * 2u;
out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u;
}
else
{
elemsize = elempack * 4u;
out_elemsize = out_elempack * 4u;
}
Tensor shape_packed;
if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack);
if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
Tensor out_shape_packed;
if (out_shape.dims == 1) out_shape_packed = Tensor(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack);
// if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed))
{
support_image_storage = false;
opt.use_image_storage = false;
}
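// The shape_support_image_storage() check above is commented out, so this block
// always runs and buffer storage is used unconditionally.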
std::vector<vk_specialization_type> specializations(0 + 10);
specializations[0 + 0].i = 0; // shape_packed.dims;
specializations[0 + 1].i = 0; // shape_packed.w;
specializations[0 + 2].i = 0; // shape_packed.h;
specializations[0 + 3].i = 0; // shape_packed.c;
specializations[0 + 4].i = 0; // shape_packed.cstep;
specializations[0 + 5].i = 0; // out_shape_packed.dims;
specializations[0 + 6].i = 0; // out_shape_packed.w;
specializations[0 + 7].i = 0; // out_shape_packed.h;
specializations[0 + 8].i = 0; // out_shape_packed.c;
specializations[0 + 9].i = 0; // out_shape_packed.cstep;
Tensor local_size_xyz(64, 1, 1, (void*)0);
if (out_shape_packed.dims != 0)
{
local_size_xyz.w = std::min(64, out_shape_packed.w);
local_size_xyz.h = 1;
local_size_xyz.c = 1;
}
// pack1
if (shape.dims == 0 || (elempack == 1 && out_elempack == 1))
{
pipeline_flatten = new Pipeline(vkdev);
pipeline_flatten->set_optimal_local_size_xyz(local_size_xyz);
pipeline_flatten->create(LayerShaderType::flatten, opt, specializations);
}
// pack4
if (shape.dims == 0 || (elempack == 4 && out_elempack == 4))
{
pipeline_flatten_pack4 = new Pipeline(vkdev);
pipeline_flatten_pack4->set_optimal_local_size_xyz(local_size_xyz);
pipeline_flatten_pack4->create(LayerShaderType::flatten_pack4, opt, specializations);
}
// pack1to4
if (shape.dims == 0 || (elempack == 1 && out_elempack == 4))
{
pipeline_flatten_pack1to4 = new Pipeline(vkdev);
pipeline_flatten_pack1to4->set_optimal_local_size_xyz(local_size_xyz);
pipeline_flatten_pack1to4->create(LayerShaderType::flatten_pack1to4, opt, specializations);
}
// pack8
if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 8 && out_elempack == 8))
{
pipeline_flatten_pack8 = new Pipeline(vkdev);
pipeline_flatten_pack8->set_optimal_local_size_xyz(local_size_xyz);
pipeline_flatten_pack8->create(LayerShaderType::flatten_pack8, opt, specializations);
}
// pack1to8
if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 1 && out_elempack == 8))
{
pipeline_flatten_pack1to8 = new Pipeline(vkdev);
pipeline_flatten_pack1to8->set_optimal_local_size_xyz(local_size_xyz);
pipeline_flatten_pack1to8->create(LayerShaderType::flatten_pack1to8, opt, specializations);
}
// pack4to8
if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 4 && out_elempack == 8))
{
pipeline_flatten_pack4to8 = new Pipeline(vkdev);
pipeline_flatten_pack4to8->set_optimal_local_size_xyz(local_size_xyz);
pipeline_flatten_pack4to8->create(LayerShaderType::flatten_pack4to8, opt, specializations);
}
return 0;
}
int Flatten_vulkan::destroy_pipeline(const Option& /*opt*/)
{
delete pipeline_flatten;
pipeline_flatten = 0;
delete pipeline_flatten_pack4;
pipeline_flatten_pack4 = 0;
delete pipeline_flatten_pack1to4;
pipeline_flatten_pack1to4 = 0;
delete pipeline_flatten_pack8;
pipeline_flatten_pack8 = 0;
delete pipeline_flatten_pack1to8;
pipeline_flatten_pack1to8 = 0;
delete pipeline_flatten_pack4to8;
pipeline_flatten_pack4to8 = 0;
return 0;
}
int Flatten_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const
{
int dims = bottom_blob.dims;
if (dims == 1)
{
top_blob = bottom_blob;
return 0;
}
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;
int total = w * h * channels * elempack;
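// Repack the flattened length into the widest layout it divides evenly (8, 4 or 1);
// out_elemsize is rescaled below to match the new packing.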
int out_elempack = opt.use_shader_pack8 && total % 8 == 0 ? 8 : total % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / elempack * out_elempack;
if (opt.use_fp16_packed && !opt.use_fp16_storage)
{
if (out_elempack == 8) out_elemsize = 8 * 2u;
if (out_elempack == 4) out_elemsize = 4 * 2u;
if (out_elempack == 1) out_elemsize = 4u;
}
if (dims == 2 && elempack == 1 && !(opt.use_fp16_packed && !opt.use_fp16_storage && out_elempack != 1))
{
top_blob = bottom_blob;
top_blob.dims = 1;
top_blob.w = total / out_elempack;
top_blob.h = 1;
top_blob.cstep = top_blob.w;
top_blob.elemsize = out_elemsize;
top_blob.elempack = out_elempack;
return 0;
}
top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;
std::vector<VkTensor> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob;
std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = top_blob.cstep;
const Pipeline* pipeline = 0;
if (elempack == 1 && out_elempack == 1)
{
pipeline = pipeline_flatten;
}
else if (elempack == 4 && out_elempack == 4)
{
pipeline = pipeline_flatten_pack4;
}
else if (elempack == 1 && out_elempack == 4)
{
pipeline = pipeline_flatten_pack1to4;
}
else if (elempack == 8 /*&& out_elempack == 8*/)
{
pipeline = pipeline_flatten_pack8;
}
else if (elempack == 1 && out_elempack == 8)
{
pipeline = pipeline_flatten_pack1to8;
}
else if (elempack == 4 && out_elempack == 8)
{
pipeline = pipeline_flatten_pack4to8;
}
cmd.record_pipeline(pipeline, bindings, constants, top_blob);
return 0;
}
} // namespace TEngine
\ No newline at end of file
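The repacking arithmetic in record_pipeline() above can be sanity-checked with a small standalone sketch; the blob dimensions below are made up for illustration and simply replay the out_elempack / out_elemsize selection:
// Standalone sketch of the flatten repacking rule; the sample dimensions are arbitrary.
#include <cstdio>
#include <cstddef>

int main()
{
    // Example packed blob: 7 x 7 x 128 (already pack4), fp32 storage, elemsize = 4 * 4u.
    int w = 7, h = 7, channels = 128, elempack = 4;
    size_t elemsize = elempack * 4u;
    bool use_shader_pack8 = true;

    int total = w * h * channels * elempack;                      // 25088 scalars in all
    int out_elempack = use_shader_pack8 && total % 8 == 0 ? 8
                       : total % 4 == 0 ? 4 : 1;                  // -> 8
    size_t out_elemsize = elemsize / elempack * out_elempack;     // -> 32 bytes per packed element

    printf("flattened w = %d, out_elempack = %d, out_elemsize = %zu\n",
           total / out_elempack, out_elempack, out_elemsize);     // flattened w = 3136
    return 0;
}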
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Parts of the following code in this file refer to
* https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/
* Tencent is pleased to support the open source community by making ncnn
* available.
*
* Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
*
* Licensed under the BSD 3-Clause License (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the
* License at
*
* https://opensource.org/licenses/BSD-3-Clause
*/
/*
* Copyright (c) 2020, Open AI Lab
* Author: ddzhao@openailab.com
*/
#ifndef LAYER_FLATTEN_HPP
#define LAYER_FLATTEN_HPP
#include "../vulkan_layer.hpp"
#include "../vulkan_command.hpp"
#include "flatten_param.h"
namespace TEngine {
class Flatten_vulkan : public Layer
{
public:
Flatten_vulkan();
Flatten_vulkan(ir_graph* ir_graph, ir_node* ir_node);
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const;
public:
Pipeline* pipeline_flatten;
Pipeline* pipeline_flatten_pack4;
Pipeline* pipeline_flatten_pack1to4;
Pipeline* pipeline_flatten_pack8;
Pipeline* pipeline_flatten_pack1to8;
Pipeline* pipeline_flatten_pack4to8;
public:
int input_c;
int input_h;
int input_w;
int output_c;
int output_h;
int output_w;
int output_size;
};
} // namespace TEngine
#endif
\ No newline at end of file
(7 collapsed file diffs not shown.)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Parts of the following code in this file refer to
* https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/
* Tencent is pleased to support the open source community by making ncnn
* available.
*
* Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
*
* Licensed under the BSD 3-Clause License (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the
* License at
*
* https://opensource.org/licenses/BSD-3-Clause
*/
/*
* Copyright (c) 2020, Open AI Lab
* Author: ddzhao@openailab.com
*/
#ifndef LAYER_PADDING_HPP
#define LAYER_PADDING_HPP
#include "../vulkan_layer.hpp"
#include "../vulkan_command.hpp"
namespace TEngine {
class Padding_vulkan : public Layer
{
public:
Padding_vulkan();
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const;
public:
int top;
int bottom;
int left;
int right;
int type; // 0=CONSTANT 1=REPLICATE 2=REFLECT
float value;
int input_w;
int input_h;
int input_c;
int output_w;
int output_h;
int output_c;
public:
Pipeline* pipeline_padding;
Pipeline* pipeline_padding_pack4;
Pipeline* pipeline_padding_pack8;
};
} // namespace TEngine
#endif
(157 collapsed file diffs not shown.)