diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl index 66b2778ef613bfa43de5fe75080e68283cf845a6..094a4b4c4c58788822d1bab4a711a4f2123eb316 100644 --- a/mace/kernels/opencl/cl/space_to_batch.cl +++ b/mace/kernels/opencl/cl/space_to_batch.cl @@ -11,27 +11,44 @@ __kernel void space_to_batch(__read_only image2d_t space_data, __private const int batch_height, __private const int batch_width) { const int chan_idx = get_global_id(0); - const int batch_w_idx = get_global_id(1); + const int batch_w_idx = mul24(get_global_id(1), 4); const int batch_hb_idx = get_global_id(2); const int batch_b_idx = batch_hb_idx / batch_height; const int batch_h_idx = batch_hb_idx % batch_height; - const int block_size = block_height * block_width; + const int block_size = mul24(block_height, block_width); const int space_b_idx = batch_b_idx / block_size; const int remaining_batch_idx = batch_b_idx % block_size; const int space_h_idx = (remaining_batch_idx / block_width) + - batch_h_idx * block_height - padding_height; - const int space_w_idx = (remaining_batch_idx % block_width) + - batch_w_idx * block_width - padding_width; + mul24(batch_h_idx, block_height) - padding_height; + int space_w_idx = (remaining_batch_idx % block_width) + + mul24(batch_w_idx, block_width) - padding_width; + + int2 space_coord = (int2)(mul24(chan_idx, space_width) + space_w_idx, + mul24(space_b_idx, space_height) + space_h_idx); + DATA_TYPE4 value = READ_IMAGET(space_data, SAMPLER, space_coord); + + int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx); + WRITE_IMAGET(batch_data, batch_coord, value); + + space_coord.x += block_width; + value = READ_IMAGET(space_data, SAMPLER, space_coord); + + batch_coord.x += 1; + WRITE_IMAGET(batch_data, batch_coord, value); + + space_coord.x += block_width; + value = READ_IMAGET(space_data, SAMPLER, space_coord); + + batch_coord.x += 1; + WRITE_IMAGET(batch_data, batch_coord, value); - DATA_TYPE4 value = READ_IMAGET(space_data, SAMPLER, - (int2)((chan_idx * space_width) + space_w_idx, - (space_b_idx * space_height) + space_h_idx)); + space_coord.x += block_width; + value = READ_IMAGET(space_data, SAMPLER, space_coord); - WRITE_IMAGET(batch_data, - (int2)((chan_idx * batch_width) + batch_w_idx, batch_hb_idx), - value); + batch_coord.x += 1; + WRITE_IMAGET(batch_data, batch_coord, value); } __kernel void batch_to_space(__read_only image2d_t batch_data, @@ -45,25 +62,42 @@ __kernel void batch_to_space(__read_only image2d_t batch_data, __private const int batch_height, __private const int batch_width) { const int chan_idx = get_global_id(0); - const int batch_w_idx = get_global_id(1); + const int batch_w_idx = mul24(get_global_id(1), 4); const int batch_hb_idx = get_global_id(2); const int batch_b_idx = batch_hb_idx / batch_height; const int batch_h_idx = batch_hb_idx % batch_height; - const int block_size = block_height * block_width; + const int block_size = mul24(block_height, block_width); const int space_b_idx = batch_b_idx / block_size; const int remaining_batch_idx = batch_b_idx % block_size; const int space_h_idx = (remaining_batch_idx / block_width) + - batch_h_idx * block_height - padding_height; + mul24(batch_h_idx, block_height) - padding_height; const int space_w_idx = (remaining_batch_idx % block_width) + - batch_w_idx * block_width - padding_width; + mul24(batch_w_idx, block_width) - padding_width; + + int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx); + DATA_TYPE4 value = READ_IMAGET(batch_data, SAMPLER, batch_coord); + + int2 space_coord = (int2)(mul24(chan_idx, space_width) + space_w_idx, + mul24(space_b_idx, space_height) + space_h_idx); + WRITE_IMAGET(space_data, space_coord, value); + + batch_coord.x += 1; + value = READ_IMAGET(batch_data, SAMPLER, batch_coord); + + space_coord.x += block_width; + WRITE_IMAGET(space_data, space_coord, value); + + batch_coord.x += 1; + value = READ_IMAGET(batch_data, SAMPLER, batch_coord); + + space_coord.x += block_width; + WRITE_IMAGET(space_data, space_coord, value); - DATA_TYPE4 value = READ_IMAGET(batch_data, SAMPLER, - (int2)((chan_idx * batch_width) + batch_w_idx, batch_hb_idx)); + batch_coord.x += 1; + value = READ_IMAGET(batch_data, SAMPLER, batch_coord); - WRITE_IMAGET(space_data, - (int2)((chan_idx * space_width) + space_w_idx, - (space_b_idx * space_height) + space_h_idx), - value); + space_coord.x += block_width; + WRITE_IMAGET(space_data, space_coord, value); } diff --git a/mace/kernels/opencl/space_to_batch_opecl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc similarity index 98% rename from mace/kernels/opencl/space_to_batch_opecl.cc rename to mace/kernels/opencl/space_to_batch_opencl.cc index 4a8fb2b8810bb620e768d5d8364b0f6b2206a2d6..e0394a470022e7e3d0fae1f9a08eda5e4b80da5c 100644 --- a/mace/kernels/opencl/space_to_batch_opecl.cc +++ b/mace/kernels/opencl/space_to_batch_opencl.cc @@ -54,6 +54,7 @@ void SpaceToBatchFunctor::operator()(Tensor *space_tensor s2b_kernel.setArg(idx++, static_cast(batch_tensor->dim(2))); const uint32_t chan_blk = RoundUpDiv4(batch_tensor->dim(3)); +// const uint32_t width_blk = RoundUpDiv4(batch_tensor->dim(2)); const uint32_t gws[3] = {chan_blk, static_cast(batch_tensor->dim(2)), static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; diff --git a/tools/gcn.config b/tools/gcn.config new file mode 100644 index 0000000000000000000000000000000000000000..304d7a2931ee288619cb08d99193828d2cd2cc9a --- /dev/null +++ b/tools/gcn.config @@ -0,0 +1,2 @@ +TF_INPUT_NODE=input +TF_OUTPUT_NODE=GCN/br_result_2/fcn_br \ No newline at end of file diff --git a/tools/side_gcn.config b/tools/side_gcn.config new file mode 100644 index 0000000000000000000000000000000000000000..d22d730bac70cce3f5c665b5c83c56334f1de319 --- /dev/null +++ b/tools/side_gcn.config @@ -0,0 +1,2 @@ +TF_INPUT_NODE=input_node +TF_OUTPUT_NODE=GCN/br_result_x/fcn_br \ No newline at end of file diff --git a/tools/validate_gcn.sh b/tools/validate_gcn.sh index a0f8e580ba174664e53dada6f9f0b83a3466c6d6..1359a356bc84b89b6c711d2ab1e2108e4ddb99d3 100644 --- a/tools/validate_gcn.sh +++ b/tools/validate_gcn.sh @@ -2,7 +2,7 @@ # Must run at root dir of mace project. set +x Usage() { - echo 'Usage: bash tools/validate_gcn.sh tf_model_path image_size [tuning]' + echo 'Usage: bash tools/validate_gcn.sh tools/gcn.config tf_model_path image_size [tuning]' } if [ $# -lt 2 ];then @@ -10,8 +10,10 @@ if [ $# -lt 2 ];then exit -1 fi +source $1 + VLOG_LEVEL=0 -TF_MODEL_FILE_PATH=$1 +TF_MODEL_FILE_PATH=$2 MODEL_DIR=$(dirname ${TF_MODEL_FILE_PATH}) MACE_SOURCE_DIR=`/bin/pwd` MACE_MODEL_NAME='mace_model.pb' @@ -20,14 +22,14 @@ OUTPUT_FILE_NAME='gcn.out' OUTPUT_LIST_FILE='gcn.list' PHONE_DATA_DIR="/data/local/tmp/${MACE_MODEL_NAME}" KERNEL_DIR="${PHONE_DATA_DIR}/cl/" -IMAGE_SIZE=$2 +IMAGE_SIZE=$3 MODEL_TAG=GCN${IMAGE_SIZE} CODEGEN_DIR=${MACE_SOURCE_DIR}/mace/codegen MODEL_CODEGEN_DIR=${CODEGEN_DIR}/models/gcn-$IMAGE_SIZE CL_CODEGEN_DIR=${CODEGEN_DIR}/opencl CL_BIN_DIR=${CODEGEN_DIR}/opencl_bin TUNING_CODEGEN_DIR=${CODEGEN_DIR}/tuning -TUNING_OR_NOT=${3:-0} +TUNING_OR_NOT=${4:-0} VERSION_SOURCE_PATH=${CODEGEN_DIR}/version build_and_run() @@ -87,8 +89,8 @@ rm -rf ${MODEL_CODEGEN_DIR} mkdir -p ${MODEL_CODEGEN_DIR} bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \ --output=${MODEL_CODEGEN_DIR}/mace_gcn${IMAGE_SIZE}.cc \ - --input_node=input \ - --output_node=GCN/br_result_2/fcn_br \ + --input_node=${TF_INPUT_NODE} \ + --output_node=${TF_OUTPUT_NODE} \ --data_type=DT_HALF \ --runtime=gpu \ --output_type=source \ @@ -129,7 +131,7 @@ echo "Step 9: Validate the result" python tools/validate.py --model_file ${TF_MODEL_FILE_PATH} \ --input_file ${MODEL_DIR}/${INPUT_FILE_NAME} \ --mace_out_file ${MODEL_DIR}/${OUTPUT_FILE_NAME} \ - --input_node input \ - --output_node GCN/br_result_2/fcn_br\ + --input_node ${TF_INPUT_NODE} \ + --output_node ${TF_OUTPUT_NODE} \ --input_shape "${IMAGE_SIZE},${IMAGE_SIZE},3" \ --output_shape "1,${IMAGE_SIZE},${IMAGE_SIZE},2" diff --git a/tools/validate_side_gcn.sh b/tools/validate_side_gcn.sh deleted file mode 100644 index ef27f54d47e7885b4619d124ebd47b8b5c536058..0000000000000000000000000000000000000000 --- a/tools/validate_side_gcn.sh +++ /dev/null @@ -1,133 +0,0 @@ -#!/bin/bash -# Must run at root dir of mace project. -set +x -Usage() { - echo 'Usage: bash tools/validate_gcn.sh tf_model_path image_size [tuning]' -} - -if [ $# -lt 2 ];then - Usage - exit -1 -fi - -TF_MODEL_FILE_PATH=$1 -MODEL_DIR=$(dirname ${TF_MODEL_FILE_PATH}) -MACE_SOURCE_DIR=`/bin/pwd` -MACE_MODEL_NAME='mace_model.pb' -INPUT_FILE_NAME='model_input' -OUTPUT_FILE_NAME='gcn.out' -OUTPUT_LIST_FILE='gcn.list' -PHONE_DATA_DIR="/data/local/tmp/${MACE_MODEL_NAME}" -KERNEL_DIR="${PHONE_DATA_DIR}/cl/" -IMAGE_SIZE=$2 -MODEL_TAG=GCN${IMAGE_SIZE} -CODEGEN_DIR=${MACE_SOURCE_DIR}/mace/codegen -MODEL_CODEGEN_DIR=${CODEGEN_DIR}/models/gcn-$IMAGE_SIZE -CL_CODEGEN_DIR=${CODEGEN_DIR}/opencl -CL_BIN_DIR=${CODEGEN_DIR}/opencl_bin -TUNING_CODEGEN_DIR=${CODEGEN_DIR}/tuning -TUNING_OR_NOT=${3:-0} -VERSION_SOURCE_PATH=${CODEGEN_DIR}/version - -build_and_run() -{ - EMBED_OPENCL_BINARY=$1 - if [ "$EMBED_OPENCL_BINARY" = true ]; then - EMBED_OPENCL_BINARY_BUILD_FLAGS="--define embed_binary_program=true" - fi - - bazel build -c opt --strip always mace/examples:mace_run \ - --crosstool_top=//external:android/crosstool \ - --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ - --cpu=arm64-v8a \ - $EMBED_OPENCL_BINARY_BUILD_FLAGS \ - --copt=-DMACE_MODEL_FUNCTION=Create${MODEL_TAG} - - adb shell "mkdir -p ${PHONE_DATA_DIR}" - if [ "$EMBED_OPENCL_BINARY" = false ]; then - adb shell "mkdir -p ${KERNEL_DIR}" - adb push mace/kernels/opencl/cl/. ${KERNEL_DIR} - fi - adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR} - adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR} - - if [[ "${TUNING_OR_NOT}" != "0" && "$EMBED_OPENCL_BINARY" != true ]];then - tuning_flag=1 - round=0 # only warm up - else - tuning_flag=0 - round=2 - fi - - adb