diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl
index 66b2778ef613bfa43de5fe75080e68283cf845a6..094a4b4c4c58788822d1bab4a711a4f2123eb316 100644
--- a/mace/kernels/opencl/cl/space_to_batch.cl
+++ b/mace/kernels/opencl/cl/space_to_batch.cl
@@ -11,27 +11,44 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
                              __private const int batch_height,
                              __private const int batch_width) {
   const int chan_idx = get_global_id(0);
-  const int batch_w_idx = get_global_id(1);
+  const int batch_w_idx = mul24(get_global_id(1), 4);
   const int batch_hb_idx = get_global_id(2);
 
   const int batch_b_idx = batch_hb_idx / batch_height;
   const int batch_h_idx = batch_hb_idx % batch_height;
 
-  const int block_size = block_height * block_width;
+  const int block_size = mul24(block_height, block_width);
   const int space_b_idx = batch_b_idx / block_size;
   const int remaining_batch_idx = batch_b_idx % block_size;
   const int space_h_idx = (remaining_batch_idx / block_width) +
-      batch_h_idx * block_height - padding_height;
-  const int space_w_idx = (remaining_batch_idx % block_width) +
-      batch_w_idx * block_width - padding_width;
+      mul24(batch_h_idx, block_height) - padding_height;
+  int space_w_idx = (remaining_batch_idx % block_width) +
+      mul24(batch_w_idx, block_width) - padding_width;
+
+  int2 space_coord = (int2)(mul24(chan_idx, space_width) + space_w_idx,
+                            mul24(space_b_idx, space_height) + space_h_idx);
+  DATA_TYPE4 value = READ_IMAGET(space_data, SAMPLER, space_coord);
+
+  int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx);
+  WRITE_IMAGET(batch_data, batch_coord, value);
+
+  space_coord.x += block_width;
+  value = READ_IMAGET(space_data, SAMPLER, space_coord);
+
+  batch_coord.x += 1;
+  WRITE_IMAGET(batch_data, batch_coord, value);
+
+  space_coord.x += block_width;
+  value = READ_IMAGET(space_data, SAMPLER, space_coord);
+
+  batch_coord.x += 1;
+  WRITE_IMAGET(batch_data, batch_coord, value);
 
-  DATA_TYPE4 value = READ_IMAGET(space_data, SAMPLER,
-                                 (int2)((chan_idx * space_width) + space_w_idx,
-                                 (space_b_idx * space_height) + space_h_idx));
+  space_coord.x += block_width;
+  value = READ_IMAGET(space_data, SAMPLER, space_coord);
 
-  WRITE_IMAGET(batch_data,
-               (int2)((chan_idx * batch_width) + batch_w_idx, batch_hb_idx),
-               value);
+  batch_coord.x += 1;
+  WRITE_IMAGET(batch_data, batch_coord, value);
 }
 
 __kernel void batch_to_space(__read_only image2d_t batch_data,
@@ -45,25 +62,42 @@ __kernel void batch_to_space(__read_only image2d_t batch_data,
                              __private const int batch_height,
                              __private const int batch_width) {
   const int chan_idx = get_global_id(0);
-  const int batch_w_idx = get_global_id(1);
+  const int batch_w_idx = mul24(get_global_id(1), 4);
   const int batch_hb_idx = get_global_id(2);
 
   const int batch_b_idx = batch_hb_idx / batch_height;
   const int batch_h_idx = batch_hb_idx % batch_height;
 
-  const int block_size = block_height * block_width;
+  const int block_size = mul24(block_height, block_width);
   const int space_b_idx = batch_b_idx / block_size;
   const int remaining_batch_idx = batch_b_idx % block_size;
   const int space_h_idx = (remaining_batch_idx / block_width) +
-      batch_h_idx * block_height - padding_height;
+      mul24(batch_h_idx, block_height) - padding_height;
   const int space_w_idx = (remaining_batch_idx % block_width) +
-      batch_w_idx * block_width - padding_width;
+      mul24(batch_w_idx, block_width) - padding_width;
+
+  int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx);
+  DATA_TYPE4 value = READ_IMAGET(batch_data, SAMPLER, batch_coord);
+
+  int2 space_coord = (int2)(mul24(chan_idx, space_width) + space_w_idx,
+                            mul24(space_b_idx, space_height) + space_h_idx);
+  WRITE_IMAGET(space_data, space_coord, value);
+
+  batch_coord.x += 1;
+  value = READ_IMAGET(batch_data, SAMPLER, batch_coord);
+
+  space_coord.x += block_width;
+  WRITE_IMAGET(space_data, space_coord, value);
+
+  batch_coord.x += 1;
+  value = READ_IMAGET(batch_data, SAMPLER, batch_coord);
+
+  space_coord.x += block_width;
+  WRITE_IMAGET(space_data, space_coord, value);
 
-  DATA_TYPE4 value = READ_IMAGET(batch_data, SAMPLER,
-                                 (int2)((chan_idx * batch_width) + batch_w_idx, batch_hb_idx));
+  batch_coord.x += 1;
+  value = READ_IMAGET(batch_data, SAMPLER, batch_coord);
 
-  WRITE_IMAGET(space_data,
-               (int2)((chan_idx * space_width) + space_w_idx,
-                      (space_b_idx * space_height) + space_h_idx),
-               value);
+  space_coord.x += block_width;
+  WRITE_IMAGET(space_data, space_coord, value);
 }
diff --git a/mace/kernels/opencl/space_to_batch_opecl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc
similarity index 98%
rename from mace/kernels/opencl/space_to_batch_opecl.cc
rename to mace/kernels/opencl/space_to_batch_opencl.cc
index 4a8fb2b8810bb620e768d5d8364b0f6b2206a2d6..e0394a470022e7e3d0fae1f9a08eda5e4b80da5c 100644
--- a/mace/kernels/opencl/space_to_batch_opecl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -54,6 +54,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
   s2b_kernel.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
 
   const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
+//  const uint32_t width_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(2));
   const uint32_t gws[3] = {chan_blk,
                            static_cast<uint32_t>(batch_tensor->dim(2)),
                            static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
diff --git a/tools/gcn.config b/tools/gcn.config
new file mode 100644
index 0000000000000000000000000000000000000000..304d7a2931ee288619cb08d99193828d2cd2cc9a
--- /dev/null
+++ b/tools/gcn.config
@@ -0,0 +1,2 @@
+TF_INPUT_NODE=input
+TF_OUTPUT_NODE=GCN/br_result_2/fcn_br
\ No newline at end of file
diff --git a/tools/side_gcn.config b/tools/side_gcn.config
new file mode 100644
index 0000000000000000000000000000000000000000..d22d730bac70cce3f5c665b5c83c56334f1de319
--- /dev/null
+++ b/tools/side_gcn.config
@@ -0,0 +1,2 @@
+TF_INPUT_NODE=input_node
+TF_OUTPUT_NODE=GCN/br_result_x/fcn_br
\ No newline at end of file
diff --git a/tools/validate_gcn.sh b/tools/validate_gcn.sh
index a0f8e580ba174664e53dada6f9f0b83a3466c6d6..1359a356bc84b89b6c711d2ab1e2108e4ddb99d3 100644
--- a/tools/validate_gcn.sh
+++ b/tools/validate_gcn.sh
@@ -2,7 +2,7 @@
 # Must run at root dir of mace project.
 set +x
 Usage() {
-  echo 'Usage: bash tools/validate_gcn.sh tf_model_path image_size [tuning]'
+  echo 'Usage: bash tools/validate_gcn.sh tools/gcn.config tf_model_path image_size [tuning]'
 }
 
 if [ $# -lt 2 ];then
@@ -10,8 +10,10 @@ if [ $# -lt 2 ];then
   exit -1
 fi
 
+source $1
+
 VLOG_LEVEL=0
-TF_MODEL_FILE_PATH=$1
+TF_MODEL_FILE_PATH=$2
 MODEL_DIR=$(dirname ${TF_MODEL_FILE_PATH})
 MACE_SOURCE_DIR=`/bin/pwd`
 MACE_MODEL_NAME='mace_model.pb'
@@ -20,14 +22,14 @@ OUTPUT_FILE_NAME='gcn.out'
 OUTPUT_LIST_FILE='gcn.list'
 PHONE_DATA_DIR="/data/local/tmp/${MACE_MODEL_NAME}"
 KERNEL_DIR="${PHONE_DATA_DIR}/cl/"
-IMAGE_SIZE=$2
+IMAGE_SIZE=$3
 MODEL_TAG=GCN${IMAGE_SIZE}
 CODEGEN_DIR=${MACE_SOURCE_DIR}/mace/codegen
 MODEL_CODEGEN_DIR=${CODEGEN_DIR}/models/gcn-$IMAGE_SIZE
 CL_CODEGEN_DIR=${CODEGEN_DIR}/opencl
 CL_BIN_DIR=${CODEGEN_DIR}/opencl_bin
 TUNING_CODEGEN_DIR=${CODEGEN_DIR}/tuning
-TUNING_OR_NOT=${3:-0}
+TUNING_OR_NOT=${4:-0}
 VERSION_SOURCE_PATH=${CODEGEN_DIR}/version
 
 build_and_run()
@@ -87,8 +89,8 @@ rm -rf ${MODEL_CODEGEN_DIR}
 mkdir -p ${MODEL_CODEGEN_DIR}
 bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \
                                          --output=${MODEL_CODEGEN_DIR}/mace_gcn${IMAGE_SIZE}.cc \
-                                         --input_node=input \
-                                         --output_node=GCN/br_result_2/fcn_br \
+                                         --input_node=${TF_INPUT_NODE} \
+                                         --output_node=${TF_OUTPUT_NODE} \
                                          --data_type=DT_HALF \
                                          --runtime=gpu \
                                          --output_type=source \
@@ -129,7 +131,7 @@ echo "Step 9: Validate the result"
 python tools/validate.py --model_file ${TF_MODEL_FILE_PATH} \
     --input_file ${MODEL_DIR}/${INPUT_FILE_NAME} \
     --mace_out_file ${MODEL_DIR}/${OUTPUT_FILE_NAME} \
-    --input_node input \
-    --output_node GCN/br_result_2/fcn_br\
+    --input_node ${TF_INPUT_NODE} \
+    --output_node ${TF_OUTPUT_NODE} \
     --input_shape "${IMAGE_SIZE},${IMAGE_SIZE},3" \
     --output_shape "1,${IMAGE_SIZE},${IMAGE_SIZE},2"
diff --git a/tools/validate_side_gcn.sh b/tools/validate_side_gcn.sh
deleted file mode 100644
index ef27f54d47e7885b4619d124ebd47b8b5c536058..0000000000000000000000000000000000000000
--- a/tools/validate_side_gcn.sh
+++ /dev/null
@@ -1,133 +0,0 @@
-#!/bin/bash
-# Must run at root dir of mace project.
-set +x
-Usage() {
-  echo 'Usage: bash tools/validate_gcn.sh tf_model_path image_size [tuning]'
-}
-
-if [ $# -lt 2 ];then
-  Usage
-  exit -1
-fi
-
-TF_MODEL_FILE_PATH=$1
-MODEL_DIR=$(dirname ${TF_MODEL_FILE_PATH})
-MACE_SOURCE_DIR=`/bin/pwd`
-MACE_MODEL_NAME='mace_model.pb'
-INPUT_FILE_NAME='model_input'
-OUTPUT_FILE_NAME='gcn.out'
-OUTPUT_LIST_FILE='gcn.list'
-PHONE_DATA_DIR="/data/local/tmp/${MACE_MODEL_NAME}"
-KERNEL_DIR="${PHONE_DATA_DIR}/cl/"
-IMAGE_SIZE=$2
-MODEL_TAG=GCN${IMAGE_SIZE}
-CODEGEN_DIR=${MACE_SOURCE_DIR}/mace/codegen
-MODEL_CODEGEN_DIR=${CODEGEN_DIR}/models/gcn-$IMAGE_SIZE
-CL_CODEGEN_DIR=${CODEGEN_DIR}/opencl
-CL_BIN_DIR=${CODEGEN_DIR}/opencl_bin
-TUNING_CODEGEN_DIR=${CODEGEN_DIR}/tuning
-TUNING_OR_NOT=${3:-0}
-VERSION_SOURCE_PATH=${CODEGEN_DIR}/version
-
-build_and_run()
-{
-  EMBED_OPENCL_BINARY=$1
-  if [ "$EMBED_OPENCL_BINARY" = true ]; then
-    EMBED_OPENCL_BINARY_BUILD_FLAGS="--define embed_binary_program=true"
-  fi
-
-  bazel build -c opt --strip always mace/examples:mace_run \
-    --crosstool_top=//external:android/crosstool \
-    --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
-    --cpu=arm64-v8a \
-    $EMBED_OPENCL_BINARY_BUILD_FLAGS \
-    --copt=-DMACE_MODEL_FUNCTION=Create${MODEL_TAG}
-
-  adb shell "mkdir -p ${PHONE_DATA_DIR}"
-  if [ "$EMBED_OPENCL_BINARY" = false ]; then
-    adb shell "mkdir -p ${KERNEL_DIR}"
-    adb push mace/kernels/opencl/cl/. ${KERNEL_DIR}
-  fi
-  adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR}
-  adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR}
-
-  if [[ "${TUNING_OR_NOT}" != "0" && "$EMBED_OPENCL_BINARY" != true ]];then
-    tuning_flag=1
-    round=0 # only warm up
-  else
-    tuning_flag=0
-    round=2
-  fi
-
-  adb </dev/null shell MACE_TUNING=${tuning_flag} \
-    MACE_CPP_MIN_VLOG_LEVEL=0 \
-    MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
-    MACE_KERNEL_PATH=$KERNEL_DIR \
-    ${PHONE_DATA_DIR}/mace_run \
-    --model=${PHONE_DATA_DIR}/${MACE_MODEL_NAME} \
-    --input=mace_input_node \
-    --output=mace_output_node \
-    --input_shape="1,${IMAGE_SIZE},${IMAGE_SIZE},3"\
-    --input_file=${PHONE_DATA_DIR}/${INPUT_FILE_NAME} \
-    --output_file=${PHONE_DATA_DIR}/${OUTPUT_FILE_NAME} \
-    --device=OPENCL   \
-    --round=$round
-}
-
-echo "Step 1: Generate input data"
-python tools/validate.py --generate_data true --random_seed 1 \
- --input_file=${MODEL_DIR}/${INPUT_FILE_NAME} \
- --input_shape="${IMAGE_SIZE},${IMAGE_SIZE},3"
-
-echo "Step 2: Convert tf model to mace model and optimize memory"
-bazel build //mace/python/tools:tf_converter
-rm -rf ${CODEGEN_DIR}/models
-mkdir -p ${MODEL_CODEGEN_DIR}
-bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \
-                                         --output=${MODEL_CODEGEN_DIR}/mace_gcn${IMAGE_SIZE}.cc \
-                                         --input_node=input_node \
-                                         --output_node=GCN/br_result_x/fcn_br \
-                                         --data_type=DT_HALF \
-                                         --runtime=gpu \
-                                         --output_type=source \
-                                         --template=${MACE_SOURCE_DIR}/mace/python/tools/model.template \
-                                         --model_tag=${MODEL_TAG} \
-                                         --confuse=True
-
-echo "Step 3: Generate version source"
-rm -rf ${VERSION_SOURCE_PATH}
-mkdir -p ${VERSION_SOURCE_PATH}
-bash mace/tools/git/gen_version_source.sh ${VERSION_SOURCE_PATH}/version.cc
-
-echo "Step 4: Run model on the phone with files"
-build_and_run false
-
-echo "Step 5: Generate OpenCL binary program and config code"
-rm -rf ${CL_BIN_DIR}
-adb pull ${KERNEL_DIR} ${CL_BIN_DIR}
-rm -rf ${CL_CODEGEN_DIR}
-mkdir -p ${CL_CODEGEN_DIR}
-python mace/python/tools/opencl_codegen.py \
-  --cl_binary_dir=${CL_BIN_DIR} --output_path=${CL_CODEGEN_DIR}/opencl_compiled_program.cc
-
-echo "Step 6: Generate tuning source file"
-adb pull ${PHONE_DATA_DIR}/mace_run.config ${CL_BIN_DIR}
-mkdir -p ${TUNING_CODEGEN_DIR}
-python mace/python/tools/binary_codegen.py \
-  --binary_file=${CL_BIN_DIR}/mace_run.config --output_path=${TUNING_CODEGEN_DIR}/tuning_params.cc
-
-echo "Step 7: Run model on the phone using binary"
-build_and_run true
-
-echo "Step 8: Pull the mace run result."
-rm -rf ${MODEL_DIR}/${OUTPUT_FILE_NAME}
-adb </dev/null pull ${PHONE_DATA_DIR}/${OUTPUT_FILE_NAME} ${MODEL_DIR}
-
-echo "Step 9: Validate the result"
-python tools/validate.py --model_file ${TF_MODEL_FILE_PATH} \
-    --input_file ${MODEL_DIR}/${INPUT_FILE_NAME} \
-    --mace_out_file ${MODEL_DIR}/${OUTPUT_FILE_NAME} \
-    --input_node input_node \
-    --output_node GCN/br_result_x/fcn_br\
-    --input_shape "${IMAGE_SIZE},${IMAGE_SIZE},3" \
-    --output_shape "1,${IMAGE_SIZE},${IMAGE_SIZE},2"