diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h
index 1c001c23b747b94ca5598460ad02860efe729969..ab702b2340fddda0e9d932c054d14e2cbe601a8b 100644
--- a/mace/ops/opencl/buffer_transformer.h
+++ b/mace/ops/opencl/buffer_transformer.h
@@ -66,7 +66,7 @@ class OpenCLBufferTransformer {
     VLOG(2) << "Transform CPU Buffer " << input->name()
             << " to GPU Buffer " << internal_tensor->name()
             << " with data type " << dt;
-    if (data_format == DataFormat::NHWC && input->shape().size() == 4) {
+    if (data_format == DataFormat::NCHW && input->shape().size() == 4) {
       // 1. (NCHW -> NHWC)
       std::vector<int> dst_dims = {0, 2, 3, 1};
       std::vector<index_t> output_shape =
diff --git a/mace/ops/prior_box.cc b/mace/ops/prior_box.cc
index 1bf8ec397d218333d66babc06ed2fc09e9c66e9f..a9ac07732f29d38721997cec5db35b1de388f09b 100644
--- a/mace/ops/prior_box.cc
+++ b/mace/ops/prior_box.cc
@@ -72,10 +72,9 @@ class PriorBoxOp : public Operation {
     Tensor::MappingGuard output_guard(output);
     T *output_data = output->mutable_data<T>();
     float box_w, box_h;
-#pragma omp parallel for collapse(2) schedule(runtime)
     for (index_t i = 0; i < input_h; ++i) {
+      index_t idx = i * input_w * num_prior * 4;
       for (index_t j = 0; j < input_w; ++j) {
-        index_t idx = i * input_w * num_prior * 4;
         float center_y = (offset_ + i) * step_h;
         float center_x = (offset_ + j) * step_w;
         for (index_t k = 0; k < num_min_size; ++k) {
diff --git a/mace/ops/reshape.cc b/mace/ops/reshape.cc
index 9d8f9ad6dd4770316afdddd1b038ad4d42808475..bb05a065e4828e90869a4b222625b5cae0e2a369 100644
--- a/mace/ops/reshape.cc
+++ b/mace/ops/reshape.cc
@@ -77,6 +77,14 @@ class ReshapeOp : public Operation {
     }
     Tensor *output = this->Output(OUTPUT);
 
+    // NCHW -> NHWC
+    if (D == DeviceType::GPU && out_shape.size() == 4) {
+      std::vector<int> dst_dims = {0, 2, 3, 1};
+      std::vector<index_t> out_shape_gpu = TransposeShape<index_t, index_t>(
+          out_shape, dst_dims);
+      out_shape = out_shape_gpu;
+    }
+
     output->ReuseTensorBuffer(*input);
     output->Reshape(out_shape);
 
diff --git a/tools/common.py b/tools/common.py
index 0f080f5b1dc319c881c73e32fd7a8e5d0e349f14..bfc56dc11d4744d800f8fdeac18a14027936ab2b 100644
--- a/tools/common.py
+++ b/tools/common.py
@@ -209,6 +209,30 @@ def sha256_checksum(fname):
     return hash_func.hexdigest()
 
 
+def get_dockerfile_file(dockerfile_path="",
+                        dockerfile_sha256_checksum=""):
+    dockerfile = dockerfile_path
+    if dockerfile_path.startswith("http://") or \
+            dockerfile_path.startswith("https://"):
+        dockerfile = \
+            "third_party/caffe/" + md5sum(dockerfile_path) + "/Dockerfile"
+        if not os.path.exists(dockerfile) or \
+                sha256_checksum(dockerfile) != dockerfile_sha256_checksum:
+            os.makedirs(os.path.dirname(dockerfile))
+            MaceLogger.info("Downloading Dockerfile, please wait ...")
+            six.moves.urllib.request.urlretrieve(dockerfile_path, dockerfile)
+            MaceLogger.info("Dockerfile downloaded successfully.")
+
+    if dockerfile:
+        if sha256_checksum(dockerfile) != dockerfile_sha256_checksum:
+            MaceLogger.error(ModuleName.MODEL_CONVERTER,
+                             "Dockerfile sha256 checksum does not match")
+    else:
+        dockerfile = "third_party/caffe"
+
+    return dockerfile
+
+
 def get_model_files(model_file_path,
                     model_sha256_checksum,
                     model_output_dir,
@@ -373,6 +397,8 @@ class YAMLKeyword(object):
    graph_optimize_options = 'graph_optimize_options'  # internal use for now
    cl_mem_type = 'cl_mem_type'
    backend = 'backend'
+    dockerfile_path = 'dockerfile_path'
+    dockerfile_sha256_checksum = 'dockerfile_sha256_checksum'
 
 
 ################################
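For orientation before the call-site changes below: `get_dockerfile_file()` resolves three cases — no path configured (fall back to the bundled `third_party/caffe` build context), a local path (verify its sha256 and return it unchanged), and a remote URL (download once into `third_party/caffe/<md5(url)>/Dockerfile`, re-fetching only when the cached copy is missing or fails the checksum). A minimal usage sketch, assuming the helper is imported the way the `tools/` scripts consume `common.py`; every concrete path, URL, and checksum here is a hypothetical placeholder:

```python
# Sketch only: exercises get_dockerfile_file() from the hunk above.
# All paths, URLs, and checksums below are hypothetical placeholders.
from common import get_dockerfile_file, sha256_checksum

# Case 1: no dockerfile_path configured -> default build context.
assert get_dockerfile_file() == "third_party/caffe"

# Case 2: local Dockerfile -> sha256-verified, then returned unchanged.
local = get_dockerfile_file("docker/my_caffe/Dockerfile",
                            sha256_checksum("docker/my_caffe/Dockerfile"))

# Case 3: remote Dockerfile -> cached under third_party/caffe/<md5(url)>/,
# downloaded only when the cache is missing or fails the checksum.
remote = get_dockerfile_file("https://example.com/Dockerfile",
                             "expected-sha256-of-that-file")
```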
diff --git a/tools/device.py b/tools/device.py
index 217f179c683615aa64c30337e2ba9a825a6c18f1..214fb04a479de44ce2ce836aab51e62ca7684af2 100644
--- a/tools/device.py
+++ b/tools/device.py
@@ -624,11 +624,18 @@ class DeviceWrapper:
             validate_type = device_type
             if model_config[YAMLKeyword.quantize] == 1:
                 validate_type = device_type + '_QUANTIZE'
+
+            dockerfile_path = get_dockerfile_file(
+                model_config.get(YAMLKeyword.dockerfile_path),
+                model_config.get(YAMLKeyword.dockerfile_sha256_checksum)  # noqa
+            ) if YAMLKeyword.dockerfile_path in model_config else "third_party/caffe"  # noqa
+
             sh_commands.validate_model(
                 abi=target_abi,
                 device=self,
                 model_file_path=model_file_path,
                 weight_file_path=weight_file_path,
+                dockerfile_path=dockerfile_path,
                 platform=model_config[YAMLKeyword.platform],
                 device_type=device_type,
                 input_nodes=subgraphs[0][
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index 1b928dc235f813b28ea164a6965dcca2b14a26df..6e5e530e92c13c8db06dd33a6a6af774f8156af8 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -627,6 +627,7 @@ def validate_model(abi,
                    device,
                    model_file_path,
                    weight_file_path,
+                   dockerfile_path,
                    platform,
                    device_type,
                    input_nodes,
@@ -690,7 +691,7 @@
         if not docker_image_id:
             six.print_("Build caffe docker")
             sh.docker("build", "-t", image_name,
-                      "third_party/caffe")
+                      dockerfile_path)
 
         container_id = sh.docker("ps", "-qa", "-f",
                                  "name=%s" % container_name)
diff --git a/tools/validate.py b/tools/validate.py
index fc123e2f8c006cd6248d5ea4a886e90d25e632b9..2ea8fed2786b37ef2950deb706482d284cace6fd 100644
--- a/tools/validate.py
+++ b/tools/validate.py
@@ -357,6 +357,11 @@ def parse_args():
         type=str,
         default="tensorflow",
         help="onnx backend framwork")
+    parser.add_argument(
+        "--log_file",
+        type=str,
+        default="",
+        help="log file")
     return parser.parse_known_args()
 
 
@@ -375,4 +380,5 @@ if __name__ == '__main__':
         FLAGS.output_node,
         FLAGS.validation_threshold,
         FLAGS.input_data_type,
-        FLAGS.backend)
+        FLAGS.backend,
+        FLAGS.log_file)
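As an aside on the two C++ hunks at the top of this diff: `dst_dims = {0, 2, 3, 1}` selects source dimensions in N, H, W, C order, so the same permutation converts an NCHW shape into NHWC. A short Python equivalent of that `TransposeShape<index_t, index_t>` call (illustrative only, not the MACE API):

```python
def transpose_shape(shape, dst_dims):
    # output[i] takes the extent of source dimension dst_dims[i],
    # mirroring TransposeShape<index_t, index_t>(shape, dst_dims) above.
    return [shape[d] for d in dst_dims]

# NCHW -> NHWC, as in buffer_transformer.h and reshape.cc:
assert transpose_shape([1, 3, 224, 224], [0, 2, 3, 1]) == [1, 224, 224, 3]
```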