diff --git a/dnn/src/cuda/utils.cpp b/dnn/src/cuda/utils.cpp
index be8175d19711e42fae92d1115b645d78b69aa004..52335d612a13339fa5de7030688b7dde7e833fb3 100644
--- a/dnn/src/cuda/utils.cpp
+++ b/dnn/src/cuda/utils.cpp
@@ -6,7 +6,8 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
 #include "src/cuda/utils.cuh"
 #include "src/cuda/utils.h"
@@ -30,49 +31,48 @@ struct DevicePropRec {
 constexpr int MAX_NR_DEVICE = 32;
 DevicePropRec device_prop_rec[MAX_NR_DEVICE];
 
-const char *cublasGetErrorString(cublasStatus_t error) {
-    switch (error)
-    {
-        case CUBLAS_STATUS_SUCCESS:
-            return "CUBLAS_STATUS_SUCCESS";
-        case CUBLAS_STATUS_NOT_INITIALIZED:
-            return "CUBLAS_STATUS_NOT_INITIALIZED";
-        case CUBLAS_STATUS_ALLOC_FAILED:
-            return "CUBLAS_STATUS_ALLOC_FAILED";
-        case CUBLAS_STATUS_INVALID_VALUE:
-            return "CUBLAS_STATUS_INVALID_VALUE";
-        case CUBLAS_STATUS_ARCH_MISMATCH:
-            return "CUBLAS_STATUS_ARCH_MISMATCH";
-        case CUBLAS_STATUS_MAPPING_ERROR:
-            return "CUBLAS_STATUS_MAPPING_ERROR";
-        case CUBLAS_STATUS_EXECUTION_FAILED:
-            return "CUBLAS_STATUS_EXECUTION_FAILED";
-        case CUBLAS_STATUS_INTERNAL_ERROR:
-            return "CUBLAS_STATUS_INTERNAL_ERROR";
-        case CUBLAS_STATUS_LICENSE_ERROR:
-            return "CUBLAS_STATUS_LICENSE_ERROR";
-        case CUBLAS_STATUS_NOT_SUPPORTED:
-            return "CUBLAS_STATUS_NOT_SUPPORTED";
-    }
-    return "Unknown CUBLAS error";
+const char* cublasGetErrorString(cublasStatus_t error) {
+    switch (error) {
+        case CUBLAS_STATUS_SUCCESS:
+            return "CUBLAS_STATUS_SUCCESS";
+        case CUBLAS_STATUS_NOT_INITIALIZED:
+            return "CUBLAS_STATUS_NOT_INITIALIZED";
+        case CUBLAS_STATUS_ALLOC_FAILED:
+            return "CUBLAS_STATUS_ALLOC_FAILED";
+        case CUBLAS_STATUS_INVALID_VALUE:
+            return "CUBLAS_STATUS_INVALID_VALUE";
+        case CUBLAS_STATUS_ARCH_MISMATCH:
+            return "CUBLAS_STATUS_ARCH_MISMATCH";
+        case CUBLAS_STATUS_MAPPING_ERROR:
+            return "CUBLAS_STATUS_MAPPING_ERROR";
+        case CUBLAS_STATUS_EXECUTION_FAILED:
+            return "CUBLAS_STATUS_EXECUTION_FAILED";
+        case CUBLAS_STATUS_INTERNAL_ERROR:
+            return "CUBLAS_STATUS_INTERNAL_ERROR";
+        case CUBLAS_STATUS_LICENSE_ERROR:
+            return "CUBLAS_STATUS_LICENSE_ERROR";
+        case CUBLAS_STATUS_NOT_SUPPORTED:
+            return "CUBLAS_STATUS_NOT_SUPPORTED";
+    }
+    return "Unknown CUBLAS error";
 }
-} // anonymous namespace
+}  // anonymous namespace
 
-void cuda::__throw_cuda_error__(cudaError_t err, const char *msg) {
+void cuda::__throw_cuda_error__(cudaError_t err, const char* msg) {
     auto s = ssprintf("cuda error %s(%d) occurred; expr: %s",
-            cudaGetErrorString(err), int(err), msg);
+                      cudaGetErrorString(err), int(err), msg);
     megdnn_throw(s.c_str());
 }
 
-void cuda::__throw_cudnn_error__(cudnnStatus_t err, const char *msg) {
+void cuda::__throw_cudnn_error__(cudnnStatus_t err, const char* msg) {
     auto s = ssprintf("cudnn error %s(%d) occurred; expr: %s",
-            cudnnGetErrorString(err), int(err), msg);
+                      cudnnGetErrorString(err), int(err), msg);
     megdnn_throw(s.c_str());
 }
 
-void cuda::__throw_cublas_error__(cublasStatus_t err, const char *msg) {
+void cuda::__throw_cublas_error__(cublasStatus_t err, const char* msg) {
     auto s = ssprintf("cublas error %s(%d) occurred; expr: %s",
-            cublasGetErrorString(err), int(err), msg);
+                      cublasGetErrorString(err), int(err), msg);
     megdnn_throw(s.c_str());
 }
 
@@ -92,17 +92,17 @@ void cuda::__throw_cutlass_error__(cutlass::Status err,
                                    const char* msg) {
     megdnn_throw(s.c_str());
 }
 
-void cuda::report_error(const char *msg) {
+void cuda::report_error(const char* msg) {
     megdnn_throw(msg);
     MEGDNN_MARK_USED_VAR(msg);
 }
 
 uint32_t cuda::safe_size_in_kern(size_t size) {
     if (!size || size > Uint32Fastdiv::MAX_DIVIDEND) {
-        megdnn_throw(ssprintf(
-                "invalid size for element-wise kernel: %zu; "
-                "max supported size is %u",
-                size, Uint32Fastdiv::MAX_DIVIDEND));
+        megdnn_throw(
+                ssprintf("invalid size for element-wise kernel: %zu; "
+                         "max supported size is %u",
+                         size, Uint32Fastdiv::MAX_DIVIDEND));
     }
     return size;
 }
@@ -111,7 +111,7 @@ cudaDeviceProp cuda::current_device_prop() {
     int dev;
     cuda_check(cudaGetDevice(&dev));
     megdnn_assert(dev < MAX_NR_DEVICE, "device number too large: %d", dev);
-    auto &&rec = device_prop_rec[dev];
+    auto&& rec = device_prop_rec[dev];
     if (!rec.init) {
         std::lock_guard<std::mutex> lock(rec.mtx);
         if (!rec.init) {
@@ -137,6 +137,19 @@ size_t cuda::max_batch_x_channel_size() {
     return current_device_prop().maxGridSize[2];
 }
 
+uint32_t cuda::param_buffer_start_address() {
+    auto&& device_prop = current_device_prop();
+    int cap = 10 * device_prop.major + device_prop.minor;
+    // maxwell and pascal: 0x140
+    if (cap >= 50 && cap < 70)
+        return 0x140;
+    // volta ~ ampere: 0x160
+    else if (cap >= 70)
+        return 0x160;
+    megdnn_throw(
+            ssprintf("unsupported cuda compute capability %d", cap).c_str());
+}
+
 const char* cuda::current_device_arch_name() {
     auto&& device_prop = current_device_prop();
     int cap = 10 * device_prop.major + device_prop.minor;
@@ -155,4 +168,3 @@ const char* cuda::current_device_arch_name() {
 }
 
 // vim: syntax=cpp.doxygen
-
diff --git a/dnn/src/cuda/utils.h b/dnn/src/cuda/utils.h
index 542eb6bcd542ac8f03c22594876fd30af5904b8c..05fc028488fa94929174935122f9102e2db0f18c 100644
--- a/dnn/src/cuda/utils.h
+++ b/dnn/src/cuda/utils.h
@@ -6,7 +6,8 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
 
 #pragma once
@@ -24,19 +25,19 @@ namespace megdnn {
 namespace cuda {
 
-static inline HandleImpl *concrete_handle(Handle *handle) {
+static inline HandleImpl* concrete_handle(Handle* handle) {
     return static_cast<HandleImpl*>(handle);
 }
 
-static inline cudnnHandle_t cudnn_handle(Handle *handle) {
+static inline cudnnHandle_t cudnn_handle(Handle* handle) {
     return concrete_handle(handle)->cudnn_handle();
 }
 
-static inline cublasHandle_t cublas_handle(Handle *handle) {
+static inline cublasHandle_t cublas_handle(Handle* handle) {
     return concrete_handle(handle)->cublas_handle();
 }
 
-static inline cudaStream_t cuda_stream(Handle *handle) {
+static inline cudaStream_t cuda_stream(Handle* handle) {
     return concrete_handle(handle)->stream();
 }
 
@@ -44,9 +45,8 @@ static inline megcore::AsyncErrorInfo* async_error_info(Handle* handle) {
     return concrete_handle(handle)->megcore_context().error_info;
 }
 
-static inline void CUDART_CB callback_free(
-        cudaStream_t /* stream */, cudaError_t status, void *userData)
-{
+static inline void CUDART_CB callback_free(cudaStream_t /* stream */,
+                                           cudaError_t status, void* userData) {
     cuda_check(status);
     free(userData);
 }
@@ -64,9 +64,12 @@ bool is_compute_capability_equalto(int major, int minor);
 //! third demension
 size_t max_batch_x_channel_size();
 
+//! get param buffer start address at cmem[0]
+uint32_t param_buffer_start_address();
+
 const char* current_device_arch_name();
 
-} // namespace cuda
-} // namespace megdnn
+}  // namespace cuda
+}  // namespace megdnn
 
 // vim: syntax=cpp.doxygen
diff --git a/dnn/test/common/conv_bias.cpp b/dnn/test/common/conv_bias.cpp
index 030bf12ce2ee6e66042e2d1ed289d3adb9c237e9..14a1b26c7d87fbef46d378ca22b3c930571bbee4 100644
--- a/dnn/test/common/conv_bias.cpp
+++ b/dnn/test/common/conv_bias.cpp
@@ -6,7 +6,8 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
 #include "test/common/conv_bias.h"
 #include "megdnn/opr_param_defs.h"
@@ -413,7 +414,7 @@ std::vector<TestArg> get_int8_nchw44_args(size_t kernel_size, size_t pack_size,
     megdnn_assert(kernel_size > 0, "not support kernel_size");
     using NLMode = param::ConvBias::NonlineMode;
 
-    //// clang-format off
+    // clang-format off
     for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU}) {
     for (auto mode : {param::ConvBias::Mode::CROSS_CORRELATION}) {
     for (size_t b : {1,2}) {
@@ -795,7 +796,7 @@ void check_conv_bias(DType src_dtype, DType filter_dtype, DType bias_dtype,
         return z;
     };
     megdnn_assert(rng != nullptr && bias_rng != nullptr);
-    checker.set_rng(0, rng.get())
+    checker.set_rng(0, rng.get())
             .set_rng(1, rng.get())
             .set_rng(2, rng.get())
             .set_rng(3, rng.get());
@@ -1152,8 +1153,7 @@ void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m,
             handle->create_operator<WinogradFilterPreprocess>();
     winograd_preprocess_opr->param().output_block_size = m;
     winograd_preprocess_opr->param().format = format;
-    winograd_preprocess_opr->param().compute_mode =
-            param.compute_mode;
+    winograd_preprocess_opr->param().compute_mode = param.compute_mode;
     TensorLayout filter_transform_layout;
     winograd_preprocess_opr->deduce_layout(tensors[1].layout,
                                            filter_transform_layout);
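
For context, the new cuda::param_buffer_start_address() helper reports where the
CUDA kernel parameter buffer begins in constant bank 0 (cmem[0]): 0x140 on
Maxwell/Pascal (sm_50..sm_6x) and 0x160 on Volta and later (sm_70+), matching the
comments in the patch. Below is a minimal sketch of how a SASS-level tool might
use it to locate the k-th 4-byte parameter word. param_word_offset and the main
driver are hypothetical illustrations, not part of the patch, and the helper
needs a visible CUDA device because it queries the current device's compute
capability.

    #include <cstdint>
    #include <cstdio>

    #include "src/cuda/utils.h"

    using namespace megdnn;

    // cmem[0] byte offset of the word at `word_idx` inside the kernel
    // parameter buffer; parameters are packed from the start address, so
    // word k lives at start + 4 * k (assuming 4-byte words, as in SASS
    // operands like "c[0x0][0x160]")
    static uint32_t param_word_offset(uint32_t word_idx) {
        return cuda::param_buffer_start_address() + 4 * word_idx;
    }

    int main() {
        // on an sm_70+ device this prints c[0x0][0x160], c[0x0][0x164], ...
        for (uint32_t i = 0; i < 4; ++i)
            std::printf("param word %u -> c[0x0][0x%x]\n", (unsigned)i,
                        (unsigned)param_word_offset(i));
        return 0;
    }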