提交 dea6154b 编写于 作者: L Liu Yiqun

Merge branch 'develop' into step_rnn/opt_ddim_lite

......@@ -30,7 +30,19 @@
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
DEFINE_string(model_dir, "", "model dir");
DEFINE_string(model_dir,
"",
"the path of the model, set model_dir when the model is no "
"combined formate. This option will be ignored if model_file "
"and param_file are exist.");
DEFINE_string(model_file,
"",
"the path of model file, set model_file when the model is "
"combined formate.");
DEFINE_string(param_file,
"",
"the path of param file, set param_file when the model is "
"combined formate.");
DEFINE_string(input_shape,
"1,3,224,224",
"set input shapes according to the model, "
......@@ -68,11 +80,12 @@ inline double GetCurrentUS() {
return 1e+6 * time.tv_sec + time.tv_usec;
}
void OutputOptModel(const std::string& load_model_dir,
const std::string& save_optimized_model_dir,
void OutputOptModel(const std::string& save_optimized_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config;
config.set_model_dir(load_model_dir);
config.set_model_dir(FLAGS_model_dir);
config.set_model_file(FLAGS_model_file);
config.set_param_file(FLAGS_param_file);
std::vector<Place> vaild_places = {
Place{TARGET(kARM), PRECISION(kFloat)},
};
......@@ -91,7 +104,7 @@ void OutputOptModel(const std::string& load_model_dir,
}
predictor->SaveOptimizedModel(save_optimized_model_dir,
LiteModelType::kNaiveBuffer);
LOG(INFO) << "Load model from " << load_model_dir;
LOG(INFO) << "Load model from " << FLAGS_model_dir;
LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
}
......@@ -146,7 +159,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
LOG(FATAL) << "open result file failed";
}
ofs.precision(5);
ofs << std::setw(20) << std::fixed << std::left << model_name;
ofs << std::setw(30) << std::fixed << std::left << model_name;
ofs << "min = " << std::setw(12) << min_res;
ofs << "max = " << std::setw(12) << max_res;
ofs << "average = " << std::setw(12) << avg_res;
......@@ -209,8 +222,7 @@ int main(int argc, char** argv) {
// Output optimized model if needed
if (FLAGS_run_model_optimize) {
paddle::lite_api::OutputOptModel(
FLAGS_model_dir, save_optimized_model_dir, input_shapes);
paddle::lite_api::OutputOptModel(save_optimized_model_dir, input_shapes);
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
......
......@@ -614,8 +614,8 @@ void conv_depthwise_3x3s1_fp32(const float *din,
"blt 3f \n"
#define LEFT_RESULT_S1_LEAKY_RELU \
"cmhs v18.4s, v12.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"cmhs v19.4s, v13.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v18.4s, v12.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fcmge v19.4s, v13.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fmul v20.4s, v12.4s, %[vscale].4s \n" /* mul */ \
"fmul v21.4s, v12.4s, %[vscale].4s \n" /* mul */ \
"ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
......@@ -639,7 +639,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
\
"ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \
"ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ /* r5*/ \
"cmhs v18.4s, v14.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v18.4s, v14.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fmul v20.4s, v14.4s, %[vscale].4s \n" /* mul */ \
\
"ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
......@@ -657,7 +657,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
"ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \
"ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \
\
"cmhs v18.4s, v15.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v18.4s, v15.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fmul v20.4s, v15.4s, %[vscale].4s \n" /* mul */ \
"ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \
"bif v15.16b, v20.16b, v18.16b \n" /* choose*/ \
......@@ -802,7 +802,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
#define MID_RESULT_S1_LEAKY_RELU \
"movi v21.4s, #0 \n" \
"cmhs v18.4s, v12.4s, v21.4s \n" /* vcgeq_u32 */ \
"fcmge v18.4s, v12.4s, v21.4s \n" /* vcgeq_f32 */ \
"fmul v20.4s, v12.4s, %[vscale].4s \n" /* mul */ \
\
"fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
......@@ -824,7 +824,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
"fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \
\
"ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
"cmhs v18.4s, v13.4s, v21.4s \n" /* vcgeq_u32 */ \
"fcmge v18.4s, v13.4s, v21.4s \n" /* vcgeq_f32 */ \
"fmul v20.4s, v13.4s, %[vscale].4s \n" /* mul */ \
\
"fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
......@@ -846,7 +846,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
"fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \
"ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
"ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \
"cmhs v18.4s, v14.4s, v21.4s \n" /* vcgeq_u32 */ \
"fcmge v18.4s, v14.4s, v21.4s \n" /* vcgeq_f32 */ \
"fmul v20.4s, v14.4s, %[vscale].4s \n" /* mul */ \
\
"fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
......@@ -861,7 +861,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
"ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \
"st1 {v14.4s}, [%[doutr2]], #16 \n" \
\
"cmhs v18.4s, v15.4s, v21.4s \n" /* vcgeq_u32 */ \
"fcmge v18.4s, v15.4s, v21.4s \n" /* vcgeq_f32 */ \
"fmul v20.4s, v15.4s, %[vscale].4s \n" /* mul */ \
\
"ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \
......@@ -980,7 +980,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
#define RIGHT_RESULT_S1_LEAKY_RELU \
"movi v1.4s, #0 \n" \
"cmhs v20.4s, v12.4s, v1.4s \n" /* vcgeq_u32 */ \
"fcmge v20.4s, v12.4s, v1.4s \n" /* vcgeq_f32 */ \
"fmul v21.4s, v12.4s, %[vscale].4s \n" /* mul */ \
\
"fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
......@@ -999,7 +999,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
"fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \
"fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \
\
"cmhs v20.4s, v13.4s, v1.4s \n" /* vcgeq_u32 */ \
"fcmge v20.4s, v13.4s, v1.4s \n" /* vcgeq_f32 */ \
"fmul v21.4s, v13.4s, %[vscale].4s \n" /* mul */ \
"st1 {v12.4s}, [%[doutr0]], #16 \n" \
\
......@@ -1017,7 +1017,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
\
"fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \
\
"cmhs v20.4s, v14.4s, v1.4s \n" /* vcgeq_u32 */ \
"fcmge v20.4s, v14.4s, v1.4s \n" /* vcgeq_f32 */ \
"fmul v21.4s, v14.4s, %[vscale].4s \n" /* mul */ \
"st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \
\
......@@ -1028,7 +1028,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
\
"bif v14.16b, v24.16b, v18.16b \n" \
\
"cmhs v20.4s, v15.4s, v1.4s \n" /* vcgeq_u32 */ \
"fcmge v20.4s, v15.4s, v1.4s \n" /* vcgeq_f32 */ \
"fmul v21.4s, v15.4s, %[vscale].4s \n" /* mul */ \
\
"st1 {v14.4s}, [%[doutr2]], #16 \n" \
......@@ -1132,8 +1132,8 @@ void conv_depthwise_3x3s1_fp32(const float *din,
"prfm pldl1keep, [%[out1]]\n" \
"prfm pldl1keep, [%[out2]]\n" \
\
"cmhs v18.4s, v12.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"cmhs v19.4s, v13.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v18.4s, v12.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v19.4s, v13.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v20.4s, v12.4s, %[vscale].4s \n" /* mul */ \
"fmul v21.4s, v13.4s, %[vscale].4s \n" /* mul */ \
\
......
......@@ -179,11 +179,11 @@ namespace math {
#define LEAKY_RELU \
"movi v0.4s, #0\n" /* for relu */ \
"ldr x0, [%[outl], #88]\n" \
"cmhs v1.4s, v15.4s, v0.4s \n" /* vcgeq_u32 */ \
"cmhs v2.4s, v16.4s, v0.4s \n" /* vcgeq_u32 */ \
"fcmge v1.4s, v15.4s, v0.4s \n" /* vcgeq_f32 */ \
"fcmge v2.4s, v16.4s, v0.4s \n" /* vcgeq_f32 */ \
"ld1 {v9.4s}, [x0] \n" \
"cmhs v3.4s, v17.4s, v0.4s \n" /* vcgeq_u32 */ \
"cmhs v4.4s, v18.4s, v0.4s \n" /* vcgeq_u32 */ \
"fcmge v3.4s, v17.4s, v0.4s \n" /* vcgeq_f32 */ \
"fcmge v4.4s, v18.4s, v0.4s \n" /* vcgeq_f32 */ \
"ldr x0, [%[outl]] \n" \
"fmul v5.4s, v15.4s, v9.4s \n" /* mul */ \
"fmul v6.4s, v16.4s, v9.4s \n" /* mul */ \
......@@ -193,10 +193,10 @@ namespace math {
"bif v16.16b, v6.16b, v2.16b \n" /* choose*/ \
"bif v17.16b, v7.16b, v3.16b \n" /* choose*/ \
"bif v18.16b, v8.16b, v4.16b \n" /* choose*/ \
"cmhs v1.4s, v19.4s, v0.4s \n" /* vcgeq_u32 */ \
"cmhs v2.4s, v20.4s, v0.4s \n" /* vcgeq_u32 */ \
"cmhs v3.4s, v21.4s, v0.4s \n" /* vcgeq_u32 */ \
"cmhs v4.4s, v22.4s, v0.4s \n" /* vcgeq_u32 */ \
"fcmge v1.4s, v19.4s, v0.4s \n" /* vcgeq_f32 */ \
"fcmge v2.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ \
"fcmge v3.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \
"fcmge v4.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \
"fmul v5.4s, v19.4s, v9.4s \n" /* mul */ \
"fmul v6.4s, v20.4s, v9.4s \n" /* mul */ \
"fmul v7.4s, v21.4s, v9.4s \n" /* mul */ \
......
......@@ -50,7 +50,7 @@ void conv_3x3s2_direct_int8(const int8_t* din,
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias;
int pad_h = paddings[0];
int pad_w = paddings[1];
int pad_w = paddings[2];
const int threads = ctx->threads();
int llc_size = ctx->llc_size() / 4;
......@@ -477,7 +477,7 @@ void conv_3x3s2_direct_int8(const int8_t* din,
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias;
int pad_h = paddings[0];
int pad_w = paddings[1];
int pad_w = paddings[2];
const int threads = ctx->threads();
//! set 1/4 l2 cache
int llc_size = ctx->llc_size() / 4;
......
......@@ -453,7 +453,7 @@ void conv_depthwise_3x3s2_fp32(const float* din,
#define LEFT_RESULT_S2_LEAKY_RELU \
"ld1 {v22.4s}, [%[scale_ptr]] \n" \
"cmhs v11.4s, v16.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v11.4s, v16.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
\
"ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \
"ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \
......@@ -475,7 +475,7 @@ void conv_depthwise_3x3s2_fp32(const float* din,
"ext v10.16b, v0.16b, v15.16b, #4 \n" \
\
"st1 {v16.4s}, [%[outptr0]], #16 \n" \
"cmhs v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v12.4s, v16.4s, v22.4s \n" \
\
"ld1 {v20.4s}, [%[inptr3]] \n" \
......@@ -543,7 +543,7 @@ void conv_depthwise_3x3s2_fp32(const float* din,
"bne 2b \n"
#define MID_RESULT_S2_LEAKY_RELU \
"cmhs v11.4s, v16.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v11.4s, v16.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v12.4s, v16.4s, v22.4s \n" \
\
"fadd v17.4s, v17.4s, v13.4s \n" \
......@@ -554,7 +554,7 @@ void conv_depthwise_3x3s2_fp32(const float* din,
\
"bif v16.16b, v12.16b, v11.16b \n" /* choose*/ \
"ext v10.16b, v0.16b, v15.16b, #4 \n" \
"cmhs v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v12.4s, v17.4s, v22.4s \n" \
\
"st1 {v16.4s}, [%[outptr0]], #16 \n" \
......@@ -607,7 +607,7 @@ void conv_depthwise_3x3s2_fp32(const float* din,
"4: \n"
#define RIGHT_RESULT_S2_LEAKY_RELU \
"cmhs v11.4s, v16.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v11.4s, v16.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v12.4s, v16.4s, v22.4s \n" \
"fadd v17.4s, v17.4s, v13.4s \n" \
\
......@@ -617,7 +617,7 @@ void conv_depthwise_3x3s2_fp32(const float* din,
\
"bif v16.16b, v0.16b, %[wmask].16b \n" \
\
"cmhs v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v12.4s, v17.4s, v22.4s \n" \
\
"st1 {v16.4s}, [%[outptr0]], #16 \n" \
......
......@@ -104,13 +104,13 @@ namespace math {
"fmin v22.4s, v22.4s, %[vsix].4s\n"
#define LEAKY_RELU /* LeakyRelu */ \
"movi v0.4s, #0\n" /* for relu */ \
"cmhs v1.4s, v19.4s, v0.4s \n" /* vcgeq_u32 */ \
"fcmge v1.4s, v19.4s, v0.4s \n" /* vcgeq_u32 */ \
"fmul v2.4s, v19.4s, %[vscale].4s \n" /* mul */ \
"cmhs v3.4s, v20.4s, v0.4s \n" /* vcgeq_u32 */ \
"fcmge v3.4s, v20.4s, v0.4s \n" /* vcgeq_u32 */ \
"fmul v4.4s, v20.4s, %[vscale].4s \n" /* mul */ \
"cmhs v5.4s, v21.4s, v0.4s \n" /* vcgeq_u32 */ \
"fcmge v5.4s, v21.4s, v0.4s \n" /* vcgeq_u32 */ \
"fmul v6.4s, v21.4s, %[vscale].4s \n" /* mul */ \
"cmhs v7.4s, v22.4s, v0.4s \n" /* vcgeq_u32 */ \
"fcmge v7.4s, v22.4s, v0.4s \n" /* vcgeq_u32 */ \
"fmul v8.4s, v22.4s, %[vscale].4s \n" /* mul */ \
"bif v19.16b, v2.16b, v1.16b \n" /* choose*/ \
"bif v19.16b, v4.16b, v3.16b \n" /* choose*/ \
......
......@@ -709,7 +709,6 @@ void conv_depthwise_5x5s1_int8(Dtype* dout,
"q15");
#endif
// clang-format on
int32_t* ptr_tmp = ptr_out0 - w_loop * 32;
block_inr0 = block_inr1;
block_inr1 = block_inr2;
block_inr2 = block_inr3;
......
......@@ -200,13 +200,13 @@ namespace math {
"fmin v22.4s, v22.4s, %[vsix].4s\n"
#define LEAKY_RELU /* LeakyRelu */ \
"movi v0.4s, #0\n" /* for relu */ \
"cmhs v1.4s, v19.4s, v0.4s \n" /* vcgeq_u32 */ \
"fcmge v1.4s, v19.4s, v0.4s \n" /* vcgeq_f32 */ \
"fmul v2.4s, v19.4s, %[vscale].4s \n" /* mul */ \
"cmhs v3.4s, v20.4s, v0.4s \n" /* vcgeq_u32 */ \
"fcmge v3.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ \
"fmul v4.4s, v20.4s, %[vscale].4s \n" /* mul */ \
"cmhs v5.4s, v21.4s, v0.4s \n" /* vcgeq_u32 */ \
"fcmge v5.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ \
"fmul v6.4s, v21.4s, %[vscale].4s \n" /* mul */ \
"cmhs v7.4s, v22.4s, v0.4s \n" /* vcgeq_u32 */ \
"fcmge v7.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \
"fmul v8.4s, v22.4s, %[vscale].4s \n" /* mul */ \
"bif v19.16b, v2.16b, v1.16b \n" /* choose*/ \
"bif v19.16b, v4.16b, v3.16b \n" /* choose*/ \
......
......@@ -614,10 +614,10 @@ inline void prepack_input_nxwc8_int8_dw(const int8_t* din,
"fmin v3.4s, v3.4s, %[six].4s \n" /* relu6 */
#define NCHWC1_TRANS_FP32_LEAKY_RELU \
"cmhs v4.4s, v0.4s, v20.4s \n" /* vcgeq_u32 */ \
"cmhs v5.4s, v1.4s, v20.4s \n" /* vcgeq_u32 */ \
"cmhs v6.4s, v2.4s, v20.4s \n" /* vcgeq_u32 */ \
"cmhs v7.4s, v3.4s, v20.4s \n" /* vcgeq_u32 */ \
"fcmge v4.4s, v0.4s, v20.4s \n" /* vcgeq_f32 */ \
"fcmge v5.4s, v1.4s, v20.4s \n" /* vcgeq_f32 */ \
"fcmge v6.4s, v2.4s, v20.4s \n" /* vcgeq_f32 */ \
"fcmge v7.4s, v3.4s, v20.4s \n" /* vcgeq_f32 */ \
"fmul v8.4s, v0.4s, %[scale].4s \n" /* mul */ \
"fmul v9.4s, v1.4s, %[scale].4s \n" /* mul */ \
"fmul v10.4s, v2.4s, %[scale].4s \n" /* mul */ \
......@@ -935,8 +935,8 @@ inline bool write_to_output_c1_fp32(const float* din,
"fmin v3.4s, v3.4s, %[six].4s \n" /* relu6 */
#define NCHWC2_TRANS_FP32_LEAKY_RELU \
"cmhs v6.4s, v2.4s, v20.4s \n" /* vcgeq_u32 */ \
"cmhs v7.4s, v3.4s, v20.4s \n" /* vcgeq_u32 */ \
"fcmge v6.4s, v2.4s, v20.4s \n" /* vcgeq_f32 */ \
"fcmge v7.4s, v3.4s, v20.4s \n" /* vcgeq_f32 */ \
"fmul v4.4s, v2.4s, %[scale].4s \n" /* mul */ \
"fmul v5.4s, v3.4s, %[scale].4s \n" /* mul */ \
"bif v2.16b, v4.16b, v6.16b \n" /* choose*/ \
......@@ -1276,10 +1276,10 @@ inline bool write_to_output_c2_fp32(const float* din,
"fmin v19.4s, v19.4s, %[six].4s \n" /* relu6 */
#define NCHWC4_TRANS_FP32_LEAKY_RELU \
"cmhs v8.4s, v16.4s, v20.4s \n" /* vcgeq_u32 */ \
"cmhs v9.4s, v17.4s, v20.4s \n" /* vcgeq_u32 */ \
"cmhs v10.4s, v18.4s, v20.4s \n" /* vcgeq_u32 */ \
"cmhs v11.4s, v19.4s, v20.4s \n" /* vcgeq_u32 */ \
"fcmge v8.4s, v16.4s, v20.4s \n" /* vcgeq_f32 */ \
"fcmge v9.4s, v17.4s, v20.4s \n" /* vcgeq_f32 */ \
"fcmge v10.4s, v18.4s, v20.4s \n" /* vcgeq_f32 */ \
"fcmge v11.4s, v19.4s, v20.4s \n" /* vcgeq_f32 */ \
"fmul v4.4s, v16.4s, %[scale].4s \n" /* mul */ \
"fmul v5.4s, v17.4s, %[scale].4s \n" /* mul */ \
"fmul v6.4s, v18.4s, %[scale].4s \n" /* mul */ \
......@@ -1754,15 +1754,15 @@ inline bool write_to_output_c4_fp32(const float* din,
"fmin v13.4s, v13.4s, %[six].4s \n" /*relu6*/
#define NCHWC8_TRANS_FP32_LEAKY_RELU \
"cmhs v10.4s, v16.4s, v20.4s \n" /* vcgeq_u32 */ \
"cmhs v11.4s, v17.4s, v20.4s \n" /* vcgeq_u32 */ \
"cmhs v14.4s, v18.4s, v20.4s \n" /* vcgeq_u32 */ \
"cmhs v15.4s, v19.4s, v20.4s \n" /* vcgeq_u32 */ \
"fcmge v10.4s, v16.4s, v20.4s \n" /* vcgeq_u32 */ \
"fcmge v11.4s, v17.4s, v20.4s \n" /* vcgeq_u32 */ \
"fcmge v14.4s, v18.4s, v20.4s \n" /* vcgeq_u32 */ \
"fcmge v15.4s, v19.4s, v20.4s \n" /* vcgeq_u32 */ \
\
"cmhs v21.4s, v8.4s, v20.4s \n" /* vcgeq_u32 */ \
"cmhs v22.4s, v9.4s, v20.4s \n" /* vcgeq_u32 */ \
"cmhs v23.4s, v12.4s, v20.4s \n" /* vcgeq_u32 */ \
"cmhs v24.4s, v13.4s, v20.4s \n" /* vcgeq_u32 */ \
"fcmge v21.4s, v8.4s, v20.4s \n" /* vcgeq_u32 */ \
"fcmge v22.4s, v9.4s, v20.4s \n" /* vcgeq_u32 */ \
"fcmge v23.4s, v12.4s, v20.4s \n" /* vcgeq_u32 */ \
"fcmge v24.4s, v13.4s, v20.4s \n" /* vcgeq_u32 */ \
\
"fmul v25.4s, v16.4s, %[scale].4s \n" /* mul */ \
"fmul v26.4s, v17.4s, %[scale].4s \n" /* mul */ \
......@@ -2169,13 +2169,13 @@ inline void act_switch_c8_fp32(const float* din_ptr,
"fmin v2.4s, v2.4s, %[vsix].4s \n" /* vmaxq_f32() */ \
"fmin v3.4s, v3.4s, %[vsix].4s \n" /* vmaxq_f32() */
#define DO_LEAKY_RELU \
"cmhs v4.4s, v0.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v4.4s, v0.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fmul v5.4s, v0.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v6.4s, v1.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v6.4s, v1.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fmul v7.4s, v1.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v8.4s, v2.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v8.4s, v2.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fmul v9.4s, v2.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v10.4s, v3.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v10.4s, v3.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fmul v11.4s, v3.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"bif v0.16b, v5.16b, v4.16b \n" /* choose*/ \
"bif v1.16b, v7.16b, v6.16b \n" /* choose*/ \
......@@ -2217,7 +2217,7 @@ inline void act_switch_c8_fp32(const float* din_ptr,
"vbif q3, q8, q7 @ choose \n" \
"vbif q4, q10, q9 @ choose \n" \
"vbif q5, q12, q11 @ choose \n" \
"vbif q6, q13, q13 @ choose \n"
"vbif q6, q14, q13 @ choose \n"
#define DO_STORE \
"subs %[cnt], #1 \n" \
"vst1.32 {d6-d7}, [%[dout_ptr]]! @ vst1q_f32() \n" \
......
......@@ -123,20 +123,21 @@ void conv_depthwise_3x3s2_int8(Dtype* dout,
int padh,
ARMContext* ctx);
void conv_depthwise_5x5s1_fp32(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
void conv_depthwise_5x5s1_fp32(float* dout,
const float* din,
const float* weights,
const float* bias,
int pad,
bool flag_bias,
bool flag_relu,
int num,
int chin,
int hin,
int win,
int hout,
int wout,
int padw,
int padh,
const operators::ConvParam& param,
ARMContext* ctx);
void conv_depthwise_5x5s2_fp32(const float* din,
......
......@@ -188,7 +188,6 @@ void conv1x1s1_gemm(const float* i_data,
if (n > 1) {
weights_size_per_group = ((m_roundup * k + 15) / 16) * 16;
}
//! use gemv when the output channel size = 1
for (int b = 0; b < num; ++b) {
// dC
......@@ -210,8 +209,11 @@ void conv1x1s1_gemm(const float* i_data,
k,
flag_bias,
bias_group,
flag_relu,
ctx);
act_param.has_active,
act_param.active_type,
ctx,
act_param.Relu_clipped_coef,
act_param.Leaky_relu_alpha);
} else {
sgemm_prepack(false,
m,
......@@ -410,8 +412,11 @@ void conv_im2col_gemm(const float* i_data,
k,
flag_bias,
bias_group,
flag_relu,
ctx);
act_param.has_active,
act_param.active_type,
ctx,
act_param.Relu_clipped_coef,
act_param.Leaky_relu_alpha);
} else {
int ldb = n;
sgemm_prepack(false,
......@@ -677,7 +682,8 @@ void conv_depthwise_5x5_fp32(const void* din,
const float* scale) {
auto paddings = *param.paddings;
auto act_param = param.activation_param;
int pad = paddings[0];
int pad_h = paddings[0];
int pad_w = paddings[2];
int stride = param.strides[1];
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr;
......@@ -698,20 +704,21 @@ void conv_depthwise_5x5_fp32(const void* din,
act_param,
ctx);
} else if (stride == 1) {
conv_depthwise_5x5s1_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
ch_out,
h_out,
w_out,
ch_in,
h_in,
w_in,
conv_depthwise_5x5s1_fp32(reinterpret_cast<float*>(dout),
reinterpret_cast<const float*>(din),
reinterpret_cast<const float*>(weights),
bias,
pad,
flag_bias,
flag_relu,
num,
ch_in,
h_in,
w_in,
h_out,
w_out,
pad_w,
pad_h,
param,
ctx);
} else {
LOG(FATAL) << "unsupport this type 5x5 dw conv";
......
......@@ -137,13 +137,13 @@ void fill_bias_relu<int>(int* tensor,
"fmin v2.4s, v2.4s, %[vsix].4s \n" /* vmaxq_f32() */ \
"fmin v3.4s, v3.4s, %[vsix].4s \n" /* vmaxq_f32() */
#define FILL_LEAKY_RELU \
"cmhs v4.4s, v0.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v4.4s, v0.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fmul v5.4s, v0.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v6.4s, v1.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v6.4s, v1.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fmul v7.4s, v1.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v8.4s, v2.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v8.4s, v2.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fmul v9.4s, v2.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v10.4s, v3.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fcmge v10.4s, v3.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fmul v11.4s, v3.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"bif v0.16b, v5.16b, v4.16b \n" /* choose*/ \
"bif v1.16b, v7.16b, v6.16b \n" /* choose*/ \
......
此差异已折叠。
......@@ -17,23 +17,26 @@
#include <cmath>
#include "lite/core/context.h"
#include "lite/core/device_info.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
// TODO(xxx): fixme now only support transA = false
bool sgemv(const float* A,
const float* x,
float* y,
bool sgemv(const float *A,
const float *x,
float *y,
bool transA,
int M,
int N,
bool is_bias,
const float* bias,
bool is_relu,
const ARMContext* ctx);
const float *bias,
bool flag_act,
lite_api::ActivationType act,
const ARMContext *ctx,
float six = 6.f,
float alpha = 1.f);
} // namespace math
} // namespace arm
......
......@@ -72,7 +72,7 @@ inline CL_DTYPE activation(CL_DTYPE in
CL_DTYPE prelu_alpha
#endif
) {
CL_DTYPE output;
CL_DTYPE output = in;
#ifdef PRELU
output = select(prelu_alpha * in, in, in >= (CL_DTYPE)0);
#endif
......@@ -80,6 +80,10 @@ inline CL_DTYPE activation(CL_DTYPE in
#ifdef RELU
output = fmax(in, (CL_DTYPE)0);
#endif
#ifdef RELU6
output = clamp(in, (CL_DTYPE)0, (CL_DTYPE)6);
#endif
return output;
}
......@@ -89,7 +93,7 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in
CL_DTYPE4 prelu_alpha
#endif
) {
CL_DTYPE4 output;
CL_DTYPE4 output = in;
#ifdef PRELU
output = select(prelu_alpha * in, in, in >= (CL_DTYPE4)0.0);
#endif
......@@ -97,5 +101,9 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in
#ifdef RELU
output = fmax(in, (CL_DTYPE4)0);
#endif
#ifdef RELU6
output = clamp(in, (CL_DTYPE4)0, (CL_DTYPE4)6);
#endif
return output;
}
......@@ -95,9 +95,7 @@ __kernel void depth_conv2d(__private const int global_size_dim0,
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output = activation_type4(output);
#endif
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void relu6(__read_only image2d_t input,
__write_only image2d_t output,
__private const float threshold){
const int x = get_global_id(0);
const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in);
in = min((CL_DTYPE4)(threshold, threshold, threshold, threshold), in);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
......@@ -42,8 +42,6 @@ void ConvCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
int stride = param.strides[0];
int threads = ctx.threads();
bool pads_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
int chin = param.x->dims()[1];
int hin = param.x->dims()[2];
int win = param.x->dims()[3];
......@@ -51,28 +49,28 @@ void ConvCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
int hout = param.output->dims()[2];
int wout = param.output->dims()[3];
bool pads_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]);
bool kps_equal = (param.strides[0] == param.strides[1]) && (kw == kh);
bool ks_equal = (param.strides[0] == param.strides[1]) && (kw == kh);
bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1);
bool flag_dw_3x3 = (kw == 3 && kh == 3 && (stride == 1 || stride == 2));
bool flag_dw_5x5 = (paddings[0] == paddings[2]) &&
((kw == 5 && stride == 1) || (kw == 5 && stride == 2));
bool flag_dw_3x3 = (kw == 3) && (kh == 3) && (stride == 1 || stride == 2);
bool flag_dw_5x5 = (kw == 5) && (kh == 5) && (stride == 1 || stride == 2);
bool flag_dw = flag_dw_3x3 || flag_dw_5x5;
/// select conv impl
if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) {
/// dw conv impl
if (param.groups == ic && ic == oc && ks_equal && no_dilation && flag_dw) {
impl_ = new DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>;
// VLOG(3) << "invoking dw conv";
} else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal &&
} else if (param.groups == 1 && kw == 3 && stride == 1 && ks_equal &&
no_dilation && pads_all_equal) {
/// winograd conv impl
// TODO(MyPandaShaoxiang): winograd conv support any pad
impl_ = new WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>;
// VLOG(3) << "invoking winograd conv";
} else if (param.groups == 1 && kw == 3 && stride == 2 &&
chin * chout < 4 * hin * win && kps_equal && no_dilation) {
/// direct conv impl
chin * chout < 4 * hin * win && ks_equal && no_dilation) {
impl_ = new DirectConv<PRECISION(kFloat), PRECISION(kFloat)>;
// VLOG(3) << "invoking direct conv";
} else {
......@@ -109,7 +107,7 @@ void ConvCompute<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh);
bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1);
bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2));
bool flag_dw_5x5 = pads_all_equal && (kw == 5 && sw == 1);
bool flag_dw_5x5 = pads_all_equal && (kw == 5 && kh == 5 && sw == 1);
bool flag_dw = flag_dw_3x3 || flag_dw_5x5;
if (param.groups == ic && ic == oc && kps_equal && pads_equal &&
......@@ -154,7 +152,7 @@ void ConvCompute<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh);
bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1);
bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2));
bool flag_dw_5x5 = pads_all_equal && (kw == 5 && sw == 1);
bool flag_dw_5x5 = pads_all_equal && (kw == 5 && kh == 5 && sw == 1);
bool flag_dw = flag_dw_3x3 || flag_dw_5x5;
if (param.groups == ic && ic == oc && kps_equal && pads_equal &&
......
......@@ -52,7 +52,10 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
impl_ = lite::arm::math::conv_depthwise_3x3_fp32;
} else if (kw == 5) {
// VLOG(5) << "invoke 5x5 dw conv fp32";
if (param.strides[0] == 2) { // conv5x5s2_dw
auto strides = param.strides;
if ((strides[0] == 1 && strides[1] == 1) ||
(strides[0] == 2 && strides[1] == 2)) {
// trans weights
constexpr int cblock = 4;
auto oc = w_dims[0];
auto kh = w_dims[2];
......@@ -63,10 +66,11 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
lite::arm::math::conv_trans_weights_numc(
w_data_in, w_data, oc, 1, cblock, kh * kw);
flag_trans_weights_ = true;
impl_ = lite::arm::math::conv_depthwise_5x5_fp32;
} else {
flag_trans_weights_ = false;
LOG(FATAL)
<< "5x5 depthwise conv only support stride == 1 or stride == 2";
}
impl_ = lite::arm::math::conv_depthwise_5x5_fp32;
} else {
LOG(FATAL) << "this type dw conv not impl";
}
......
......@@ -93,9 +93,11 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
if (flag_trans_bias_) {
b_data = bias_.data<float>();
}
bool flag_relu = false;
bool flag_act = false;
lite_api::ActivationType act;
if (param.activation_type == "relu") {
flag_relu = true;
act = lite_api::ActivationType::kRelu;
flag_act = true;
}
if (flag_gemm_) {
operators::ActivationParam act_param;
......@@ -119,7 +121,7 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
&ctx);
if (param.bias) {
CHECK_EQ(param.bias->numel(), n_);
lite::arm::math::fill_bias_fc(o_data, b_data, m_, n_, flag_relu);
lite::arm::math::fill_bias_fc(o_data, b_data, m_, n_, flag_act);
}
} else {
for (int i = 0; i < m_; ++i) {
......@@ -133,7 +135,8 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
k_,
param.bias != nullptr,
b_data,
flag_relu,
flag_act,
act,
&ctx);
}
}
......
......@@ -233,8 +233,17 @@ void MatMulCompute::Run() {
int ldb = n_;
int ldc = n_;
if (n_ == 1) {
lite::arm::math::sgemv(
x_data, y_data, o_data, false, m_, k_, false, nullptr, false, &ctx);
lite::arm::math::sgemv(x_data,
y_data,
o_data,
false,
m_,
k_,
false,
nullptr,
false,
lite_api::ActivationType::kIndentity,
&ctx);
if (fabsf(alpha - 1.f) > 1e-8f) {
for (size_t i = 0; i < param.Out->dims().production(); ++i) {
o_data[i] *= alpha;
......
......@@ -50,8 +50,17 @@ void MulCompute::Run() {
k_ = x_w;
auto& ctx = this->ctx_->template As<ARMContext>();
if (n_ == 1) {
lite::arm::math::sgemv(
x_data, y_data, o_data, false, m_, k_, false, nullptr, false, &ctx);
lite::arm::math::sgemv(x_data,
y_data,
o_data,
false,
m_,
k_,
false,
nullptr,
false,
lite_api::ActivationType::kIndentity,
&ctx);
} else {
constexpr bool is_tranposed_y = false;
......
......@@ -70,9 +70,12 @@ void ConvCompute::PrepareForRun() {
kernel_func_names_.push_back("gemm_batch");
kernel_func_paths_.push_back("buffer/fc_kernel.cl");
if (relu_fused) {
build_options_.push_back("-DCL_DTYPE=float -DRELU");
build_options_.push_back("-DCL_DTYPE_float -DRELU");
} else if (param.activation_param.active_type ==
lite_api::ActivationType::kRelu6) {
build_options_.push_back("-DCL_DTYPE_float -DRELU6");
} else {
build_options_.push_back("-DCL_DTYPE=float");
build_options_.push_back("-DCL_DTYPE_float");
}
impl_ = &ConvCompute::Conv2d1x1;
} else if (pad_equal) {
......@@ -80,11 +83,14 @@ void ConvCompute::PrepareForRun() {
kernel_func_names_.push_back("gemm_batch");
kernel_func_paths_.push_back("buffer/im2col_kernel.cl");
kernel_func_paths_.push_back("buffer/fc_kernel.cl");
build_options_.push_back("-DCL_DTYPE=float");
build_options_.push_back("-DCL_DTYPE_float");
if (relu_fused) {
build_options_.push_back("-DCL_DTYPE=float -DRELU");
build_options_.push_back("-DCL_DTYPE_float -DRELU");
} else if (param.activation_param.active_type ==
lite_api::ActivationType::kRelu6) {
build_options_.push_back("-DCL_DTYPE_float -DRELU6");
} else {
build_options_.push_back("-DCL_DTYPE=float");
build_options_.push_back("-DCL_DTYPE_float");
}
impl_ = &ConvCompute::GemmlikeConv2d;
col_buffer_.reset(new lite::Tensor);
......
......@@ -46,7 +46,7 @@ static void conv_basic(const Dtype1* din,
int pad_w,
int pad_h,
bool flag_bias,
bool flag_relu) {
std::string flag_relu) {
Dtype2 beta = 0;
auto src_data = din;
auto dst_data_ref = dout;
......@@ -96,10 +96,15 @@ static void conv_basic(const Dtype1* din,
}
}
}
if (flag_relu) {
if (flag_relu == "relu") {
dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0
? dst_data_ref[out_idx]
: (Dtype2)0;
} else if (flag_relu == "relu6") {
auto dst_tmp = (dst_data_ref[out_idx] > (Dtype2)0)
? dst_data_ref[out_idx]
: (Dtype2)0;
dst_data_ref[out_idx] = (dst_tmp < 6.f) ? dst_tmp : 6.f;
}
}
}
......@@ -186,7 +191,7 @@ TEST(conv2d, compute_conv2d_1x1) {
/*int iw = ih;*/ for (int iw = 1; iw < 10; iw += 1) { // iw
for (int ic = 1; ic < 10; ic += 1) { // k
for (bool bias_flag : {true /*, false*/}) {
for (bool relu_flag : {true /*, false*/}) {
for (std::string relu_flag : {"relu" /*, "relu6", "None"*/}) {
#else
// groups:1 stride_h:1 stride_w:1 pad_h:0 pad_w:0 kernel_h:1 kernel_h:1
// x_dims:1 32 112 112
......@@ -229,7 +234,16 @@ TEST(conv2d, compute_conv2d_1x1) {
std::vector<int> paddings = {pad, pad, pad, pad};
param.groups = group;
std::vector<int> dilations = {dilation, dilation};
param.fuse_relu = relu_flag;
if (relu_flag == "relu") {
param.fuse_relu = true;
} else if (relu_flag == "None") {
param.fuse_relu = false;
} else if (relu_flag == "relu6") {
param.activation_param.Relu_clipped_coef = 6.f;
param.activation_param.has_active = true;
param.activation_param.active_type =
lite_api::ActivationType::kRelu6;
}
param.paddings = std::make_shared<std::vector<int>>(paddings);
param.dilations = std::make_shared<std::vector<int>>(dilations);
......@@ -390,7 +404,7 @@ TEST(conv2d, compute_conv2d_1x1) {
#undef PRINT_RESULT
// #define PRINT_RESULT
#define LOOP_TEST
// #define LOOP_TEST
TEST(conv2d, compute_conv2d_gemm) {
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
......@@ -411,7 +425,7 @@ TEST(conv2d, compute_conv2d_gemm) {
for (int iw = 1; iw < 10; iw += 1) { // iw
for (int ic = 1; ic < 10; ic += 1) { // k
for (bool bias_flag : {true, false}) {
for (bool relu_flag : {true, false}) {
for (std::string relu_flag : {"relu", "relu6", "None"}) {
#else
const int batch_size = 8;
......@@ -420,7 +434,8 @@ TEST(conv2d, compute_conv2d_gemm) {
const int iw = 224;
const int ic = 3;
const bool bias_flag = true;
const bool relu_flag = true;
const std::string relu_flag =
"relu6"; // "relu", "relu6", "None"
#endif
const int oh = (ih + 2 * pad - ksize) / stride + 1;
......@@ -458,7 +473,16 @@ TEST(conv2d, compute_conv2d_gemm) {
std::vector<int> paddings = {pad, pad, pad, pad};
param.groups = group;
std::vector<int> dilations = {dilation, dilation};
param.fuse_relu = relu_flag;
if (relu_flag == "relu") {
param.fuse_relu = true;
} else if (relu_flag == "None") {
param.fuse_relu = false;
} else if (relu_flag == "relu6") {
param.activation_param.Relu_clipped_coef = 6.f;
param.activation_param.has_active = true;
param.activation_param.active_type =
lite_api::ActivationType::kRelu6;
}
param.paddings = std::make_shared<std::vector<int>>(paddings);
param.dilations = std::make_shared<std::vector<int>>(dilations);
......
......@@ -39,6 +39,9 @@ class DepthwiseConv2dCompute
const auto& param = *param_.get_mutable<param_t>();
if (param.fuse_relu) {
build_options_ += " -DRELU";
} else if (param.activation_param.active_type ==
lite_api::ActivationType::kRelu6) {
build_options_ += " -DRELU6";
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
......@@ -116,7 +119,7 @@ class DepthwiseConv2dCompute
private:
std::string kernel_func_name_{"depthwise_conv2d"};
std::string build_options_{"-DCL_DTYPE=float"};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......@@ -135,6 +138,9 @@ class DepthwiseConv2dComputeFP16Image
const auto& param = *param_.get_mutable<param_t>();
if (param.fuse_relu) {
build_options_ += " -DRELU";
} else if (param.activation_param.active_type ==
lite_api::ActivationType::kRelu6) {
build_options_ += " -DRELU6";
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
......@@ -252,6 +258,9 @@ class DepthwiseConv2d3x3s1ComputeFP16Image
const auto& param = *param_.get_mutable<param_t>();
if (param.fuse_relu) {
build_options_ += " -DRELU";
} else if (param.activation_param.active_type ==
lite_api::ActivationType::kRelu6) {
build_options_ += " -DRELU6";
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
......@@ -360,6 +369,9 @@ class DepthwiseConv2dBasicComputeFP32Image
has_bias && param.output->dims() == param.bias->dims();
if (param.fuse_relu) {
build_options_ += " -DRELU";
} else if (param.activation_param.active_type ==
lite_api::ActivationType::kRelu6) {
build_options_ += " -DRELU6";
}
if (has_bias) {
build_options_ += is_element_wise_bias ? " -DBIASE_ELE" : " -DBIASE_CH";
......
......@@ -220,12 +220,158 @@ class ReluComputeFP16ImageDefault
std::shared_ptr<cl::Event> event_{new cl::Event};
};
// Relu6 activation on cl::Image2D storage, FP32 precision:
//   out = min(max(x, 0), threshold)
// The clip threshold is taken from ActivationParam::Relu_clipped_coef
// (6.0 for canonical relu6).
class Relu6ComputeFloatImageDefault
    : public KernelLite<TARGET(kOpenCL),
                        PRECISION(kFloat),
                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override {
    return "Relu6 using cl::Image2D(ImageDefault/RGBA), kFloat";
  }

  // Compiles and caches the "relu6" kernel from image/relu6_kernel.cl with
  // the FP32 build options.
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/relu6_kernel.cl", build_options_);
  }

  // Launches the relu6 kernel over the whole input image, one work-item per
  // image texel, then blocks until the queue drains.
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.X->dims();
    auto* x_buf = param.X->data<float, cl::Image2D>();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_buf = param.Out->mutable_data<float, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.Out->dims();  // useless: check dim only
    auto threshold = param.Relu_clipped_coef;
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    // Kernel is cached under name + build options (see PrepareForRun).
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
    // Kernel argument order: input image, output image, clip threshold.
    int arg_idx = 0;
    cl_int status = kernel.setArg(arg_idx, *x_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *out_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, threshold);
    CL_CHECK_FATAL(status);
    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());
    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
            << image_shape["height"];
    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
    VLOG(4) << "threshold:" << threshold;
    // 2D NDRange covering every texel of the output image.
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                    static_cast<cl::size_type>(image_shape["height"])};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
    // context.cl_wait_list()->emplace(out_buf, event_);
    // Synchronous finish instead of event-based waiting (see TODO above).
    context.cl_context()->GetCommandQueue().finish();
  }

 private:
  std::string kernel_func_name_{"relu6"};
  std::string build_options_{"-DCL_DTYPE_float -DRELU6"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
// Relu6 activation on cl::Image2D storage, FP16 precision:
//   out = min(max(x, 0), threshold)
// Half-precision image data is addressed through int16_t host pointers.
// The clip threshold comes from ActivationParam::Relu_clipped_coef and is
// passed to the kernel as a float.
class Relu6ComputeFP16ImageDefault
    : public KernelLite<TARGET(kOpenCL),
                        PRECISION(kFP16),
                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override {
    return "Relu6 using cl::Image2D(ImageDefault/RGBA), kFP16";
  }

  // Compiles and caches the "relu6" kernel from image/relu6_kernel.cl with
  // the half-precision build options.
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/relu6_kernel.cl", build_options_);
  }

  // Launches the relu6 kernel over the whole input image, one work-item per
  // image texel, then blocks until the queue drains.
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.X->dims();
    // int16_t is the host-side stand-in for cl_half storage.
    auto* x_buf = param.X->data<int16_t, cl::Image2D>();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_buf = param.Out->mutable_data<int16_t, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.Out->dims();  // useless: check dim only
    auto threshold = param.Relu_clipped_coef;
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    // Kernel is cached under name + build options (see PrepareForRun).
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
    // Kernel argument order: input image, output image, clip threshold.
    int arg_idx = 0;
    cl_int status = kernel.setArg(arg_idx, *x_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *out_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, threshold);
    CL_CHECK_FATAL(status);
    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());
    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
            << image_shape["height"];
    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
    VLOG(4) << "threshold:" << threshold;
    // 2D NDRange covering every texel of the output image.
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                    static_cast<cl::size_type>(image_shape["height"])};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
    // context.cl_wait_list()->emplace(out_buf, event_);
    // Synchronous finish instead of event-based waiting (see TODO above).
    context.cl_context()->GetCommandQueue().finish();
  }

 private:
  std::string kernel_func_name_{"relu6"};
  std::string build_options_{"-DCL_DTYPE_half -DRELU6"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// REGISTER_LITE_KERNEL(relu,
// REGISTER_LITE_KERNEL(relu,
// kOpenCL,
// kFloat,
// kNCHW,
......@@ -267,3 +413,38 @@ REGISTER_LITE_KERNEL(relu,
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// Relu6
// FP32 image2d variant.
REGISTER_LITE_KERNEL(
    relu6,
    kOpenCL,
    kFloat,
    kImageDefault,
    paddle::lite::kernels::opencl::Relu6ComputeFloatImageDefault,
    ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();

// FP16 image2d variant.
REGISTER_LITE_KERNEL(
    relu6,
    kOpenCL,
    kFP16,
    kImageDefault,
    paddle::lite::kernels::opencl::Relu6ComputeFP16ImageDefault,
    ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
......@@ -23,9 +23,21 @@ namespace paddle {
namespace lite {
template <typename dtype>
void relu_compute_ref(const dtype *x_data, const DDim &x_dim, dtype *out_data) {
void relu_compute_ref(const dtype *x_data,
const DDim &x_dim,
dtype *out_data,
float threshold = 0.f) {
if (abs(threshold) < 1e-5) {
// relu
for (int i = 0; i < x_dim.production(); ++i) {
out_data[i] = x_data[i] > 0.f ? x_data[i] : 0.f;
out_data[i] = (x_data[i] > threshold) ? x_data[i] : threshold;
}
} else {
// relu6 or relu with threshold
for (int i = 0; i < x_dim.production(); ++i) {
auto out_tmp = (x_data[i] > 0) ? x_data[i] : 0;
out_data[i] = (out_tmp < threshold) ? out_tmp : threshold;
}
}
}
......@@ -252,7 +264,7 @@ TEST(relu_image2d_fp16, compute) {
"layout(img2buf) "
"-> host";
#ifdef LOOP_TEST
#ifdef RELU_FP16_LOOP_TEST
for (int n = 1; n <= 100; n += 33) {
for (auto c : {1, 3}) {
for (int h = 12; h <= 100; h += 13) {
......@@ -262,7 +274,7 @@ TEST(relu_image2d_fp16, compute) {
const int c = 2;
const int h = 3;
const int w = 4;
#endif // LOOP_TEST
#endif // RELU_FP16_LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
<< h << " " << w << " ========";
......@@ -367,13 +379,13 @@ TEST(relu_image2d_fp16, compute) {
// compute ref cpu
relu_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result
#ifdef PRINT_RESULT
#ifdef RELU_FP16_PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl;
}
#endif // PRINT_RESULT
#endif // RELU_FP16_PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) {
......@@ -391,7 +403,321 @@ TEST(relu_image2d_fp16, compute) {
LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data, mapped_x);
TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef LOOP_TEST
#ifdef RELU_FP16_LOOP_TEST
} // w
} // h
} // c
} // n
#else
// nothing to do.
#endif
}
// Uncomment to sweep a grid of shapes / dump per-element results.
// #define RELU6_FP32_LOOP_TEST
// #define RELU6_FP32_PRINT_RESULT

// End-to-end check of the FP32 image2d relu6 kernel against
// relu_compute_ref (threshold = 6):
// host buffer -> layout(buf2img) -> relu6(img) -> layout(img2buf) -> host.
TEST(relu6_image2d_fp32, compute) {
  LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu6(img) -> "
               "layout(img2buf) "
               "-> host";

#ifdef RELU6_FP32_LOOP_TEST
  for (int n = 1; n <= 100; n += 33) {
    for (auto c : {1, 3}) {
      for (int h = 12; h <= 100; h += 13) {
        for (int w = 12; w <= 100; w += 25) {
#else
  const int n = 1;
  const int c = 2;
  const int h = 3;
  const int w = 4;
#endif  // RELU6_FP32_LOOP_TEST
          LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
                    << h << " " << w << " ========";
          // set layout kernels
          auto buf_to_img_kernels =
              KernelRegistry::Global().Create("layout",
                                              TARGET(kOpenCL),
                                              PRECISION(kAny),
                                              DATALAYOUT(kImageDefault));
          auto img_to_buf_kernels = KernelRegistry::Global().Create(
              "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
          auto relu_img_kernels =
              KernelRegistry::Global().Create("relu6",
                                              TARGET(kOpenCL),
                                              PRECISION(kFloat),
                                              DATALAYOUT(kImageDefault));
          ASSERT_FALSE(buf_to_img_kernels.empty());
          // Fixed: the original checked buf_to_img_kernels twice and never
          // verified img_to_buf_kernels.
          ASSERT_FALSE(img_to_buf_kernels.empty());
          ASSERT_FALSE(relu_img_kernels.empty());

          auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
          auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
          auto relu_img_kernel = std::move(relu_img_kernels.front());
          LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
          LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
          LOG(INFO) << "get 3rd kernel: " << relu_img_kernel->doc();

          // set tensors about op param
          LOG(INFO) << "set tensors about op param";
          // layout(buf->img): x -> relu_in
          // relu6(img): relu_in -> relu_out
          // layout(img->buf): relu_out -> y
          lite::Tensor x, y, relu_in, relu_out, y_ref;
          operators::LayoutParam BufferToImageParam;
          operators::LayoutParam ImageToBufferParam;
          BufferToImageParam.x = &x;
          BufferToImageParam.y = &relu_in;
          ImageToBufferParam.x = &relu_out;
          ImageToBufferParam.y = &y;
          operators::ActivationParam ReluParam;
          ReluParam.X = &relu_in;
          ReluParam.Out = &relu_out;
          ReluParam.Relu_clipped_coef = 6.f;  // canonical relu6 clip value

          const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
          x.Resize(x_dim);
          y.Resize(x_dim);
          relu_in.Resize(x_dim);
          relu_out.Resize(x_dim);
          y_ref.Resize(x_dim);
          auto relu_image2d_shape =
              paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);

          // initialize tensors
          LOG(INFO) << "initialize tensors";
          auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
          auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
          auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
          auto *mapped_x = static_cast<float *>(TargetWrapperCL::Map(
              x_data, 0, sizeof(float) * x_dim.production()));
          auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
              y_data, 0, sizeof(float) * x_dim.production()));
          // Inputs straddle zero so the pass-through, zero-clamp and
          // 6-clip paths are all exercised.
          for (int i = 0; i < x_dim.production(); ++i) {
            mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2;
            mapped_y[i] = static_cast<int>(0);
          }
          // Allocate the image2d backing stores; the returned pointers were
          // unused, so they are no longer captured.
          relu_in.mutable_data<float, cl::Image2D>(
              relu_image2d_shape["width"], relu_image2d_shape["height"]);
          relu_out.mutable_data<float, cl::Image2D>(
              relu_image2d_shape["width"], relu_image2d_shape["height"]);

          // set context and kernel args
          LOG(INFO) << "set context and kernel args";
          std::unique_ptr<KernelContext> context(new KernelContext);
          context->As<OpenCLContext>().InitOnce();

          buf_to_img_kernel->SetParam(BufferToImageParam);
          std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
          context->As<OpenCLContext>().CopySharedTo(
              &(buf_to_img_context->As<OpenCLContext>()));
          buf_to_img_kernel->SetContext(std::move(buf_to_img_context));

          img_to_buf_kernel->SetParam(ImageToBufferParam);
          std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
          context->As<OpenCLContext>().CopySharedTo(
              &(img_to_buf_context->As<OpenCLContext>()));
          img_to_buf_kernel->SetContext(std::move(img_to_buf_context));

          relu_img_kernel->SetParam(ReluParam);
          std::unique_ptr<KernelContext> relu_img_context(new KernelContext);
          context->As<OpenCLContext>().CopySharedTo(
              &(relu_img_context->As<OpenCLContext>()));
          relu_img_kernel->SetContext(std::move(relu_img_context));

          // run kernels
          LOG(INFO) << "run kernel: buf_to_img_kernel";
          buf_to_img_kernel->Launch();
          LOG(INFO) << "run kernel: relu_img_kernel";
          relu_img_kernel->Launch();
          LOG(INFO) << "run kernel: img_to_buf_kernel";
          img_to_buf_kernel->Launch();

          // compute ref cpu
          relu_compute_ref<float>(mapped_x, x_dim, y_data_ref, 6.f);
// result
#ifdef RELU6_FP32_PRINT_RESULT
          LOG(INFO) << "---- print kernel result (input -> output) ----";
          for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
            std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
                      << std::endl;
          }
#endif  // RELU6_FP32_PRINT_RESULT

          // check result: compare kernel output and cpu output(y_data_ref)
          for (int eidx = 0; eidx < x_dim.production(); eidx++) {
            EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
            // Explicit range check instead of abs(): calling abs() on a
            // float can resolve to the int overload and truncate the diff.
            const float diff = y_data_ref[eidx] - mapped_y[eidx];
            if (diff > 1e-6 || diff < -1e-6) {
              LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
                        << " / " << x_dim.production() << ", y_data_ref["
                        << eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
                        << eidx << "]:" << mapped_y[eidx];
              break;
            }
          }

          // free
          LOG(INFO) << "free: unmap x, y";
          TargetWrapperCL::Unmap(x_data, mapped_x);
          TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef RELU6_FP32_LOOP_TEST
        }  // w
      }    // h
    }      // c
  }        // n
#else
// nothing to do.
#endif
}
// #define RELU6_FP16_LOOP_TEST
// #define RELU6_FP16_PRINT_RESULT
TEST(relu6_image2d_fp16, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu6(img) -> "
"layout(img2buf) "
"-> host";
#ifdef RELU6_FP16_LOOP_TEST
for (int n = 1; n <= 100; n += 33) {
for (auto c : {1, 3}) {
for (int h = 12; h <= 100; h += 13) {
for (int w = 12; w <= 100; w += 25) {
#else
const int n = 1;
const int c = 2;
const int h = 3;
const int w = 4;
#endif // RELU6_FP16_LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
<< h << " " << w << " ========";
// set layout kernels
auto buf_to_img_kernels =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault));
auto img_to_buf_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
auto relu_img_kernels =
KernelRegistry::Global().Create("relu6",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(relu_img_kernels.empty());
auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
auto relu_img_kernel = std::move(relu_img_kernels.front());
LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
LOG(INFO) << "get 3rd kernel: " << relu_img_kernel->doc();
// set tensors about op param
LOG(INFO) << "set tensors about op param";
// layout(buf->img): x -> relu_in
// relu(img): relu_in -> relu_out
// layout(img->buf): relu_out -> y
lite::Tensor x, y, relu_in, relu_out, y_ref;
operators::LayoutParam BufferToImageParam;
operators::LayoutParam ImageToBufferParam;
BufferToImageParam.x = &x;
BufferToImageParam.y = &relu_in;
ImageToBufferParam.x = &relu_out;
ImageToBufferParam.y = &y;
operators::ActivationParam ReluParam;
ReluParam.X = &relu_in;
ReluParam.Out = &relu_out;
ReluParam.Relu_clipped_coef = 6.f;
const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
x.Resize(x_dim);
y.Resize(x_dim);
relu_in.Resize(x_dim);
relu_out.Resize(x_dim);
y_ref.Resize(x_dim);
auto relu_image2d_shape =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);
// initialize tensors
LOG(INFO) << "initialize tensors";
auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
auto *mapped_x = static_cast<float *>(TargetWrapperCL::Map(
x_data, 0, sizeof(float) * x_dim.production()));
auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2;
mapped_y[i] = static_cast<int>(0);
}
auto *relu_in_data = relu_in.mutable_data<int16_t, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]);
auto *relu_out_data = relu_out.mutable_data<int16_t, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]);
// set context and kernel args
LOG(INFO) << "set context and kernel args";
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
buf_to_img_kernel->SetParam(BufferToImageParam);
std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context->As<OpenCLContext>()));
buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
img_to_buf_kernel->SetParam(ImageToBufferParam);
std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(img_to_buf_context->As<OpenCLContext>()));
img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
relu_img_kernel->SetParam(ReluParam);
std::unique_ptr<KernelContext> relu_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(relu_img_context->As<OpenCLContext>()));
relu_img_kernel->SetContext(std::move(relu_img_context));
// run kernels
LOG(INFO) << "run kernel: buf_to_img_kernel";
buf_to_img_kernel->Launch();
LOG(INFO) << "run kernel: relu_img_kernel";
relu_img_kernel->Launch();
LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch();
// compute ref cpu
relu_compute_ref<float>(mapped_x, x_dim, y_data_ref, 6.f);
// result
#ifdef RELU6_FP16_PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl;
}
#endif // RELU6_FP16_PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", y_data_ref["
<< eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
<< eidx << "]:" << mapped_y[eidx];
break;
}
}
// free
LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data, mapped_x);
TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef RELU6_FP16_LOOP_TEST
} // w
} // h
} // c
......@@ -414,3 +740,7 @@ USE_LITE_KERNEL(relu, kOpenCL, kFloat, kImageDefault, ImageDefault);
// relu image2d fp16
USE_LITE_KERNEL(relu, kOpenCL, kFP16, kImageDefault, ImageDefault);
// relu6 image2d: fp32 and fp16 variants
USE_LITE_KERNEL(relu6, kOpenCL, kFloat, kImageDefault, ImageDefault);
USE_LITE_KERNEL(relu6, kOpenCL, kFP16, kImageDefault, ImageDefault);
......@@ -130,7 +130,7 @@ class FcOPTest : public arena::TestCase {
1.f,
0.f,
true,
flag_bias,
static_cast<int>(flag_bias),
false);
} else {
basic_gemm(false,
......
......@@ -46,14 +46,19 @@ DEFINE_int32(out_channel, 32, "output channel");
DEFINE_int32(group, 1, "group");
DEFINE_int32(kernel_h, 3, "kernel height");
DEFINE_int32(kernel_w, 3, "kernel width");
DEFINE_int32(pad_h, 1, "pad height");
DEFINE_int32(pad_w, 1, "pad width");
DEFINE_int32(pad_h0, 1, "pad top");
DEFINE_int32(pad_h1, 1, "pad bottom");
DEFINE_int32(pad_w0, 1, "pad left");
DEFINE_int32(pad_w1, 1, "pad right");
DEFINE_int32(stride_h, 1, "stride height");
DEFINE_int32(stride_w, 1, "stride width");
DEFINE_int32(dila_h, 1, "dilation height");
DEFINE_int32(dila_w, 1, "dilation width");
DEFINE_bool(flag_relu, true, "do relu");
DEFINE_int32(flag_act,
0,
"do activation"); // 0-no act, 1-relu, 2-relu6, 4-leakyrelu
DEFINE_double(leakey_relu_alpha, 1.0, "leakey relu alpha");
DEFINE_bool(flag_bias, true, "with bias");
typedef paddle::lite::DDim DDim;
......@@ -98,9 +103,10 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
const std::vector<int>& pads,
const std::vector<int>& dilas,
bool flag_bias,
bool flag_relu,
int flag_act,
const std::vector<int>& thread_num,
const std::vector<int>& power_mode) {
const std::vector<int>& power_mode,
const float leakey_relu_scale) {
#ifdef LITE_WITH_ARM
paddle::lite::DeviceInfo::Init();
#endif
......@@ -118,13 +124,20 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
param.strides = strides;
param.paddings = std::make_shared<std::vector<int>>(pads);
param.dilations = std::make_shared<std::vector<int>>(dilas);
param.fuse_relu = flag_relu;
param.groups = group;
if (flag_relu) {
const float six = 6.f;
if (flag_act > 0) {
ActivationParam act_param;
act_param.has_active = true;
act_param.active_type =
(paddle::lite_api::ActivationType)1; // 2-relu6 4-leakyrelu
act_param.active_type = (paddle::lite_api::ActivationType)
flag_act; // 1-relu, 2-relu6, 4-leakyrelu
if (flag_act == 1) {
param.fuse_relu = true;
} else if (flag_act == 2) {
act_param.Relu_clipped_coef = six;
} else if (flag_act == 4) {
act_param.Leaky_relu_alpha = leakey_relu_scale;
}
param.activation_param = act_param;
}
......@@ -205,7 +218,9 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
pads[2],
pads[0],
flag_bias,
flag_relu);
flag_act,
six,
leakey_relu_scale);
}
/// warm up
for (int i = 0; i < FLAGS_warmup; ++i) {
......@@ -254,22 +269,20 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
<< ", dila_: " << dilas[0] << ", " << dilas[1]
<< ", group: " << group
<< ", bias: " << (flag_bias ? "true" : "false")
<< ", relu: " << (flag_relu ? "true" : "false")
<< ", threads: " << th << ", power_mode: " << cls
<< " failed!!\n";
<< ", act: " << flag_act << ", threads: " << th
<< ", power_mode: " << cls << " failed!!\n";
}
}
}
LOG(INFO) << "test fp32 conv: input: " << dim_in
<< ", output: " << dim_out << ", weight dim: " << weight_dim
<< ", pad: " << pads[0] << ", " << pads[1]
<< ", stride: " << strides[0] << ", " << strides[1]
<< ", dila_: " << dilas[0] << ", " << dilas[1]
<< ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2]
<< ", " << pads[3] << ", stride: " << strides[0] << ", "
<< strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1]
<< ", group: " << group
<< ", bias: " << (flag_bias ? "true" : "false")
<< ", relu: " << (flag_relu ? "true" : "false")
<< ", threads: " << th << ", power_mode: " << cls
<< " successed!!\n";
<< ", act: " << flag_act << ", threads: " << th
<< ", power_mode: " << cls << " successed!!\n";
}
}
}
......@@ -287,12 +300,14 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
const std::vector<int>& pads,
const std::vector<int>& dilas,
bool flag_bias,
bool flag_relu,
int flag_act,
const std::vector<int>& thread_num,
const std::vector<int>& power_mode) {}
const std::vector<int>& power_mode,
const float leakey_relu_scale) {}
#endif // LITE_WITH_ARM
#if 1 /// 3x3dw
// TODO(chenjiaoAngel): fix me, diff: 3x3 depthwise conv
#if 0 /// 3x3dw
TEST(TestConv3x3DW, test_conv3x3_depthwise) {
if (FLAGS_basic_test) {
for (auto& stride : {1, 2}) {
......@@ -301,7 +316,7 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
for (auto& pad_top : {0, 1, 2}) {
for (auto& pad_bottom : {0, 1, 2}) {
for (auto& flag_bias : {false, true}) {
for (auto& flag_relu : {false, true}) {
for (auto& flag_act : {0, 1, 2, 4}) {
for (auto& c : {1, 3, 5, 8, 16, 32}) {
std::vector<DDim> dims;
DDim weights_dim({c, 1, 3, 3});
......@@ -310,6 +325,7 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
dims.push_back(DDim({batch, c, h, h}));
}
}
const float leakey_relu_scale = 8.88;
test_conv_fp32(dims,
weights_dim,
c,
......@@ -317,9 +333,10 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
{pad_top, pad_bottom, pad_left, pad_right},
{1, 1},
flag_bias,
flag_relu,
flag_act,
{1, 2, 4},
{FLAGS_power_mode});
{FLAGS_power_mode},
leakey_relu_scale);
}
}
}
......@@ -335,28 +352,41 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
#if 1 /// 5x5dw
TEST(TestConv5x5DW, test_conv5x5_depthwise) {
if (FLAGS_basic_test) {
#ifdef __aarch64__
// TODO(chenjiaoAngel): fix me, diff: arm64 5x5s2 depthwise conv
for (auto& stride : {1}) {
#else
for (auto& stride : {1, 2}) {
for (auto& pad : {0, 1, 2}) {
#endif
for (auto& pad_left : {0, 1, 2}) {
for (auto& pad_right : {0, 1, 2}) {
for (auto& pad_top : {0, 1, 2}) {
for (auto& pad_bottom : {0, 1, 2}) {
for (auto& flag_bias : {false, true}) {
for (auto& flag_relu : {false, true}) {
for (auto& c : {1, 3, 5, 8, 16, 32}) {
for (auto& flag_act : {0, 1, 2, 4}) {
for (auto& c : {1, 15, 32}) {
std::vector<DDim> dims;
DDim weights_dim({c, 1, 5, 5});
for (auto& batch : {1, 2}) {
for (auto& h : {1, 3, 15, 19, 28, 32, 75}) {
for (auto& h : {1, 3, 15, 56}) {
dims.push_back(DDim({batch, c, h, h}));
}
}
const float leakey_relu_scale = 8.88;
test_conv_fp32(dims,
weights_dim,
c,
{stride, stride},
{pad, pad, pad, pad},
{pad_left, pad_right, pad_top, pad_bottom},
{1, 1},
flag_bias,
flag_relu,
{1, 2, 4},
{FLAGS_power_mode});
flag_act,
{4},
{FLAGS_power_mode},
leakey_relu_scale);
}
}
}
}
}
}
......@@ -373,7 +403,7 @@ TEST(TestConv1x1s1, test_conv1x1s1) {
for (auto& cout : {1, 5, 16, 37}) {
for (auto& g : {1, 2}) {
for (auto& flag_bias : {false, true}) {
for (auto& flag_relu : {false, true}) {
for (auto& flag_act : {0, 1, 2, 4}) {
std::vector<DDim> dims;
if (cin % g != 0 || cout % g != 0) {
continue;
......@@ -384,6 +414,7 @@ TEST(TestConv1x1s1, test_conv1x1s1) {
dims.push_back(DDim({batch, cin, h, h}));
}
}
const float leakey_relu_scale = 8.88;
test_conv_fp32(dims,
weights_dim,
g,
......@@ -391,9 +422,10 @@ TEST(TestConv1x1s1, test_conv1x1s1) {
{0, 0, 0, 0},
{1, 1},
flag_bias,
flag_relu,
flag_act,
{1, 2, 4},
{FLAGS_power_mode});
{FLAGS_power_mode},
leakey_relu_scale);
}
}
}
......@@ -403,24 +435,29 @@ TEST(TestConv1x1s1, test_conv1x1s1) {
}
#endif /// conv1x1s1
#if 1 /// conv3x3s1
// TODO(MyPandaShaoxiang): fix me, diff: 3x3s1 winograd
#if 0 /// conv3x3s1
TEST(TestConv3x3s1, test_conv_3x3s1) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 8, 32, 48}) {
for (auto& cout : {1, 5, 8, 32, 48}) {
for (auto& pad_left : {1, 2}) {
for (auto& pad_right : {1, 2}) {
for (auto& pad_top : {1, 2}) {
for (auto& pad_bottom : {1, 2}) {
for (auto& cin : {1, 3, 8, 8}) {
for (auto& cout : {1, 5, 32, 48}) {
for (auto& pad_left : {0, 1, 2}) {
for (auto& pad_right : {0, 1, 2}) {
for (auto& pad_top : {0, 1, 2}) {
for (auto& pad_bottom : {0, 1, 2}) {
for (auto& flag_bias : {false, true}) {
for (auto& flag_relu : {false, true}) {
for (auto& flag_act : {0, 1, 2, 4}) {
std::vector<DDim> dims;
DDim weights_dim({cout, cin, 3, 3});
for (auto& batch : {1, 2}) {
for (auto& h : {1, 7, 19, 56, 32}) {
for (auto& h : {1, 3, 17, 33}) {
dims.push_back(DDim({batch, cin, h, h}));
}
}
if (cin == 1 && cout ==1) {
continue;
}
const float leakey_relu_scale = 8.88;
test_conv_fp32(dims,
weights_dim,
1,
......@@ -428,9 +465,10 @@ TEST(TestConv3x3s1, test_conv_3x3s1) {
{pad_top, pad_bottom, pad_left, pad_right},
{1, 1},
flag_bias,
flag_relu,
{1, 2, 4},
{FLAGS_power_mode});
flag_act,
{4},
{FLAGS_power_mode},
leakey_relu_scale);
}
}
}
......@@ -446,21 +484,25 @@ TEST(TestConv3x3s1, test_conv_3x3s1) {
#if 1 /// conv3x3s2
TEST(TestConv3x3s2, test_conv_3x3s2) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 8, 32}) {
for (auto& cout : {1, 5, 8, 32}) {
for (auto& pad_left : {1, 2}) {
for (auto& pad_right : {1, 2}) {
for (auto& pad_top : {1, 2}) {
for (auto& pad_bottom : {1, 2}) {
for (auto& cin : {1, 3, 8}) {
for (auto& cout : {1, 3, 9, 32}) {
for (auto& pad_left : {0, 1, 2}) {
for (auto& pad_right : {0, 1, 2}) {
for (auto& pad_top : {0, 1, 2}) {
for (auto& pad_bottom : {0, 1, 2}) {
for (auto& flag_bias : {false, true}) {
for (auto& flag_relu : {false, true}) {
for (auto& flag_act : {0, 1, 2, 4}) {
std::vector<DDim> dims;
DDim weights_dim({cout, cin, 3, 3});
for (auto& batch : {1, 2}) {
for (auto& h : {1, 7, 19, 28, 75, 56, 32}) {
for (auto& h : {3, 7, 15, 56, 32}) {
dims.push_back(DDim({batch, cin, h, h}));
}
}
if (cin == 1 && cout == 1) {
continue;
}
const float leakey_relu_scale = 8.88;
test_conv_fp32(dims,
weights_dim,
1,
......@@ -468,9 +510,10 @@ TEST(TestConv3x3s2, test_conv_3x3s2) {
{pad_top, pad_bottom, pad_left, pad_right},
{1, 1},
flag_bias,
flag_relu,
flag_act,
{1, 2, 4},
{FLAGS_power_mode});
{FLAGS_power_mode},
leakey_relu_scale);
}
}
}
......@@ -486,29 +529,40 @@ TEST(TestConv3x3s2, test_conv_3x3s2) {
#if 1 /// random param conv
TEST(TestConvRand, test_conv_rand) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 8, 16}) {
for (auto& cout : {1, 5, 8, 16}) {
for (auto& cin : {1, 3, 8}) {
for (auto& cout : {1, 5, 16}) {
for (auto& g : {1, 2}) {
for (auto& kw : {1, 2, 3}) {
for (auto& kh : {1, 2, 3}) {
for (auto& stride : {1, 2}) {
for (auto& pad_left : {0, 1, 2}) {
for (auto& pad_right : {0, 1, 2}) {
for (auto& pad_top : {0, 1, 2}) {
for (auto& pad_bottom : {0, 1, 2}) {
for (auto& pad_left : {0, 2}) {
for (auto& pad_right : {0, 2}) {
for (auto& pad_top : {0, 2}) {
for (auto& pad_bottom : {0, 2}) {
for (auto& dila : {1, 2}) {
for (auto& flag_bias : {false, true}) {
for (auto& flag_relu : {false, true}) {
for (auto& flag_act : {0, 1, 2, 4}) {
if (cin % g != 0 || cout % g != 0) {
continue;
}
std::vector<DDim> dims;
DDim weights_dim({cout, cin / g, kh, kw});
for (auto& batch : {1, 2}) {
for (auto& h : {1, 3, 19, 32, 28}) {
for (auto& h : {1, 3, 19, 32}) {
dims.push_back(DDim({batch, cin, h, h}));
}
}
// skip 3x3 depthwise conv
if (g == cin && cin == cout && kw == 3 &&
kh == 3) {
break;
}
// skip 3x3s1 direct conv
if (g == 1 && (cin != 1 || cout != 1) &&
kw == 3 && kh == 3 && stride == 1) {
break;
}
const float leakey_relu_scale = 8.88;
test_conv_fp32(
dims,
weights_dim,
......@@ -517,9 +571,10 @@ TEST(TestConvRand, test_conv_rand) {
{pad_top, pad_bottom, pad_left, pad_right},
{dila, dila},
flag_bias,
flag_relu,
{1, 2, 4},
{FLAGS_power_mode});
flag_act,
{4},
{FLAGS_power_mode},
leakey_relu_scale);
}
}
}
......@@ -551,11 +606,12 @@ TEST(TestConvCustom, test_conv_fp32_custom_size) {
FLAGS_kernel_w}),
FLAGS_group,
{FLAGS_stride_h, FLAGS_stride_w},
{FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w},
{FLAGS_pad_h0, FLAGS_pad_h1, FLAGS_pad_w0, FLAGS_pad_w1},
{FLAGS_dila_h, FLAGS_dila_w},
FLAGS_flag_bias,
FLAGS_flag_relu,
FLAGS_flag_act,
{FLAGS_threads},
{FLAGS_power_mode});
{FLAGS_power_mode},
FLAGS_leakey_relu_alpha);
}
#endif // custom
......@@ -291,7 +291,7 @@ void test_conv_int8(const std::vector<DDim>& input_dims,
pads[2],
pads[0],
flag_bias,
flag_relu);
static_cast<int>(flag_relu));
paddle::lite::arm::math::fp32_to_int8(dout_basic_fp32,
dout_basic_int8,
scale_out.data(),
......@@ -362,6 +362,7 @@ void test_conv_int8(const std::vector<DDim>& input_dims,
<< pads[2] << ", " << pads[3]
<< ", stride: " << strides[0] << ", " << strides[1]
<< ", dila_: " << dilas[0] << ", " << dilas[1]
<< ", group: " << group
<< ", bias: " << (flag_bias ? "true" : "false")
<< ", relu: " << (flag_relu ? "true" : "false")
<< ", threads: " << th << ", power_mode: " << cls
......@@ -467,7 +468,7 @@ TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) {
std::vector<DDim> dims;
DDim weights_dim({c, 1, 3, 3});
for (auto& batch : {1, 2}) {
for (auto& h : {1, 3, 15, 19, 75, 32, 28}) {
for (auto& h : {1, 3, 15, 33}) {
dims.push_back(DDim({batch, c, h, h}));
}
}
......@@ -479,7 +480,7 @@ TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) {
{1, 1},
flag_bias,
flag_relu,
{1, 2, 4},
{4},
{FLAGS_power_mode});
}
}
......@@ -494,14 +495,14 @@ TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) {
TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) {
if (FLAGS_basic_test) {
for (auto& stride : {1}) {
for (auto& pad : {0, 1, 2}) {
for (auto& pad : {0, 1, 2, 3, 4}) {
for (auto& flag_bias : {false, true}) {
for (auto& flag_relu : {false, true}) {
for (auto& c : {1, 3, 5, 8, 16, 32}) {
for (auto& c : {1, 5, 15, 33}) {
std::vector<DDim> dims;
DDim weights_dim({c, 1, 5, 5});
for (auto& batch : {1, 2}) {
for (auto& h : {1, 3, 15, 19, 28, 32, 75}) {
for (auto& h : {1, 3, 15, 33}) {
dims.push_back(DDim({batch, c, h, h}));
}
}
......@@ -513,7 +514,7 @@ TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) {
{1, 1},
flag_bias,
flag_relu,
{1, 2, 4},
{4},
{FLAGS_power_mode});
}
}
......@@ -527,8 +528,8 @@ TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) {
#if 1 /// conv1x1s1
TEST(TestConv1x1s1Int8, test_conv1x1s1) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 8, 11, 32}) {
for (auto& cout : {1, 5, 16, 37}) {
for (auto& cin : {1, 3, 8, 32}) {
for (auto& cout : {1, 5, 17}) {
for (auto& g : {1, 2}) {
for (auto& flag_bias : {false, true}) {
for (auto& flag_relu : {false, true}) {
......@@ -538,7 +539,7 @@ TEST(TestConv1x1s1Int8, test_conv1x1s1) {
}
DDim weights_dim({cout, cin / g, 1, 1});
for (auto& batch : {1, 2}) {
for (auto& h : {1, 7, 19, 28, 32, 56, 1}) {
for (auto& h : {1, 9, 16, 33}) {
dims.push_back(DDim({batch, cin, h, h}));
}
}
......@@ -550,7 +551,7 @@ TEST(TestConv1x1s1Int8, test_conv1x1s1) {
{1, 1},
flag_bias,
flag_relu,
{1, 2, 4},
{4},
{FLAGS_power_mode});
}
}
......@@ -564,8 +565,8 @@ TEST(TestConv1x1s1Int8, test_conv1x1s1) {
#if 1 /// conv3x3s1
TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 8, 32, 48}) {
for (auto& cout : {1, 5, 8, 32, 48}) {
for (auto& cin : {1, 3, 8, 33}) {
for (auto& cout : {1, 5, 33}) {
for (auto& pad_top : {1, 2}) {
for (auto& pad_bottom : {1, 2}) {
for (auto& pad_left : {1, 2}) {
......@@ -575,7 +576,7 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
std::vector<DDim> dims;
DDim weights_dim({cout, cin, 3, 3});
for (auto& batch : {1, 2}) {
for (auto& h : {1, 7, 19, 56, 32}) {
for (auto& h : {1, 7, 17, 33}) {
dims.push_back(DDim({batch, cin, h, h}));
}
}
......@@ -587,7 +588,7 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
{1, 1},
flag_bias,
flag_relu,
{1, 2, 4},
{4},
{FLAGS_power_mode});
}
}
......@@ -604,8 +605,8 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
#if 1 /// conv3x3s2
TEST(TestConv3x3s2Int8, test_conv_3x3s2) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 8, 32}) {
for (auto& cout : {1, 5, 8, 32}) {
for (auto& cin : {1, 3, 31}) {
for (auto& cout : {1, 5, 33}) {
for (auto& pad_top : {1, 2}) {
for (auto& pad_bottom : {1, 2}) {
for (auto& pad_left : {1, 2}) {
......@@ -615,7 +616,7 @@ TEST(TestConv3x3s2Int8, test_conv_3x3s2) {
std::vector<DDim> dims;
DDim weights_dim({cout, cin, 3, 3});
for (auto& batch : {1, 2}) {
for (auto& h : {1, 7, 19, 28, 75, 56, 32}) {
for (auto& h : {1, 7, 19, 33}) {
dims.push_back(DDim({batch, cin, h, h}));
}
}
......@@ -627,7 +628,7 @@ TEST(TestConv3x3s2Int8, test_conv_3x3s2) {
{1, 1},
flag_bias,
flag_relu,
{1, 2, 4},
{4},
{FLAGS_power_mode});
}
}
......@@ -644,8 +645,8 @@ TEST(TestConv3x3s2Int8, test_conv_3x3s2) {
#if 1 /// random param conv
TEST(TestConvRandInt8, test_conv_rand) {
if (FLAGS_basic_test) {
for (auto& cin : {1, 3, 8, 16}) {
for (auto& cout : {1, 5, 8, 16}) {
for (auto& cin : {1, 17}) {
for (auto& cout : {1, 8, 17}) {
for (auto& g : {1, 2}) {
for (auto& kw : {1, 2, 3}) {
for (auto& kh : {1, 2, 3}) {
......@@ -658,12 +659,12 @@ TEST(TestConvRandInt8, test_conv_rand) {
for (auto& flag_bias : {false, true}) {
for (auto& flag_relu : {false, true}) {
if (cin % g != 0 || cout % g != 0) {
continue;
break;
}
std::vector<DDim> dims;
DDim weights_dim({cout, cin / g, kh, kw});
for (auto& batch : {1, 2}) {
for (auto& h : {1, 3, 19, 32, 28}) {
for (auto& h : {1, 3, 5, 19}) {
dims.push_back(DDim({batch, cin, h, h}));
}
}
......@@ -676,7 +677,7 @@ TEST(TestConvRandInt8, test_conv_rand) {
{dila, dila},
flag_bias,
flag_relu,
{1, 2, 4},
{4},
{FLAGS_power_mode});
}
}
......
......@@ -37,7 +37,7 @@ DEFINE_int32(power_mode,
DEFINE_int32(threads, 1, "threads num");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_bool(basic_test, false, "do all tests");
DEFINE_bool(basic_test, true, "do all tests");
DEFINE_bool(check_result, true, "check the result");
DEFINE_int32(M, 512, "gemv: M");
......
......@@ -37,7 +37,7 @@ DEFINE_int32(power_mode,
DEFINE_int32(threads, 1, "threads num");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_bool(basic_test, false, "do all tests");
DEFINE_bool(basic_test, true, "do all tests");
DEFINE_bool(check_result, true, "check the result");
DEFINE_int32(M, 512, "gemm_c4: M");
......
......@@ -38,11 +38,19 @@ DEFINE_int32(K, 512, "sgemv: K");
DEFINE_bool(traA, false, "gemv: A transpose");
DEFINE_bool(flag_relu, false, "do relu");
DEFINE_int32(flag_act, 0, "do act");
DEFINE_bool(flag_bias, false, "with bias");
bool test_sgemv(
bool tra, int m, int k, bool has_bias, bool has_relu, int cls, int ths) {
DEFINE_double(leakey_relu_alpha, 1.0, "leakey relu alpha");
DEFINE_double(clipped_coef, 6.0, "clipped relu coef");
bool test_sgemv(bool tra,
int m,
int k,
bool has_bias,
int flag_act,
int cls,
int ths,
float six = 6.f,
float alpha = 1.f) {
Tensor ta;
Tensor tb;
Tensor tc;
......@@ -68,8 +76,7 @@ bool test_sgemv(
fill_tensor_rand(tbias, -1.f, 1.f);
LOG(INFO) << "sgemv M: " << m << ", K: " << k
<< ", transA: " << (tra ? "true" : "false")
<< ", relu: " << (has_relu ? "true" : "false")
<< ", transA: " << (tra ? "true" : "false") << ", act: " << flag_act
<< ", bias: " << (has_bias ? "true" : "false");
#ifdef LITE_WITH_ARM
......@@ -78,10 +85,29 @@ bool test_sgemv(
auto dc = tc.mutable_data<float>();
auto dc_basic = tc_basic.mutable_data<float>();
auto dbias = tbias.mutable_data<float>();
paddle::lite_api::ActivationType act =
paddle::lite_api::ActivationType::kIndentity;
if (flag_act == 1) {
act = paddle::lite_api::ActivationType::kRelu;
} else if (flag_act == 2) {
act = paddle::lite_api::ActivationType::kRelu6;
} else if (flag_act == 4) {
act = paddle::lite_api::ActivationType::kLeakyRelu;
}
if (FLAGS_check_result) {
basic_gemv(
m, k, da, db, dbias, dc_basic, 1.f, 0.f, tra, has_bias, has_relu);
basic_gemv(m,
k,
da,
db,
dbias,
dc_basic,
1.f,
0.f,
tra,
has_bias,
flag_act,
six,
alpha);
}
paddle::lite::profile::Timer t0;
//! compute
......@@ -92,15 +118,37 @@ bool test_sgemv(
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(cls), ths);
/// warmup
for (int j = 0; j < FLAGS_warmup; ++j) {
paddle::lite::arm::math::sgemv(
da, db, dc, tra, m, k, has_bias, dbias, has_relu, &ctx);
paddle::lite::arm::math::sgemv(da,
db,
dc,
tra,
m,
k,
has_bias,
dbias,
flag_act > 0,
act,
&ctx,
six,
alpha);
}
t0.Reset();
for (int i = 0; i < FLAGS_repeats; ++i) {
t0.Start();
paddle::lite::arm::math::sgemv(
da, db, dc, tra, m, k, has_bias, dbias, has_relu, &ctx);
paddle::lite::arm::math::sgemv(da,
db,
dc,
tra,
m,
k,
has_bias,
dbias,
flag_act > 0,
act,
&ctx,
six,
alpha);
t0.Stop();
}
LOG(INFO) << "gemv output: M: " << m << ", K: " << k << ", cluster: " << cls
......@@ -125,7 +173,7 @@ bool test_sgemv(
tensor_diff(tc_basic, tc, tdiff);
LOG(INFO) << "basic result: ";
print_tensor(tc_basic);
LOG(INFO) << "saber result: ";
LOG(INFO) << "lite result: ";
print_tensor(tc);
LOG(INFO) << "diff result: ";
print_tensor(tdiff);
......@@ -144,22 +192,31 @@ TEST(TestLiteSgemv, Sgemv) {
LOG(INFO) << "run basic sgemv test";
for (auto& m : {1, 3, 8, 21, 32, 397}) {
for (auto& k : {1, 3, 8, 17, 59, 234}) {
for (auto& tra : {true, false}) {
for (auto& tra : {false, true}) {
for (auto& has_bias : {false, true}) {
for (auto& has_relu : {false, true}) {
for (auto& flag_act : {0, 1, 2, 4}) {
for (auto& th : {1, 2, 4}) {
auto flag = test_sgemv(
tra, m, k, has_bias, has_relu, FLAGS_cluster, th);
float six = 6.f;
float alpha = 8.88f;
auto flag = test_sgemv(tra,
m,
k,
has_bias,
flag_act,
FLAGS_cluster,
th,
six,
alpha);
if (flag) {
LOG(INFO) << "test m = " << m << ", k=" << k
<< ", bias: " << (has_bias ? "true" : "false")
<< ", relu: " << (has_relu ? "true" : "false")
<< ", flag act: " << flag_act
<< ", trans A: " << (tra ? "true" : "false")
<< ", threads: " << th << " passed\n";
} else {
LOG(FATAL) << "test m = " << m << ", k=" << k
<< ", bias: " << (has_bias ? "true" : "false")
<< ", relu: " << (has_relu ? "true" : "false")
<< ", flag_act: " << flag_act
<< ", trans A: " << (tra ? "true" : "false")
<< ", threads: " << th << " failed\n";
}
......@@ -180,15 +237,17 @@ TEST(TestSgemvCustom, Sgemv_custom) {
FLAGS_M,
FLAGS_K,
FLAGS_flag_bias,
FLAGS_flag_relu,
FLAGS_flag_act,
FLAGS_cluster,
FLAGS_threads);
FLAGS_threads,
FLAGS_clipped_coef,
FLAGS_leakey_relu_alpha);
if (!flag) {
LOG(FATAL) << "test m = " << FLAGS_M << ", k=" << FLAGS_K
<< ", trans A: " << FLAGS_traA << ", bias: " << FLAGS_flag_bias
<< ", relu: " << FLAGS_flag_relu << " failed!!";
<< ", act: " << FLAGS_flag_act << " failed!!";
}
LOG(INFO) << "test m = " << FLAGS_M << ", k=" << FLAGS_K
<< ", trans A: " << FLAGS_traA << ", bias: " << FLAGS_flag_bias
<< ", relu: " << FLAGS_flag_relu << " passed!!";
<< ", act: " << FLAGS_flag_act << " passed!!";
}
......@@ -177,7 +177,9 @@ static void basic_gemv(int m,
type2 beta,
bool trans_a = false,
bool flag_bias = false,
bool flag_relu = false) {
int flag_act = false,
float six = 6.f,
float leakey_relu_alpha = 1.f) {
#pragma omp parallel for
for (int i = 0; i < m; ++i) {
auto bias_data = static_cast<type2>(0);
......@@ -195,8 +197,15 @@ static void basic_gemv(int m,
sum += av * b[j];
}
type2 tmp = alpha * sum + beta * c[i] + bias_data;
if (flag_relu) {
if (flag_act > 0) {
if (flag_act == 1) { // relu
c[i] = tmp > (type2)0 ? tmp : (type2)0;
} else if (flag_act == 2) { // relu 6
c[i] = tmp > (type2)0 ? tmp : (type2)0;
c[i] = c[i] < six ? c[i] : six;
} else if (flag_act == 4) { // leakey relu
c[i] = tmp < (type2)0 ? (type2)(tmp * leakey_relu_alpha) : tmp;
}
} else {
c[i] = tmp;
}
......@@ -230,7 +239,9 @@ static void conv_basic(const Dtype1* din,
int pad_w,
int pad_h,
bool flag_bias,
bool flag_relu) {
int act_type,
float six = 6.f,
float scale = 1.f) {
Dtype2 beta = 0;
auto src_data = din;
auto dst_data_ref = dout;
......@@ -280,10 +291,27 @@ static void conv_basic(const Dtype1* din,
}
}
}
if (flag_relu) {
if (act_type > 0) {
// 1-relu 2-relu6 4-leakyrelu
if (act_type == 1) {
dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0
? dst_data_ref[out_idx]
: (Dtype2)0;
} else if (act_type == 2) {
dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0
? dst_data_ref[out_idx]
: (Dtype2)0;
dst_data_ref[out_idx] = dst_data_ref[out_idx] < (Dtype2)six
? dst_data_ref[out_idx]
: (Dtype2)six;
} else if (act_type == 4) {
dst_data_ref[out_idx] =
dst_data_ref[out_idx] > (Dtype2)0
? dst_data_ref[out_idx]
: (Dtype2)(dst_data_ref[out_idx] * scale);
} else {
printf("this act type: %d does not support \n", act_type);
}
}
}
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册