提交 529f24c2 编写于 作者: H hedaoyuan

cpu cmrnorm

上级 b3f0f3d2
develop 1.8.5 2.0.1-rocm-post 2.4.1 Ligoml-patch-1 OliverLPH-patch-1 OliverLPH-patch-2 PaddlePM-patch-1 PaddlePM-patch-2 ZHUI-patch-1 add_default_att add_kylinv10 add_model_benchmark_ci add_some_yaml_config addfile all_new_design_exec ascendrc ascendrelease bugfix-eval-frame-leakgae cherry-pick-fix-customOP-random-fail cherry_undefined_var compile_windows cp_2.4_fix_numpy delete_2.0.1-rocm-post delete_add_default_att delete_all_new_design_exec delete_ascendrc delete_compile_windows delete_delete_addfile delete_disable_iterable_dataset_unittest delete_fix_dataloader_memory_leak delete_fix_imperative_dygraph_error delete_fix_retry_ci delete_fix_undefined_var delete_improve_sccache delete_incubate/lite delete_paddle_tiny_install delete_paralleltest delete_prv-disable-more-cache delete_revert-31068-fix_conv3d_windows delete_revert-31562-mean delete_revert-33630-bug-fix delete_revert-34159-add_npu_bce_logical_dev delete_revert-34910-spinlocks_for_allocator delete_revert-35069-revert-34910-spinlocks_for_allocator delete_revert-36057-dev/read_flags_in_ut dingjiaweiww-patch-1 disable_iterable_dataset_unittest dy2static enable_eager_model_test final_state_gen_python_c final_state_intermediate fix-numpy-issue fix-run-program-grad-node-mem fix_check fix_concat_slice fix_custom_device_copy_sync fix_dataloader_memory_leak fix_dlpack_for fix_imperative_dygraph_error fix_newexe_gc fix_npu_ci fix_op_flops fix_retry_ci fix_rnn_docs fix_tensor_type fix_undefined_var fix_var_stop_gradient_error fixiscan fixiscan1 fixiscan2 fixiscan3 github/fork/123malin/netifaces github/fork/123malin/tdm_abacus github/fork/AshburnLee/dev_unique github/fork/ForFishes/fix_memory_matmul github/fork/ForFishes/rm_fluid github/fork/LielinJiang/move-2.0-api github/fork/LielinJiang/visual-dl-cb github/fork/LiuChiachi/add-transformer-generate-square-subsequent-mask-api github/fork/LiuChiachi/fix-example-code-for-hapi-Model github/fork/LiuChiachi/remove-input-requirment-in-dygraph-Model 
github/fork/MrChengmo/fix_ps_profiler github/fork/MrChengmo/update_ps_heter github/fork/PWhiddy/patch-1 github/fork/Shixiaowei02/dev/save_load_upgrade github/fork/TCChenlong/fix_hapi github/fork/TCChenlong/fix_inden github/fork/Thunderbrook/xpu_slice github/fork/XieYunshen/disable_ut_test_parallel_executor_fetch_isolated_var github/fork/XieYunshen/disable_ut_test_parallel_executor_fetch_isolated_var_2 github/fork/XieYunshen/disable_ut_test_parallel_executor_fetch_isolated_var_3 github/fork/XieYunshen/timeout_20S_ut github/fork/ZeyuChen/remove-nltk github/fork/arlesniak/arlesniak/selective__mkldnn_flags github/fork/baiyfbupt/code_doc_mig github/fork/chalsliu/set_timeout github/fork/chen-zhiyu/develop github/fork/chenwhql/ci/try_to_find_test_buffer_shared_memory_reuse_pass_error github/fork/chenwhql/dygraph/remove_scale_loss_and_apply_collective_grads github/fork/chenwhql/saveload/add_get_inference_program github/fork/chenwhql/saveload/remove_save_load_config github/fork/cryoco/pass-compatibility-trt github/fork/danleifeng/isempty_api2.0 github/fork/frankwhzhang/api_transfer github/fork/hbwx24/error_msg/cuda_kernel_error_msg github/fork/heavengate/cherry_yolo_box github/fork/heavengate/update_yolo_box github/fork/iclementine/rnn_fix github/fork/iducn/testestse github/fork/jczaja/prv-25537-fix github/fork/jeff41404/release/1.8 github/fork/jiweibo/api_2.0 github/fork/jiweibo/fix_lite_resnet50_test github/fork/juncaipeng/fix_doc_1 github/fork/lfchener/sample_code github/fork/littletomatodonkey/fix_reg_doc github/fork/liym27/dy2stat_update_assign_to_rc20 github/fork/luotao1/profiler_ut github/fork/mapingshuo/add_wait github/fork/mapingshuo/doc_2.0 github/fork/mapingshuo/zero-0.5 github/fork/miraiwk/dev github/fork/pangyoki/add-Categorical-class-branch github/fork/pangyoki/add-multinomial-op-branch github/fork/pangyoki/fix-test_distritbution-CI github/fork/qjing666/doublegrad github/fork/qjing666/fix_hdfs_download github/fork/sandyhouse/add_gather_etc 
github/fork/sandyhouse/add_send_recv_alltoall_etc github/fork/sandyhouse/pipeline_exe_run github/fork/seiriosPlus/feature/large_scale_kv_save_delta github/fork/seiriosPlus/fix/paddle_errors_fix github/fork/seiriosPlus/fix/paddle_op_errors github/fork/shangzhizhou/fix_test_activation_op_random_bug github/fork/smallv0221/yxp0924 github/fork/smallv0221/yxp0925 github/fork/swtkiwi/del-matplotlib github/fork/tianshuo78520a/kunlun_test github/fork/tianshuo78520a/update_dockerfile github/fork/wanghaoshuang/bert_fuse github/fork/wanghaoshuang/label_smooth github/fork/wanghuancoder/develop_CUDASynchronize github/fork/wanghuancoder/develop_Layer_doc github/fork/wanghuancoder/develop_ParameterList_doc github/fork/wanghuancoder/develop_Sequential_doc github/fork/wanghuancoder/develop_bilinear_tensor_product github/fork/wanghuancoder/develop_coverage_build_sh github/fork/wanghuancoder/develop_in_dynamic_mode_doc github/fork/wanghuancoder/develop_unique_name_doc github/fork/wangxicoding/fleet_meta_combine github/fork/wawltor/error_message_fix_5 github/fork/willthefrog/remove_l2_norm github/fork/windstamp/momentum_op github/fork/windstamp/mv_op_5 github/fork/windstamp/normal_api github/fork/wojtuss/wojtuss/fusion_gru_quantization github/fork/wojtuss/wojtuss/quantization-with-shift github/fork/wzzju/fix_err_info github/fork/wzzju/pure_fp16 github/fork/xiemoyuan/op_error_message github/fork/xiemoyuan/optimize_error_message github/fork/yaoxuefeng6/fix_doc github/fork/yaoxuefeng6/mod_dataset_v2 github/fork/yongqiangma/lod github/fork/ysh329/fix-clip-by-norm-error github/fork/ysh329/fix-error-clip-by-value github/fork/yukavio/error_info github/fork/zhangting2020/conv_filter_grad github/fork/zhangting2020/is_compile_with_cuda github/fork/zhangting2020/place_doc github/fork/zhangting2020/program github/fork/zhhsplendid/fix_any github/fork/zhhsplendid/refine_api2 github/fork/zhhsplendid/refine_api2_test github/fork/zhhsplendid/refine_api_test_ptb_lm 
github/fork/zhhsplendid/refine_api_test_resnet github/fork/zhhsplendid/refine_api_test_simnet github/fork/zhiqiu/dev/refine_initializer github/fork/zhiqiu/dev/remove_inplace_argument github/fork/zlsh80826/nvinfer_plugin_var_len_cuda11 hack_event improve_sccache incuabte/new_frl incubate/frl_train_eval incubate/infrt incubate/lite incubate/new_frl incubate/new_frl_rc incubate/stride inplace_addto layer_norm make_flag_adding_easier master matmul_double_grad move_embedding_to_phi move_histogram_to_pten move_sgd_to_phi move_slice_to_pten move_temporal_shift_to_phi move_yolo_box_to_phi npu_fix_alloc numel operator_opt paddle_tiny_install paralleltest pass-compile-eval-frame preln_ernie prv-disable-more-cache prv-md-even-more prv-onednn-2.5 prv-reshape-mkldnn-ut2 pten_tensor_refactor release-deleted/2.5 release-rc/2.5 release/0.10.0 release/0.11.0 release/0.12.0 release/0.13.0 release/0.14.0 release/0.15.0 release/1.0.0 release/1.1 release/1.2 release/1.3 release/1.4 release/1.5 release/1.6 release/1.7 release/1.8 release/2.0 release/2.0-alpha release/2.0-beta release/2.0-rc release/2.0-rc1 release/2.1 release/2.2 release/2.3 release/2.3-fc-ernie-fix release/2.4 release/2.5 release/lite-0.1 release/llm_2.5 revert-24981-add_device_attr_for_regulization revert-26856-strategy_example2 revert-27520-disable_pr revert-31068-fix_conv3d_windows revert-31562-mean revert-32290-develop-hardlabel revert-33037-forci revert-33475-fix_cifar_label_dimension revert-33630-bug-fix revert-34159-add_npu_bce_logical_dev revert-34406-add_copy_from_tensor revert-34910-spinlocks_for_allocator revert-35069-revert-34910-spinlocks_for_allocator revert-36057-dev/read_flags_in_ut revert-36201-refine_fast_threaded_ssa_graph_executor revert-36985-add_license revert-37318-refactor_dygraph_to_eager revert-37926-eager_coreops_500 revert-37956-revert-37727-pylayer_support_tuple revert-38100-mingdong revert-38301-allocation_rearrange_pr revert-38703-numpy_bf16_package_reupload 
revert-38732-remove_useless_header_in_elementwise_mul_grad revert-38959-Reduce_Grad revert-39143-adjust_empty revert-39227-move_trace_op_to_pten revert-39268-dev/remove_concat_fluid_kernel revert-40170-support_partial_grad revert-41056-revert-40727-move_some_activaion_to_phi revert-41065-revert-40993-mv_ele_floordiv_pow revert-41068-revert-40790-phi_new revert-41944-smaller_inference_api_test revert-42149-do-not-reset-default-stream-for-stream-safe-cuda-allocator revert-43155-fix_ut_tempfile revert-43882-revert-41944-smaller_inference_api_test revert-45808-phi/simplify_size_op revert-46827-deform_comment revert-47325-remove_cudnn_hardcode revert-47645-add_npu_storage_dims revert-48815-set_free_when_no_cache_hit_default_value_true revert-49499-test_ninja_on_ci revert-49654-prim_api_gen revert-49673-modify_get_single_cov revert-49763-fix_static_composite_gen revert-50158-fix_found_inf_bug_for_custom_optimizer revert-50188-refine_optimizer_create_accumulators revert-50335-fix_optminizer_set_auxiliary_var_bug revert-51676-flag_delete revert-51850-fix_softmaxce_dev revert-52175-dev_peak_memory revert-52186-deve revert-52523-test_py38 revert-52912-develop revert-53248-set_cmake_policy revert-54029-fix_windows_compile_bug revert-54068-support_translating_op_attribute revert-54214-modify_cmake_dependencies revert-54370-offline_pslib revert-54391-fix_cmake_md5error revert-54411-fix_cpp17_compile revert-54466-offline_pslib revert-54480-cmake-rocksdb revert-55568-fix_BF16_bug1 revert-56328-new_ir_support_vector_type_place_transfer revert-56366-fix_openssl_bug revert-56545-revert-56366-fix_openssl_bug revert-56620-fix_new_ir_ocr_bug revert-56925-check_inputs_grad_semantic revert-57005-refine_stride_flag rocm_dev_0217 sd_conv_linear_autocast semi-auto/rule-base support-0D-sort support_weight_transpose test_benchmark_ci test_feature_precision_test_c test_for_Filtetfiles test_model_benchmark test_model_benchmark_ci zhiqiu-patch-1 v2.5.1 v2.5.0 v2.5.0-rc1 v2.5.0-rc0 v2.4.2 v2.4.1 
v2.4.0 v2.4.0-rc0 v2.3.2 v2.3.1 v2.3.0 v2.3.0-rc0 v2.2.2 v2.2.1 v2.2.0 v2.2.0-rc0 v2.2.0-bak0 v2.1.3 v2.1.2 v2.1.1 v2.1.0 v2.1.0-rc0 v2.0.2 v2.0.1 v2.0.0 v2.0.0-rc1 v2.0.0-rc0 v2.0.0-beta0 v2.0.0-alpha0 v1.8.5 v1.8.4 v1.8.3 v1.8.2 v1.8.1 v1.8.0 v1.7.2 v1.7.1 v1.7.0 v1.6.3 v1.6.2 v1.6.1 v1.6.0 v1.6.0-rc0 v1.5.2 v1.5.1 v1.5.0 v1.4.1 v1.4.0 v1.3.2 v1.3.1 v1.3.0 v1.2.1 v1.2.0 v1.1.0 v1.0.2 v1.0.1 v1.0.0 v1.0.0-rc0 v0.15.0 v0.15.0-rc0 v0.14.0 v0.13.0 v0.12.0 v0.11.1a2 v0.11.1a1 v0.11.0 v0.10.0 v0.10.0rc4 v0.10.0rc lite-v0.1
3 合并请求!11636[IMPORTANT] MKLDNN layout: Support for sum operator,!2081Release/0.10.0,!854Cmrnorm
......@@ -381,57 +381,45 @@ void hl_avgpool_backward(const int frameCnt, const real* outGrad,
CHECK_SYNC("hl_avgpool_backward failed");
}
// Fills `scale` for cross-map response normalization.
// Each thread handles one (n, h, w) position and walks over all channels,
// which are `step = height * width` elements apart. For channel c it writes
//   scale[n, c, h, w] = 1 + alpha * sum(in[n, k, h, w]^2)
// where k runs over the window [c - pre_pad, c + post_pad] (the rewritten
// loop below clips this window to the valid channels).
// NOTE(review): this listing is a diff shown without +/- markers, so the
// previous and the rewritten version of the kernel appear interleaved
// (two signatures, two index computations, two accumulation schemes).
__global__ void KeCMRNormFillScale(size_t nthreads, const real* in,
__global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
real* scale, size_t channels,
size_t height, size_t width, size_t size,
real alpha) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < nthreads) {
// find out the local offset
size_t w = index % width;
size_t h = (index / width) % height;
size_t n = index / width / height;
size_t offset = (n * channels * height + h) * width + w;
size_t step = height * width;
// Flat thread id -> (n, h, w); `offset` addresses channel 0 of that position.
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < imageSize) {
const int w = idx % width;
const int h = (idx / width) % height;
const int n = idx / width / height;
const int offset = (n * channels * height + h) * width + w;
// Rebase both pointers so that [c * step] addresses channel c of this pixel.
in += offset;
scale += offset;
size_t head = 0;
size_t pre_pad = (size - 1) / 2;
size_t post_pad = size - pre_pad - 1;
real accum_scale = 0;
// fill the scale at [n, :, h, w]
// accumulate values
while (head < post_pad) {
accum_scale += in[head * step] * in[head * step];
++head;
}
// until we reach size, nothing needs to be subtracted
while (head < size) {
accum_scale += in[head * step] * in[head * step];
scale[(head - post_pad) * step] = 1. + accum_scale * alpha;
++head;
}
// both add and subtract
while (head < channels) {
accum_scale += in[head * step] * in[head * step];
accum_scale -= in[(head - size) * step] * in[(head - size) * step];
scale[(head - post_pad) * step] = 1. + accum_scale * alpha;
++head;
}
// subtract only
while (head < channels + post_pad) {
accum_scale -= in[(head - size) * step] * in[(head - size) * step];
scale[(head - post_pad) * step] = 1. + accum_scale * alpha;
++head;
const int step = height * width;
const int pre_pad = (size - 1) / 2;
const int post_pad = size - pre_pad - 1;
real accum = 0;
int index = 0;
// Single sliding-window pass: `index` is the channel entering the window,
// `index - size` the channel leaving it, and `index - post_pad` the channel
// whose scale is complete at this step.
while (index < channels + post_pad) {
if (index < channels) {
accum += in[index * step] * in[index * step];
}
if (index >= size) {
accum -= in[(index - size) * step] * in[(index - size) * step];
}
if (index >= post_pad) {
scale[(index - post_pad) * step] = 1. + accum * alpha;
}
++index;
}
}
}
// Element-wise output of the normalization: for every flat index below the
// element count passed as the first argument,
//   out[index] = in[index] * pow(scale[index], negative_beta).
// NOTE(review): this listing is a diff shown without +/- markers; the two
// signatures and the two index/bounds lines are the previous and the
// rewritten version of the same kernel.
__global__ void KeCMRNormOutput(size_t nthreads, const real* in,
const real* scale, real negative_beta,
real* out) {
size_t index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < nthreads) {
__global__ void KeCMRNormOutput(size_t inputSize, const real* in,
const real* scale, real negative_beta,
real* out) {
const int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < inputSize) {
out[index] = in[index] * pow(scale[index], negative_beta);
}
}
......@@ -440,84 +428,60 @@ void hl_CMRNorm_forward(size_t frameCnt, const real* in, real* scale,
real* out, size_t channels,
size_t height, size_t width, size_t sizeX,
real alpha, real beta) {
size_t threadsNum = frameCnt * height * width;
size_t blocksX = (threadsNum + 1024 - 1) / 1024;
size_t blocksY = 1;
dim3 threads(1024, 1);
dim3 grid(blocksX, blocksY);
KeCMRNormFillScale<<<grid, threads, 0, STREAM_DEFAULT>>>
(threadsNum, in, scale, channels, height, width, sizeX, alpha);
threadsNum = frameCnt * height * width *channels;
blocksX = (threadsNum + 1024 -1) / 1024;
dim3 threads2(1024, 1);
dim3 grid2(blocksX, blocksY);
KeCMRNormOutput<<<grid2, threads2, 0, STREAM_DEFAULT>>>
(threadsNum, in, scale, beta, out);
size_t imageSize = frameCnt * height * width;
int blockSize = 1024;
int gridSize = (imageSize + 1024 - 1) / 1024;
KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
(imageSize, in, scale, channels, height, width, sizeX, alpha);
size_t inputSize = frameCnt * height * width *channels;
blockSize = 1024;
gridSize = (inputSize + 1024 - 1) / 1024;
KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
(inputSize, in, scale, beta, out);
CHECK_SYNC("hl_CMRNorm_forward");
}
// Backward kernel of cross-map response normalization.
// Each thread handles one (n, h, w) position and walks over all channels,
// which are `step = height * width` elements apart. For channel c it
// accumulates into bottom_diff (note the +=):
//   bottom_diff[c] += top_diff[c] * pow(scale[c], negative_beta)
//                     - cache_ratio * bottom_data[c] * accum
// where accum is the sum of top_diff[k] * top_data[k] / scale[k] over the
// channel window [c - pre_pad, c + post_pad], with
// pre_pad = size - (size + 1) / 2.
// NOTE(review): this listing is a diff shown without +/- markers, so the
// previous and the rewritten version of the kernel appear interleaved
// (two signatures, two index computations, two accumulation schemes).
__global__ void KeCMRNormDiff(size_t nthreads, const real* bottom_data,
__global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
const real* top_data, const real* scale,
const real* top_diff, size_t channels,
size_t height, size_t width, size_t size,
real negative_beta, real cache_ratio,
real* bottom_diff ) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < nthreads) {
// find out the local offset
size_t w = index % width;
size_t h = (index / width) % height;
size_t n = index / width / height;
size_t offset = (n * channels * height + h) * width + w;
size_t step = height * width;
// Flat thread id -> (n, h, w); `offset` addresses channel 0 of that position.
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < imageSize) {
const int w = idx % width;
const int h = (idx / width) % height;
const int n = idx / width / height;
const int offset = (n * channels * height + h) * width + w;
// Rebase every pointer so that [c * step] addresses channel c of this pixel.
bottom_data += offset;
top_data += offset;
scale += offset;
top_diff += offset;
bottom_diff += offset;
int head = 0;
int pre_pad = size - (size + 1) / 2;
int post_pad = size - pre_pad - 1;
real accum_ratio = 0;
// accumulate values
while (head < post_pad) {
accum_ratio += top_diff[head * step] *
top_data[head * step] / scale[head * step];
++head;
}
// until we reach size, nothing needs to be subtracted
while (head < size) {
accum_ratio += top_diff[head * step] *
top_data[head * step] / scale[head * step];
bottom_diff[(head - post_pad) * step] +=
top_diff[(head - post_pad) * step] *
pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
bottom_data[(head - post_pad) * step] * accum_ratio;
++head;
}
// both add and subtract
while (head < channels) {
accum_ratio += top_diff[head * step] * top_data[head * step] /
scale[head * step];
accum_ratio -= top_diff[(head - size) * step] *
top_data[(head - size) * step] / scale[(head - size) * step];
bottom_diff[(head - post_pad) * step] +=
top_diff[(head - post_pad) * step] *
pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
bottom_data[(head - post_pad) * step] * accum_ratio;
++head;
}
// subtract only
while (head < channels + post_pad) {
accum_ratio -= top_diff[(head - size) * step] *
top_data[(head - size) * step] / scale[(head - size) * step];
bottom_diff[(head - post_pad) * step] +=
top_diff[(head - post_pad) * step] *
pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
bottom_data[(head - post_pad) * step] * accum_ratio;
++head;
const int step = height * width;
const int pre_pad = size - (size + 1) / 2;
const int post_pad = size - pre_pad - 1;
int index = 0;
real accum = 0;
// Single sliding-window pass: `index` is the channel entering the window,
// `index - size` the channel leaving it, and `index - post_pad` the channel
// whose gradient is updated at this step.
while (index < channels + post_pad) {
if (index < channels) {
accum += top_diff[index * step] * top_data[index * step] /
scale[index * step];
}
if (index >= size) {
accum -= top_diff[(index - size) * step] *
top_data[(index - size) * step] / scale[(index - size) * step];
}
if (index >= post_pad) {
bottom_diff[(index - post_pad) * step] +=
top_diff[(index - post_pad) * step] *
pow(scale[(index - post_pad) * step], negative_beta) - cache_ratio *
bottom_data[(index - post_pad) * step] * accum;
}
++index;
}
}
}
......@@ -528,14 +492,12 @@ void hl_CMRNorm_backward(size_t frameCnt, const real* inV,
real *inDiff, size_t channels,
size_t height, size_t width, size_t sizeX,
real alpha, real beta) {
size_t threadsNum = frameCnt * height * width;
size_t blocksX = (threadsNum + 1024 - 1) / 1024;
size_t blocksY = 1;
dim3 threads(1024, 1);
dim3 grid(blocksX, blocksY);
KeCMRNormDiff <<<grid, threads, 0, STREAM_DEFAULT>>>
(threadsNum, inV, outV, scale, outDiff, channels,
height, width, sizeX, alpha, beta, inDiff);
size_t imageSize = frameCnt * height * width;
int blockSize = 1024;
int gridSize = (imageSize + 1024 - 1) / 1024;
KeCMRNormDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
(imageSize, inV, outV, scale, outDiff, channels,
height, width, sizeX, alpha, beta, inDiff);
CHECK_SYNC("hl_CMRNorm_backward");
}
......
......@@ -1021,11 +1021,10 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
testLayerGrad(config, "norm", 100, trans, useGpu);
}
// Runs testNormLayer for the "cmrnorm-projection" norm type, without
// transposition.
// NOTE(review): this listing is a diff shown without +/- markers, so old and
// new lines are mixed; which of the PADDLE_ONLY_CPU guard lines and useGpu
// calls survive in the new revision cannot be told from here — confirm
// against the commit.
#ifndef PADDLE_ONLY_CPU
TEST(Layer, NormLayer) {
testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */ true);
testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */ false);
}
#endif
void setPoolConfig(TestConfig* config,
PoolConfig* pool,
......
......@@ -2227,52 +2227,43 @@ void CpuMatrix::crossMapNormalFwd(Matrix& input,
size_t sizeX,
float scale,
float pow) {
size_t num = input.getHeight();
CHECK(isContiguous());
CHECK(input.isContiguous());
CHECK(denoms.isContiguous());
CHECK_EQ(getHeight(), input.getHeight());
CHECK_EQ(getWidth(), input.getWidth());
CHECK_EQ(getHeight(), denoms.getHeight());
CHECK_EQ(getWidth(), denoms.getWidth());
size_t numSample = input.getHeight();
size_t numCols = input.getWidth();
size_t height = imgSizeH;
size_t width = imgSizeW;
size_t numCols = input.getWidth();
CHECK(height * width * channels == input.getWidth());
CHECK(denoms.getHeight() == input.getHeight() &&
denoms.getWidth() == input.getWidth() && input.getHeight() == height_ &&
input.getWidth() == width_);
real* imgData = input.getData();
real* diffData = input.getData();
real* targetData = getData();
size_t halfSize = sizeX / 2;
size_t imgPixels = height * width;
// use integral vector to implement the sum in local window
real* integralData =
(real*)malloc((channels + sizeX + 1) * sizeof(real)); // NOLINT // TODO:
for (size_t i = 0; i <= halfSize; i++) {
integralData[i] = 0;
}
for (size_t i = 0; i < num; i++) {
real* targetPtr = targetData + i * numCols;
real* imgPtr = imgData + i * numCols;
real* diffPtr = diffData + i * numCols;
for (size_t m = 0; m < height; m++) {
for (size_t n = 0; n < width; n++) {
for (size_t c = 0; c < channels; c++) {
integralData[c + halfSize + 1] =
integralData[c + halfSize] + _square(*(diffPtr + c * imgPixels));
}
for (size_t k = channels + halfSize + 1; k <= channels + sizeX; k++) {
integralData[k] = integralData[channels + halfSize];
CHECK(height * width * channels == numCols);
// TODO(hedaoyuan) After the TensorExpress code is committed,
// refactor this code to remove the temporary memory.
CpuMatrix tmp(channels, height * width);
CpuMatrix tmp2(tmp.getData(), 1, channels * height * width);
denoms.zero();
const int start = -((int)sizeX - 1) / 2;
const int end = (int)sizeX + start;
for (size_t i = 0; i < numSample; i++) {
input.subMatrix(i, 1)->square2(tmp2);
CpuMatrix subDen(
denoms.subMatrix(i, 1)->getData(), channels, height * width);
for (int c = 0; c < (int)channels; c++) {
for (int s = start; s < end; s++) {
if (c + s >= 0 && c + s < (int)channels) {
subDen.subMatrix(c, 1)->add(*tmp.subMatrix(c + s, 1));
}
for (size_t k = 0; k < channels; k += 1) {
real a = integralData[k + sizeX] - integralData[k];
a = scale * a + 1;
targetPtr[k * imgPixels] = imgPtr[k * imgPixels] * _pow(a, -pow);
}
diffPtr++;
targetPtr++;
imgPtr++;
}
}
}
free(integralData);
integralData = NULL;
denoms.add(scale, (real)1);
this->pow2(denoms, -pow);
this->dotMul(input);
}
void CpuMatrix::crossMapNormalBwd(Matrix& localGrad,
......@@ -2282,19 +2273,63 @@ void CpuMatrix::crossMapNormalBwd(Matrix& localGrad,
size_t channels,
size_t imgSizeH,
size_t imgSizeW,
size_t size,
size_t sizeX,
float scale,
float pow) {
LOG(FATAL) << "Not implemented";
CHECK(imgSizeH * imgSizeW * channels == preOutV.getWidth());
CHECK(denoms.getHeight() == preOutV.getHeight() &&
denoms.getWidth() == preOutV.getWidth() &&
preOutV.getHeight() == height_ && preOutV.getWidth() == width_);
CHECK(denoms.getHeight() == localGrad.getHeight() &&
denoms.getWidth() == localGrad.getWidth());
// NOLINT // TODO:
CHECK(isContiguous());
CHECK(localGrad.isContiguous());
CHECK(denoms.isContiguous());
CHECK(preOutV.isContiguous());
CHECK(localOutV.isContiguous());
CHECK_EQ(getHeight(), localGrad.getHeight());
CHECK_EQ(getWidth(), localGrad.getWidth());
CHECK_EQ(getHeight(), denoms.getHeight());
CHECK_EQ(getWidth(), denoms.getWidth());
CHECK_EQ(getHeight(), preOutV.getHeight());
CHECK_EQ(getWidth(), preOutV.getWidth());
CHECK_EQ(getHeight(), localOutV.getHeight());
CHECK_EQ(getWidth(), localOutV.getWidth());
size_t numSample = getHeight();
size_t numCols = getWidth();
size_t height = imgSizeH;
size_t width = imgSizeW;
CHECK(height * width * channels == numCols);
// TODO(hedaoyuan) After the TensorExpress code is committed,
// refactor this code to remove the temporary memory.
CpuMatrix tmp(1, height * width);
const int start = -((int)sizeX) / 2;
const int end = (int)sizeX + start;
const real ratio = -(real)2 * scale * pow;
for (size_t i = 0; i < numSample; i++) {
CpuMatrix inputDiff(
this->subMatrix(i, 1)->getData(), channels, height * width);
CpuMatrix outDiff(
localGrad.subMatrix(i, 1)->getData(), channels, height * width);
CpuMatrix input(
preOutV.subMatrix(i, 1)->getData(), channels, height * width);
CpuMatrix output(
localOutV.subMatrix(i, 1)->getData(), channels, height * width);
CpuMatrix subDen(
denoms.subMatrix(i, 1)->getData(), channels, height * width);
for (int c = 0; c < (int)channels; c++) {
tmp.pow2(*subDen.subMatrix(c, 1), -pow);
inputDiff.subMatrix(c, 1)
->addDotMul(tmp, *outDiff.subMatrix(c, 1), (real)1, (real)1);
for (int s = start; s < end; s++) {
if (c + s >= 0 && c + s < (int)channels) {
tmp.dotMul(*outDiff.subMatrix(c + s, 1), *output.subMatrix(c + s, 1));
tmp.mulScalar(ratio);
tmp.dotDiv(tmp, *subDen.subMatrix(c + s, 1));
tmp.dotMul(*input.subMatrix(c, 1));
inputDiff.subMatrix(c, 1)->add(tmp);
}
}
}
}
}
/**
......
......@@ -1261,6 +1261,121 @@ TEST(Matrix, MaxOutFwdBwd) {
}
}
}
void testCrossMapNormalFwd(
int numSamples, int channels, int imgSizeH, int imgSizeW, int sizeX) {
float scale = 1.5;
float pow = 0.5;
int width = imgSizeH * imgSizeW * channels;
MatrixPtr input = CpuMatrix::create(numSamples, width, false, false);
MatrixPtr denorms = CpuMatrix::create(numSamples, width, false, false);
MatrixPtr target = CpuMatrix::create(numSamples, width, false, false);
MatrixPtr inputGpu = GpuMatrix::create(numSamples, width, false, true);
MatrixPtr denormsGpu = GpuMatrix::create(numSamples, width, false, true);
MatrixPtr targetGpu = GpuMatrix::create(numSamples, width, false, true);
input->randomizeUniform();
target->randomizeUniform();
inputGpu->copyFrom(*input);
targetGpu->copyFrom(*target);
target->crossMapNormalFwd(
*input, imgSizeH, imgSizeW, *denorms, channels, sizeX, scale, pow);
targetGpu->crossMapNormalFwd(
*inputGpu, imgSizeH, imgSizeW, *denormsGpu, channels, sizeX, scale, pow);
TensorCheckErr(*target, *targetGpu);
TensorCheckErr(*denorms, *denormsGpu);
}
// Sweeps testCrossMapNormalFwd over every combination of the sample counts,
// channel counts, image sizes and window sizes listed below.
TEST(Matrix, crossMapNormalFwd) {
  const int sampleCounts[] = {5, 32};
  const int channelCounts[] = {1, 5, 32};
  const int imageHeights[] = {5, 33, 100};
  const int imageWidths[] = {5, 32, 96};
  const int windowSizes[] = {1, 2, 3, 5, 7};

  for (int numSamples : sampleCounts) {
    for (int channels : channelCounts) {
      for (int imgSizeH : imageHeights) {
        for (int imgSizeW : imageWidths) {
          for (int sizeX : windowSizes) {
            // Log the configuration before running it.
            VLOG(3) << " numSamples=" << numSamples
                    << " channels=" << channels
                    << " imgSizeH=" << imgSizeH
                    << " imgSizeW=" << imgSizeW
                    << " sizeX=" << sizeX;
            testCrossMapNormalFwd(
                numSamples, channels, imgSizeH, imgSizeW, sizeX);
          }
        }
      }
    }
  }
}
void testCrossMapNormalBwd(
int numSamples, int channels, int imgSizeH, int imgSizeW, int sizeX) {
float scale = 1.5;
float pow = 0.5;
size_t width = imgSizeH * imgSizeW * channels;
MatrixPtr localGrad = CpuMatrix::create(numSamples, width, false, false);
MatrixPtr denoms = CpuMatrix::create(numSamples, width, false, false);
MatrixPtr output = CpuMatrix::create(numSamples, width, false, false);
MatrixPtr preOutV = CpuMatrix::create(numSamples, width, false, false);
MatrixPtr localOutV = CpuMatrix::create(numSamples, width, false, false);
localGrad->randomizeUniform();
denoms->randomizeUniform();
preOutV->randomizeUniform();
localOutV->randomizeUniform();
output->randomizeUniform();
denoms->add(0.01);
MatrixPtr localGradGpu = GpuMatrix::create(numSamples, width, false, true);
MatrixPtr denomsGpu = GpuMatrix::create(numSamples, width, false, true);
MatrixPtr outputGpu = GpuMatrix::create(numSamples, width, false, true);
MatrixPtr preOutVGpu = GpuMatrix::create(numSamples, width, false, true);
MatrixPtr localOutVGpu = GpuMatrix::create(numSamples, width, false, true);
localGradGpu->copyFrom(*localGrad);
denomsGpu->copyFrom(*denoms);
preOutVGpu->copyFrom(*preOutV);
localOutVGpu->copyFrom(*localOutV);
outputGpu->copyFrom(*output);
output->crossMapNormalBwd(*localGrad,
*denoms,
*preOutV,
*localOutV,
channels,
imgSizeH,
imgSizeW,
sizeX,
scale,
pow);
outputGpu->crossMapNormalBwd(*localGradGpu,
*denomsGpu,
*preOutVGpu,
*localOutVGpu,
channels,
imgSizeH,
imgSizeW,
sizeX,
scale,
pow);
TensorCheckErr(*output, *outputGpu);
}
// Sweeps testCrossMapNormalBwd over every combination of the sample counts,
// channel counts, image sizes and window sizes listed below.
TEST(Matrix, crossMapNormalBwd) {
  const int sampleCounts[] = {5, 32};
  const int channelCounts[] = {1, 5, 32};
  const int imageHeights[] = {5, 33, 100};
  const int imageWidths[] = {5, 32, 96};
  const int windowSizes[] = {1, 2, 3, 5, 7};

  for (int numSamples : sampleCounts) {
    for (int channels : channelCounts) {
      for (int imgSizeH : imageHeights) {
        for (int imgSizeW : imageWidths) {
          for (int sizeX : windowSizes) {
            // Log the configuration before running it.
            VLOG(3) << " numSamples=" << numSamples
                    << " channels=" << channels
                    << " imgSizeH=" << imgSizeH
                    << " imgSizeW=" << imgSizeW
                    << " sizeX=" << sizeX;
            testCrossMapNormalBwd(
                numSamples, channels, imgSizeH, imgSizeW, sizeX);
          }
        }
      }
    }
  }
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册
反馈
建议
客服 返回
顶部