# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""operator dsl function:reduce_max_ad"""
import akg.tvm
import akg
import akg.backend as cce
from akg.utils import kernel_exec as utils
from akg.utils import custom_tiling as ct_util
from akg.ops.math.cast import cast
from akg.ops.math import reduce_max
from akg.utils import format_transform as ft_util

# Preset tiling dims, keyed by str((data_shape, axis, keepdims)). Currently empty.
reduce_max_ad_set_dim_map = {
}


def reduce_max_ad_set_dim_func(data, head, axis, keepdims):
    """Return the preset dim info registered for this configuration, if any.

    The lookup key is ``str((tuple(data.shape), tuple(axis), keepdims))``.

    NOTE(review): this function is registered on ``reduce_max_ad``, whose
    positional order is ``(head, data, axis, keepdims)``. If the registration
    decorator forwards positional arguments unchanged, ``data`` here would
    receive the head tensor. The signature is left as-is for compatibility;
    confirm against ``ct_util.reg_set_dim_func`` before adding map entries.

    Args:
        data: Object exposing ``.shape``; its shape is part of the key.
        head: Unused.
        axis: Iterable of reduction axes (must be accepted by ``tuple()``).
        keepdims: Part of the key.

    Returns:
        ``ct_util.set_dims(...)`` of the registered entry, or ``""`` when the
        key is not present in ``reduce_max_ad_set_dim_map``.
    """
    hash_key = str((tuple(data.shape), tuple(axis), keepdims))
    if hash_key in reduce_max_ad_set_dim_map:
        return ct_util.set_dims(reduce_max_ad_set_dim_map[hash_key])
    return ""


@ct_util.reg_set_dim_func(reduce_max_ad_set_dim_func)
def reduce_max_ad(head, data, axis, keepdims):
    """Differentiate ``reduce_max(data, axis, keepdims)`` w.r.t. ``data``.

    Args:
        head: Adjoint passed to ``akg.differentiate`` as the head.
        data: Input of the forward ``reduce_max``.
        axis: Reduction axes forwarded to ``reduce_max``.
        keepdims: Forwarded to ``reduce_max``.

    Returns:
        The first (and only requested) result of ``akg.differentiate``.
    """
    forward = reduce_max.reduce_max(data, axis, keepdims)
    jacs = akg.differentiate(forward, [data], head)
    return jacs[0]


def reduce_max_ad_optimized(head, data, axis, keepdims):
    """Differentiate ``reduce_max`` using a hand-written adjoint computation.

    The automatic differentiation of the forward op is overridden by
    ``custom_reduce_max_fdiff``, which selects the head value where the input
    equals the maximum along ``axis`` and zero elsewhere.

    Args:
        head: Adjoint of the ``reduce_max`` output.
        data: Input of the forward ``reduce_max``.
        axis: Reduction axes.
        keepdims: Whether the forward output (and thus ``head``) keeps the
            reduced axes.

    Returns:
        The tensor produced by ``akg.differentiate`` for ``data``.
    """

    def get_shape(pld):
        """Return the shape of ``pld`` as a list of ``.value`` entries."""
        return [d.value for d in pld.shape]

    def custom_reduce_max_fdiff(out, inputs, grad, ad_attrs, new_pld_array):
        """Build the adjoint compute for reduce_max.

        ``out``, ``ad_attrs`` and ``new_pld_array`` are unused here; they are
        presumably required by the override callback signature of
        ``akg.differentiate`` (confirm against its documentation).
        """
        data_ = inputs[0]
        shape = get_shape(data_)
        # Always keep the reduced axes for the max: the result is broadcast
        # back to the full input shape, which needs the size-1 axes in place.
        # Using the caller's keepdims=False here would drop those axes and
        # misalign the broadcast. The manual-schedule variant does the same.
        max_ = akg.lang.cce.reduce_max(data_, axis=axis, keepdims=True)
        max_broadcast = akg.lang.cce.broadcast(max_, shape)
        # The head is indexed with the caller's keepdims, since its shape
        # follows the forward output.
        return [akg.tvm.compute(
            shape,
            lambda *indices: akg.tvm.expr.Select(
                data_(*indices) == max_broadcast(*indices),
                grad(*get_reduced_indices(*indices, axis=axis, keepdims=keepdims)),
                akg.tvm.const(0, dtype=data_.dtype)),
            name="reduce_max_ad2")]

    forward = reduce_max.reduce_max(data, axis, keepdims)

    [dl_ddata] = akg.differentiate(forward, [data], head, None, None,
                                   override={forward: ([data], custom_reduce_max_fdiff)})
    return dl_ddata


def get_reduced_indices(*indices, axis, keepdims):
    """Get the adjoint for an arbitrary dimension input.

    Maps the indices of an element of the un-reduced tensor to the indices of
    the corresponding element of the reduced tensor.

    Args:
        *indices: Indices into the un-reduced tensor.
        axis: Reduction axes; converted with ``list(axis)`` and normalised by
            ``ft_util.refine_reduce_axis``.
        keepdims: If true, reduced positions are replaced by ``0``; otherwise
            they are dropped.

    Returns:
        Tuple of indices into the reduced tensor.
    """
    # get all indices
    indices_list = list(indices)
    # list of reduction axis: transform negative indices into positive
    # axis in this list won't exist after the reduction
    axis_list = ft_util.refine_reduce_axis(indices_list, list(axis))
    # get indices after reduction
    if keepdims:
        return tuple(0 if i in axis_list else index_i
                     for i, index_i in enumerate(indices_list))
    return tuple(index_i for i, index_i in enumerate(indices_list)
                 if i not in axis_list)


def reduce_max_ad_optimized_manual_schedule(input_shape, dtype, axis, keepdims, polyhedral=True, attrs=None):
    """Build the reduce_max adjoint kernel with a hand-written schedule.

    Args:
        input_shape: Shape of the input data placeholder.
        dtype: Dtype of the input data placeholder. When it is not
            ``'float16'`` the inputs are cast to float16 for the computation
            and the result is cast back.
        axis: Reduction axes.
        keepdims: Whether the forward output keeps the reduced axes.
        polyhedral: Forwarded to ``akg.build``.
        attrs: Build attributes; ``attrs['tile']`` must hold one split factor
            per output dimension.

    Returns:
        The module returned by ``akg.build``.

    Raises:
        Exception: If ``attrs`` is None.
        AssertionError: If the number of tiling factors differs from the
            output rank.
    """
    # fail fast: the tiling attributes are mandatory, so check them before
    # building the compute graph and the schedule
    if attrs is None:
        raise Exception('attrs is None')

    def custom_reduce_max_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        """Adjoint compute used to override the automatic differentiation."""
        data_ = inputs[0]
        shape = data_.shape
        # reduces maximum value for each column
        max_ = akg.lang.cce.reduce_max(data_, axis=axis, keepdims=True)

        # copies reduced values to get the original shape
        max_broadcast = akg.lang.cce.broadcast(max_, shape)

        # head broadcast is needed to generate correct cce code for the selection operation
        head_broadcast = akg.tvm.compute(
            shape,
            lambda *indices: head_(*get_reduced_indices(*indices, axis=axis, keepdims=keepdims)))

        # zero all the values that are not max values on the result, remaining is equal to the adjoint of the output
        # NOTE: the operand order of this Select fixes the order of
        # op.input_tensors that the schedule below indexes into.
        max_values_and_zeros = akg.tvm.compute(
            shape,
            lambda *indices: akg.tvm.expr.Select(
                data_(*indices) == max_broadcast(*indices),
                head_broadcast(*indices),
                akg.tvm.const(0, dtype='float16')),
            name="reduce_max_ad2")

        # cast data back to the original dtype
        if dtype != 'float16':
            return [cast(max_values_and_zeros, dtype)]
        return [max_values_and_zeros]

    # tensor for the input data
    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # computation of reduce max
    # not used on the schedule because this is the differentiation op
    forward = reduce_max.reduce_max(data, axis, keepdims)

    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(forward.shape, name="head", dtype=forward.dtype)

    # cast input data
    if dtype != 'float16':
        data_cast = cast(data, "float16")
        head_cast = cast(head, "float16")
    else:
        data_cast = data
        head_cast = head

    # override differentiation computation with custom function
    [dl_ddata] = akg.differentiate(forward, [data_cast], head_cast, None, None,
                                   override={forward: ([data_cast], custom_reduce_max_fdiff)})

    # get tensors from custom function
    if dtype != 'float16':
        max_values_and_zeros = dl_ddata.op.input_tensors[0]
        max_broadcast = max_values_and_zeros.op.input_tensors[1]
        max_ = max_broadcast.op.input_tensors[0]
        head_broadcast = max_values_and_zeros.op.input_tensors[2]
    else:
        max_broadcast = dl_ddata.op.input_tensors[1]
        max_ = max_broadcast.op.input_tensors[0]
        head_broadcast = dl_ddata.op.input_tensors[2]

    # schedule for differentiation operation
    # inputs: data and head
    s = akg.tvm.create_schedule([dl_ddata.op])

    # cache reads of inputs
    if dtype != 'float16':
        head_ub = s.cache_read(head, "local.UB", [head_cast])
        data_ub = s.cache_read(data, "local.UB", [data_cast])
    else:
        # no cast operation
        head_ub = s.cache_read(head_cast, "local.UB", [head_broadcast])
        data_ub = s.cache_read(data_cast, "local.UB", [max_, dl_ddata])

    # cache write for the output
    dl_ddata_ub = s.cache_write(dl_ddata, "local.UB")

    # get tiling attributes
    tiling_factors = attrs['tile']
    split_iterators = []
    assert len(tiling_factors) == len(dl_ddata.shape)
    # split the final compute and save the iterators
    for index, factor in enumerate(tiling_factors):
        split_iterators.append(s[dl_ddata].split(dl_ddata.op.axis[index], factor))

    # get iterators
    iterator1 = split_iterators[0][0]

    # move computation of when there is a cast
    if dtype != "float16":
        s[data_cast].compute_at(s[dl_ddata], iterator1)
        s[data_cast].set_scope("local.UB")
        s[head_cast].compute_at(s[dl_ddata], iterator1)
        s[head_cast].set_scope("local.UB")
        s[max_values_and_zeros].compute_at(s[dl_ddata], iterator1)
        s[max_values_and_zeros].set_scope("local.UB")

    # move cache reads and writes
    s[data_ub].compute_at(s[dl_ddata], iterator1)
    s[head_ub].compute_at(s[dl_ddata], iterator1)
    s[dl_ddata_ub].compute_at(s[dl_ddata], iterator1)

    # move computation of the differentiation
    s[max_].compute_at(s[dl_ddata], iterator1)
    s[max_].set_scope("local.UB")
    s[max_broadcast].compute_at(s[dl_ddata], iterator1)
    s[max_broadcast].set_scope("local.UB")
    s[head_broadcast].compute_at(s[dl_ddata], iterator1)
    s[head_broadcast].set_scope("local.UB")

    # single source of truth for the kernel name used by both the build and
    # the generated-source dump
    kernel_name = "reduce_max_ad_manual_schedule"
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [head, data, dl_ddata], "cce",
                        name=kernel_name, attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        utils.create_cce(kernel_name, './', source_code)
    return mod