新增op和Tensor实现耗时对比 (#23237) · Issue · PaddlePaddle / Paddle

新增op和Tensor实现耗时对比

Created by: zhengzhe97

同一个功能，分别使用新增的 Python op 和 Tensor 运算来实现，两者的耗时相差较大。

Python op 实现：

def crop(masks, boxes):
    """
    Build the binary "crop" mask that marks, for every instance, the pixels
    lying inside its predicted bbox (grown by one pixel of padding).
    Vectorized by Chong (thanks Chong).

    Note: this returns only the 0/1 mask. It does not multiply the mask into
    ``masks``; ``masks`` is read solely for its shape.

    Args:
        - masks should be a size [h, w, n] tensor of masks
        - boxes should be a size [n, 4] tensor of bbox coords in relative point form
          (columns 0 and 2 are scaled by w, columns 1 and 3 by h)

    Returns:
        A float32 array, 1.0 inside each padded box and 0.0 elsewhere.
    """
    padding = 1
    masks = np.array(masks)
    boxes = np.array(boxes)
    h, w, n = np.shape(masks)
    x1, x2 = sanitize_coordinates(boxes[:, 0], boxes[:, 2], w, padding, cast=False)
    y1, y2 = sanitize_coordinates(boxes[:, 1], boxes[:, 3], h, padding, cast=False)

    # Per-pixel index grids: x_idx varies along axis 1 (width),
    # y_idx varies along axis 0 (height).
    x_idx = np.broadcast_to(np.reshape(np.arange(w, dtype=x1.dtype), (1, -1, 1)), (h, w, n))
    y_idx = np.broadcast_to(np.reshape(np.arange(h, dtype=x1.dtype), (-1, 1, 1)), (h, w, n))

    # Box edges laid out along the last (instance) axis so they broadcast
    # against the index grids.
    x_lo = np.reshape(x1, (1, 1, -1))
    x_hi = np.reshape(x2, (1, 1, -1))
    y_lo = np.reshape(y1, (1, 1, -1))
    y_hi = np.reshape(y2, (1, 1, -1))

    # Half-open box test: lo <= index < hi on both axes.
    inside = (x_idx >= x_lo) & (x_idx < x_hi) & (y_idx >= y_lo) & (y_idx < y_hi)
    return inside.astype('float32')

def sanitize_coordinates(_x1, _x2, img_size:int, padding:int=0, cast:bool=True):
    """
    Converts a pair of relative coordinate arrays to absolute pixel coordinates,
    orders them, pads them and clamps them to the image.

    Steps:
        1. Scale both inputs by ``img_size`` (relative -> absolute).
        2. If ``cast`` is true, truncate both to int32.
        3. Take the element-wise minimum as x1 and the maximum as x2.
        4. Grow the interval by ``padding`` on each side, then clamp x1 to be
           >= 0 and x2 to be <= ``img_size``.

    The inputs are not modified; new arrays are returned.

    Args:
        _x1, _x2: NumPy arrays of relative coordinates, in either order.
        img_size: size in pixels of the image along this axis.
        padding: number of pixels added on each side of the interval.
        cast: if false, the result is not cast to int32.

    Returns:
        Tuple ``(x1, x2)`` of arrays with the same shape as the inputs.
    """
    _x1 = _x1 * img_size
    _x2 = _x2 * img_size
    if cast:
        _x1 = _x1.astype('int32')
        _x2 = _x2.astype('int32')

    # Order each pair so the smaller coordinate comes first.
    lower = np.minimum(_x1, _x2)
    upper = np.maximum(_x1, _x2)

    # One-sided clamps: only the image borders bound the result.
    x1 = np.maximum(lower - padding, 0)
    x2 = np.minimum(upper + padding, img_size)

    return x1, x2

Tensor实现：

def crop_tensor(masks, boxes):
    """
    "Crop" predicted masks by zeroing out everything not in the predicted bbox,
    expressed with fluid ops instead of NumPy.

    Args:
        - masks: tensor whose first two dimensions are read as height and width
          (assumed [h, w, n], mirroring the NumPy ``crop`` -- TODO confirm)
        - boxes: tensor of bbox coords in relative point form; columns 0 and 2
          are scaled by the width, columns 1 and 3 by the height

    Returns:
        ``masks`` multiplied element-wise by a 0/1 float32 mask. The mask has
        ``stop_gradient`` set, so no gradient flows through it.
    """
    # Timing probe used for the benchmark (kept disabled):
    # fluid.layers.py_func(func=start_time, x=fluid.layers.shape(masks) ,out=None)
    padding = 1
    s = fluid.layers.shape(masks)
    h = fluid.layers.cast(s[0], 'float32')
    w = fluid.layers.cast(s[1], 'float32')
    x1, x2 = sanitize_coordinates_tensor(boxes[:, 0], boxes[:, 2], w, padding, cast=False)
    y1, y2 = sanitize_coordinates_tensor(boxes[:, 1], boxes[:, 3], h, padding, cast=False)

    # Per-pixel index grids expanded to the shape of ``masks``:
    # x_idx varies along axis 1 (width), y_idx along axis 0 (height).
    x_idx = fluid.layers.expand_as(
        fluid.layers.reshape(fluid.layers.range(0, w, 1, 'float32'), shape=(1, -1, 1)),
        target_tensor=masks)
    y_idx = fluid.layers.expand_as(
        fluid.layers.reshape(fluid.layers.range(0, h, 1, 'float32'), shape=(-1, 1, 1)),
        target_tensor=masks)

    # Box edges laid out along the last axis so they broadcast against the grids.
    x_lo = fluid.layers.reshape(x1, shape=(1, 1, -1))
    x_hi = fluid.layers.reshape(x2, shape=(1, 1, -1))
    y_lo = fluid.layers.reshape(y1, shape=(1, 1, -1))
    y_hi = fluid.layers.reshape(y2, shape=(1, 1, -1))

    # Combine the four half-open comparisons as booleans and cast once,
    # instead of casting each comparison to float and multiplying them.
    inside_x = fluid.layers.logical_and(x_idx >= x_lo, x_idx < x_hi)
    inside_y = fluid.layers.logical_and(y_idx >= y_lo, y_idx < y_hi)
    crop_mask = fluid.layers.cast(fluid.layers.logical_and(inside_x, inside_y), 'float32')

    # Timing probe used for the benchmark (kept disabled):
    # fluid.layers.py_func(func=end_time, x=fluid.layers.shape(masks) ,out=None)
    crop_mask.stop_gradient = True
    return masks * crop_mask
    
def sanitize_coordinates_tensor(_x1, _x2, img_size, padding:int=0, cast:bool=True):
    """
    Fluid-op counterpart of ``sanitize_coordinates``: converts relative
    coordinates to absolute ones, orders them, pads them and clamps them.

    Args:
        _x1, _x2: tensors of relative coordinates, in either order.
        img_size: tensor holding the image size along this axis (it is passed
            to ``elementwise_mul``, so it must be a tensor, not a Python number).
        padding: number of pixels added on each side of the interval.
        cast: if true, the scaled coordinates are cast to int32 before ordering.

    Returns:
        Tuple ``(x1, x2)``: x1 is the smaller coordinate minus ``padding``,
        clamped at 0 from below; x2 is the larger coordinate plus ``padding``,
        clamped at ``img_size`` from above.
    """
    _x1 = fluid.layers.elementwise_mul(fluid.layers.cast(_x1, 'float32'), img_size)
    _x2 = fluid.layers.elementwise_mul(fluid.layers.cast(_x2, 'float32'), img_size)
    upper_bound = img_size
    if cast:
        _x1 = fluid.layers.cast(_x1, 'int32')
        _x2 = fluid.layers.cast(_x2, 'int32')
        # Keep the bound's dtype in step with the coordinates.
        upper_bound = fluid.layers.cast(img_size, 'int32')
    x1 = fluid.layers.elementwise_min(_x1, _x2)
    x2 = fluid.layers.elementwise_max(_x1, _x2)
    # clip() takes constant bounds on both sides; 10000 is only a large
    # stand-in for "no upper limit" on x1.
    x1 = fluid.layers.clip(x=x1-padding, min=0, max=10000)
    # Clamp against the real image size rather than a hard-coded constant,
    # so the result is correct for any input resolution.
    x2 = fluid.layers.elementwise_min(x2 + padding, upper_bound)

    return x1, x2

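对于 Tensor 实现，在计算前后各添加一个 py_func op 来计时（即上面 crop_tensor 中被注释掉的 fluid.layers.py_func(func=start_time, …) 和 fluid.layers.py_func(func=end_time, …)）。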

对于 Python op，直接使用 time.clock() 计算前后时差。

结果发现基于 Tensor 的实现慢了很多，想知道是什么原因。

PaddlePaddle / Paddle 大约 2 年前同步成功

新增op和Tensor实现耗时对比

PaddlePaddle / Paddle
大约 2 年前同步成功