From f27ece50c9c277c6ae11c1a69bd464e5add6e3fd Mon Sep 17 00:00:00 2001 From: jpark37 Date: Fri, 26 Jul 2019 20:45:33 -0700 Subject: [PATCH] libobs: Optimize lanczos shader, remove scaling Use bilinear filtering to reduce 36 taps to 25 for the regular path. This works because the middle weights are always between 0 and 1, allowing texture coordinates to be placed strategically to sample correct ratios. I'm not sure about the undistort path, so I've left that alone. Also remove scaling added in #526, after which weight normalization is unnecessary. If we want to use or invent an algorithm with alternate downscaling properties, that's fine, but I don't think we should change Lanczos scaling to mean something it's not. The scale implementation was also seen not working when applied directly to scene items because of assumptions made about the projection matrix. Intel GPA, SetStablePowerState, Intel HD Graphics 530, D3D11 644x478 -> 1323x1080: 3890 us -> 3401 us 1920x1080 -> 1280x720: 2555 us -> 2261 us --- libobs/data/lanczos_scale.effect | 196 ++++++++++++++++++++----------- 1 file changed, 125 insertions(+), 71 deletions(-) diff --git a/libobs/data/lanczos_scale.effect b/libobs/data/lanczos_scale.effect index 534b3c53..f1f05472 100644 --- a/libobs/data/lanczos_scale.effect +++ b/libobs/data/lanczos_scale.effect @@ -6,6 +6,7 @@ uniform float4x4 ViewProj; uniform texture2d image; +uniform float2 base_dimension; uniform float2 base_dimension_i; uniform float undistort_factor = 1.0; @@ -21,45 +22,47 @@ struct VertData { float2 uv : TEXCOORD0; }; -struct FragData { +struct VertOut { + float2 uv : TEXCOORD0; float4 pos : POSITION; - float2 uv : TEXCOORD0; - float2 scale : TEXCOORD1; }; -FragData VSDefault(VertData v_in) +struct FragData { + float2 uv : TEXCOORD0; +}; + +VertOut VSDefault(VertData v_in) { - FragData vert_out; - vert_out.pos = mul(float4(v_in.pos.xyz, 1.0), ViewProj); + VertOut vert_out; vert_out.uv = v_in.uv; - vert_out.scale = min(0.25 + abs(0.75 / mul(float4(1.0 / base_dimension_i.xy, 1.0, 1.0), ViewProj).xy), 1.0); + vert_out.pos = mul(float4(v_in.pos.xyz, 1.0), ViewProj); return vert_out; } -float sinc(float x) -{ - const float PIval = 3.1415926535897932384626433832795; - return sin(x * PIval) / (x * PIval); -} - -float weight(float x, float radius) +float weight(float x) { float ax = abs(x); if (x == 0.0) return 1.0; - else if (ax < radius) - return sinc(x) * sinc(x / radius); - else - return 0.0; + + float radius = 3.0; + if (ax < radius) { + float PIval = 3.14159265358979323846; + float x_pi = x * PIval; + float radius_i = 1.0 / 3.0; + return radius * sin(x_pi) * sin(x_pi * radius_i) / (x_pi * x_pi); + } + + return 0.0; } -float3 weight3(float x, float scale) +float3 weight3(float x) { return float3( - weight((x * 2.0 + 0.0 * 2.0 - 3.0) * scale, 3.0), - weight((x * 2.0 + 1.0 * 2.0 - 3.0) * scale, 3.0), - weight((x * 2.0 + 2.0 * 2.0 - 3.0) * scale, 3.0)); + weight(x * 2.0 - 3.0), + weight(x * 2.0 - 1.0), + weight(x * 2.0 + 1.0)); } float AspectUndistortX(float x, float a) @@ -74,72 +77,123 @@ float AspectUndistortU(float u) return AspectUndistortX((u - 0.5) * 2.0, undistort_factor) * 0.5 + 0.5; } -float2 pixel_coord(float xpos, float ypos) +float2 undistort_coord(float xpos, float ypos) { return float2(AspectUndistortU(xpos), ypos); } -float4 pixel(float xpos, float ypos, bool undistort) +float4 undistort_pixel(float xpos, float ypos) { - if (undistort) - return image.Sample(textureSampler, pixel_coord(xpos, ypos)); - else - return image.Sample(textureSampler, float2(xpos, ypos)); + return image.Sample(textureSampler, undistort_coord(xpos, ypos)); } -float4 get_line(float ypos, float3 xpos1, float3 xpos2, float3 rowtap1, - float3 rowtap2, bool undistort) +float4 undistort_line(float3 xpos012, float3 xpos345, float ypos, float3 rowtap024, + float3 rowtap135) { return - pixel(xpos1.r, ypos, undistort) * rowtap1.r + - pixel(xpos1.g, ypos, undistort) * rowtap2.r + - pixel(xpos1.b, ypos, undistort) * rowtap1.g + - pixel(xpos2.r, ypos, undistort) * rowtap2.g + - pixel(xpos2.g, ypos, undistort) * rowtap1.b + - pixel(xpos2.b, ypos, undistort) * rowtap2.b; + undistort_pixel(xpos012.x, ypos) * rowtap024.x + + undistort_pixel(xpos012.y, ypos) * rowtap135.x + + undistort_pixel(xpos012.z, ypos) * rowtap024.y + + undistort_pixel(xpos345.x, ypos) * rowtap135.y + + undistort_pixel(xpos345.y, ypos) * rowtap024.z + + undistort_pixel(xpos345.z, ypos) * rowtap135.z; } -float4 DrawLanczos(FragData v_in, bool undistort) +float4 DrawLanczos(FragData f_in, bool undistort) { float2 stepxy = base_dimension_i; - float2 pos = v_in.uv + stepxy * 0.5; - float2 f = frac(pos / stepxy); - - float3 rowtap1 = weight3((1.0 - f.x) / 2.0, v_in.scale.x); - float3 rowtap2 = weight3((1.0 - f.x) / 2.0 + 0.5, v_in.scale.x); - float3 coltap1 = weight3((1.0 - f.y) / 2.0, v_in.scale.y); - float3 coltap2 = weight3((1.0 - f.y) / 2.0 + 0.5, v_in.scale.y); - - /* make sure all taps added together is exactly 1.0, otherwise some - * (very small) distortion can occur */ - float suml = rowtap1.r + rowtap1.g + rowtap1.b + rowtap2.r + rowtap2.g + rowtap2.b; - float sumc = coltap1.r + coltap1.g + coltap1.b + coltap2.r + coltap2.g + coltap2.b; - rowtap1 /= suml; - rowtap2 /= suml; - coltap1 /= sumc; - coltap2 /= sumc; - - float2 xystart = (-2.5 - f) * stepxy + pos; - float3 xpos1 = float3(xystart.x , xystart.x + stepxy.x , xystart.x + stepxy.x * 2.0); - float3 xpos2 = float3(xystart.x + stepxy.x * 3.0, xystart.x + stepxy.x * 4.0, xystart.x + stepxy.x * 5.0); + float2 pos = f_in.uv + stepxy * 0.5; + float2 f = frac(pos * base_dimension); + + float2 f_rev_half = (-0.5) * f + 0.5; + float3 rowtap024 = weight3(f_rev_half.x); + float3 rowtap135 = weight3(f_rev_half.x + 0.5); + float3 coltap024 = weight3(f_rev_half.y); + float3 coltap135 = weight3(f_rev_half.y + 0.5); + + float2 uv0 = (-2.5 - f) * stepxy + pos; + float2 uv1 = uv0 + stepxy; + float2 uv2 = uv1 + stepxy; + float2 uv3 = uv2 + stepxy; + float2 uv4 = uv3 + stepxy; + float2 uv5 = uv4 + stepxy; + + if (undistort) { + float3 xpos012 = float3(uv0.x, uv1.x, uv2.x); + float3 xpos345 = float3(uv3.x, uv4.x, uv5.x); + return undistort_line(xpos012, xpos345, uv0.y, rowtap024, rowtap135) * coltap024.x + + undistort_line(xpos012, xpos345, uv1.y, rowtap024, rowtap135) * coltap135.x + + undistort_line(xpos012, xpos345, uv2.y, rowtap024, rowtap135) * coltap024.y + + undistort_line(xpos012, xpos345, uv3.y, rowtap024, rowtap135) * coltap135.y + + undistort_line(xpos012, xpos345, uv4.y, rowtap024, rowtap135) * coltap024.z + + undistort_line(xpos012, xpos345, uv5.y, rowtap024, rowtap135) * coltap135.z; + } - return - get_line(xystart.y , xpos1, xpos2, rowtap1, rowtap2, undistort) * coltap1.r + - get_line(xystart.y + stepxy.y , xpos1, xpos2, rowtap1, rowtap2, undistort) * coltap2.r + - get_line(xystart.y + stepxy.y * 2.0, xpos1, xpos2, rowtap1, rowtap2, undistort) * coltap1.g + - get_line(xystart.y + stepxy.y * 3.0, xpos1, xpos2, rowtap1, rowtap2, undistort) * coltap2.g + - get_line(xystart.y + stepxy.y * 4.0, xpos1, xpos2, rowtap1, rowtap2, undistort) * coltap1.b + - get_line(xystart.y + stepxy.y * 5.0, xpos1, xpos2, rowtap1, rowtap2, undistort) * coltap2.b; + float u_weight_sum = rowtap024.y + rowtap135.y; + float u_middle_offset = rowtap135.y * stepxy.x / u_weight_sum; + float u_middle = uv2.x + u_middle_offset; + + float v_weight_sum = coltap024.y + coltap135.y; + float v_middle_offset = coltap135.y * stepxy.y / v_weight_sum; + float v_middle = uv2.y + v_middle_offset; + + float2 coord_limit = base_dimension - 0.5; + float2 coord0_f = max(uv0 * base_dimension, 0.5); + float2 coord1_f = coord0_f + 1.0; + float2 coord4_f = min(coord0_f + 4.0, coord_limit); + float2 coord5_f = min(coord0_f + 5.0, coord_limit); + + int2 coord0 = int2(coord0_f); + int2 coord1 = int2(coord1_f); + int2 coord4 = int2(coord4_f); + int2 coord5 = int2(coord5_f); + + float4 row0 = image.Load(int3(coord0, 0)) * rowtap024.x; + row0 += image.Load(int3(coord1.x, coord0.y, 0))* rowtap135.x; + row0 += image.Sample(textureSampler, float2(u_middle, uv0.y)) * u_weight_sum; + row0 += image.Load(int3(coord4.x, coord0.y, 0)) * rowtap024.z; + row0 += image.Load(int3(coord5.x, coord0.y, 0)) * rowtap135.z; + float4 total = row0 * coltap024.x; + + float4 row1 = image.Load(int3(coord0.x, coord1.y, 0)) * rowtap024.x; + row1 += image.Load(int3(coord1.x, coord1.y, 0))* rowtap135.x; + row1 += image.Sample(textureSampler, float2(u_middle, uv1.y)) * u_weight_sum; + row1 += image.Load(int3(coord4.x, coord1.y, 0)) * rowtap024.z; + row1 += image.Load(int3(coord5.x, coord1.y, 0)) * rowtap135.z; + total += row1 * coltap135.x; + + float4 row23 = image.Sample(textureSampler, float2(uv0.x, v_middle)) * rowtap024.x; + row23 += image.Sample(textureSampler, float2(uv1.x, v_middle))* rowtap135.x; + row23 += image.Sample(textureSampler, float2(u_middle, v_middle)) * u_weight_sum; + row23 += image.Sample(textureSampler, float2(uv4.x, v_middle)) * rowtap024.z; + row23 += image.Sample(textureSampler, float2(uv5.x, v_middle)) * rowtap135.z; + total += row23 * v_weight_sum; + + float4 row4 = image.Load(int3(coord0.x, coord4.y, 0)) * rowtap024.x; + row4 += image.Load(int3(coord1.x, coord4.y, 0))* rowtap135.x; + row4 += image.Sample(textureSampler, float2(u_middle, uv4.y)) * u_weight_sum; + row4 += image.Load(int3(coord4.x, coord4.y, 0)) * rowtap024.z; + row4 += image.Load(int3(coord5.x, coord4.y, 0)) * rowtap135.z; + total += row4 * coltap024.z; + + float4 row5 = image.Load(int3(coord0.x, coord5.y, 0)) * rowtap024.x; + row5 += image.Load(int3(coord1.x, coord5.y, 0))* rowtap135.x; + row5 += image.Sample(textureSampler, float2(u_middle, uv5.y)) * u_weight_sum; + row5 += image.Load(int3(coord4.x, coord5.y, 0)) * rowtap024.z; + row5 += image.Load(int3(coord5, 0)) * rowtap135.z; + total += row5 * coltap135.z; + + return total; } -float4 PSDrawLanczosRGBA(FragData v_in, bool undistort) : TARGET +float4 PSDrawLanczosRGBA(FragData f_in, bool undistort) : TARGET { - return DrawLanczos(v_in, undistort); + return DrawLanczos(f_in, undistort); } -float4 PSDrawLanczosRGBADivide(FragData v_in) : TARGET +float4 PSDrawLanczosRGBADivide(FragData f_in) : TARGET { - float4 rgba = DrawLanczos(v_in, false); + float4 rgba = DrawLanczos(f_in, false); float alpha = rgba.a; float multiplier = (alpha > 0.0) ? (1.0 / alpha) : 0.0; return float4(rgba.rgb * multiplier, alpha); @@ -150,7 +204,7 @@ technique Draw pass { vertex_shader = VSDefault(v_in); - pixel_shader = PSDrawLanczosRGBA(v_in, false); + pixel_shader = PSDrawLanczosRGBA(f_in, false); } } @@ -159,7 +213,7 @@ technique DrawAlphaDivide pass { vertex_shader = VSDefault(v_in); - pixel_shader = PSDrawLanczosRGBADivide(v_in); + pixel_shader = PSDrawLanczosRGBADivide(f_in); } } @@ -168,6 +222,6 @@ technique DrawUndistort pass { vertex_shader = VSDefault(v_in); - pixel_shader = PSDrawLanczosRGBA(v_in, true); + pixel_shader = PSDrawLanczosRGBA(f_in, true); } } -- GitLab