Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
小白菜888
Ffmpeg
提交
422b2362
F
Ffmpeg
项目概览
小白菜888
/
Ffmpeg
通知
3
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
F
Ffmpeg
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
422b2362
编写于
5月 21, 2011
作者:
L
Loren Merritt
提交者:
Reinhard Tartler
5月 22, 2011
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
dct32_sse: eliminate some spills
125->104 cycles on penryn (x86_64 only)
上级
165c7c42
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
176 addition
and
60 deletion
+176
-60
libavcodec/x86/dct32_sse.asm
libavcodec/x86/dct32_sse.asm
+153
-50
libavcodec/x86/fmtconvert.asm
libavcodec/x86/fmtconvert.asm
+3
-10
libavcodec/x86/x86util.asm
libavcodec/x86/x86util.asm
+20
-0
未找到文件。
libavcodec/x86/dct32_sse.asm
浏览文件 @
422b2362
...
...
@@ -20,7 +20,7 @@
;******************************************************************************
%include "x86inc.asm"
%include "
config
.asm"
%include "
x86util
.asm"
SECTION
_RODATA
32
...
...
@@ -37,8 +37,9 @@ ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043
dd
1.000000
,
1.000000
,
1.306563
,
0.541196
dd
1.000000
,
0.707107
,
1.000000
,
-
0.707107
dd
1.000000
,
0.707107
,
1.000000
,
-
0.707107
dd
0.707107
,
0.707107
,
0.707107
,
0.707107
align
32
ps_p1p1m1m1:
dd
0
,
0
,
0x80000000
,
0x80000000
,
0
,
0
,
0x80000000
,
0x80000000
%macro BUTTERFLY_SSE 4
...
...
@@ -77,6 +78,18 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
BUTTERFLY0
%
1
,
%
2
,
%
3
,
%
4
,
0xb1
%endmacro
%macro BUTTERFLY3V 5
movaps
m
%
5
,
m
%
1
addps
m
%
1
,
m
%
2
subps
m
%
5
,
m
%
2
SWAP
%
2
,
%
5
mulps
m
%
2
,
[
ps_cos_vec
+
192
]
movaps
m
%
5
,
m
%
3
addps
m
%
3
,
m
%
4
subps
m
%
4
,
m
%
5
mulps
m
%
4
,
[
ps_cos_vec
+
192
]
%endmacro
%macro PASS6_AND_PERMUTE 0
mov
tmpd
,
[
outq
+
4
]
movss
m7
,
[
outq
+
72
]
...
...
@@ -269,9 +282,131 @@ INIT_XMM
%define BUTTERFLY BUTTERFLY_SSE
%define BUTTERFLY0 BUTTERFLY0_SSE
%ifdef ARCH_X86_64
%define SPILL SWAP
%define UNSPILL SWAP
%macro PASS5 0
nop
; FIXME code alignment
SWAP
5
,
8
SWAP
4
,
12
SWAP
6
,
14
SWAP
7
,
13
SWAP
0
,
15
PERMUTE
9
,
10
,
10
,
12
,
11
,
14
,
12
,
9
,
13
,
11
,
14
,
13
TRANSPOSE4x4PS
8
,
9
,
10
,
11
,
0
BUTTERFLY3V
8
,
9
,
10
,
11
,
0
addps
m10
,
m11
TRANSPOSE4x4PS
12
,
13
,
14
,
15
,
0
BUTTERFLY3V
12
,
13
,
14
,
15
,
0
addps
m14
,
m15
addps
m12
,
m14
addps
m14
,
m13
addps
m13
,
m15
%endmacro
%macro PASS6 0
SWAP
9
,
12
SWAP
11
,
14
movss
[
outq
+
0x00
],
m8
pshuflw
m0
,
m8
,
0xe
movss
[
outq
+
0x10
],
m9
pshuflw
m1
,
m9
,
0xe
movss
[
outq
+
0x20
],
m10
pshuflw
m2
,
m10
,
0xe
movss
[
outq
+
0x30
],
m11
pshuflw
m3
,
m11
,
0xe
movss
[
outq
+
0x40
],
m12
pshuflw
m4
,
m12
,
0xe
movss
[
outq
+
0x50
],
m13
pshuflw
m5
,
m13
,
0xe
movss
[
outq
+
0x60
],
m14
pshuflw
m6
,
m14
,
0xe
movaps
[
outq
+
0x70
],
m15
pshuflw
m7
,
m15
,
0xe
addss
m0
,
m1
addss
m1
,
m2
movss
[
outq
+
0x08
],
m0
addss
m2
,
m3
movss
[
outq
+
0x18
],
m1
addss
m3
,
m4
movss
[
outq
+
0x28
],
m2
addss
m4
,
m5
movss
[
outq
+
0x38
],
m3
addss
m5
,
m6
movss
[
outq
+
0x48
],
m4
addss
m6
,
m7
movss
[
outq
+
0x58
],
m5
movss
[
outq
+
0x68
],
m6
movss
[
outq
+
0x78
],
m7
PERMUTE
1
,
8
,
3
,
9
,
5
,
10
,
7
,
11
,
9
,
12
,
11
,
13
,
13
,
14
,
8
,
1
,
10
,
3
,
12
,
5
,
14
,
7
movhlps
m0
,
m1
pshufd
m1
,
m1
,
3
SWAP
0
,
2
,
4
,
6
,
8
,
10
,
12
,
14
SWAP
1
,
3
,
5
,
7
,
9
,
11
,
13
,
15
%rep 7
movhlps
m0
,
m1
pshufd
m1
,
m1
,
3
addss
m15
,
m1
SWAP
0
,
2
,
4
,
6
,
8
,
10
,
12
,
14
SWAP
1
,
3
,
5
,
7
,
9
,
11
,
13
,
15
%endrep
%assign i 4
%rep 15
addss
m0
,
m1
movss
[
outq
+
i
],
m0
SWAP
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
%assign i i+8
%endrep
%endmacro
%else
; ARCH_X86_32
%macro SPILL 2
; xmm#, mempos
movaps
[
outq
+
(
%
2
-
8
)
*
16
],
m
%
1
%endmacro
%macro UNSPILL 2
movaps
m
%
1
,
[
outq
+
(
%
2
-
8
)
*
16
]
%endmacro
%define PASS6 PASS6_AND_PERMUTE
%macro PASS5 0
movaps
m2
,
[
ps_cos_vec
+
160
]
shufps
m3
,
m3
,
0xcc
BUTTERFLY3
m5
,
m3
,
m2
,
m1
SPILL
5
,
8
UNSPILL
1
,
9
BUTTERFLY3
m1
,
m3
,
m2
,
m5
SPILL
1
,
14
BUTTERFLY3
m4
,
m3
,
m2
,
m5
SPILL
4
,
12
BUTTERFLY3
m7
,
m3
,
m2
,
m5
SPILL
7
,
13
UNSPILL
5
,
10
BUTTERFLY3
m5
,
m3
,
m2
,
m7
SPILL
5
,
10
UNSPILL
4
,
11
BUTTERFLY3
m4
,
m3
,
m2
,
m7
SPILL
4
,
11
BUTTERFLY3
m6
,
m3
,
m2
,
m7
SPILL
6
,
9
BUTTERFLY3
m0
,
m3
,
m2
,
m7
SPILL
0
,
15
%endmacro
%endif
INIT_XMM
; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
cglobal
dct32_float_sse
,
2
,
3
,
8
,
out
,
in
,
tmp
cglobal
dct32_float_sse
,
2
,
3
,
16
,
out
,
in
,
tmp
; pass 1
movaps
m0
,
[
inq
+
0
]
...
...
@@ -287,8 +422,8 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
; pass 2
movaps
m2
,
[
ps_cos_vec
+
64
]
BUTTERFLY
m1
,
m4
,
m2
,
m3
movaps
[
outq
+
48
],
m
1
movaps
[
outq
+
0
],
m4
SPILL
1
,
1
1
SPILL
4
,
8
; pass 1
movaps
m1
,
[
inq
+
16
]
...
...
@@ -313,17 +448,17 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
movaps
m2
,
[
ps_cos_vec
+
96
]
shufps
m1
,
m1
,
0x1b
BUTTERFLY
m0
,
m1
,
m2
,
m3
movaps
[
outq
+
112
],
m0
movaps
[
outq
+
96
],
m1
SPILL
0
,
15
SPILL
1
,
14
movaps
m0
,
[
outq
+
0
]
UNSPILL
0
,
8
shufps
m5
,
m5
,
0x1b
BUTTERFLY
m0
,
m5
,
m2
,
m3
movaps
m1
,
[
outq
+
48
]
UNSPILL
1
,
11
shufps
m6
,
m6
,
0x1b
BUTTERFLY
m1
,
m6
,
m2
,
m3
movaps
[
outq
+
48
],
m
1
SPILL
1
,
1
1
shufps
m4
,
m4
,
0x1b
BUTTERFLY
m7
,
m4
,
m2
,
m3
...
...
@@ -335,57 +470,25 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
BUTTERFLY2
m5
,
m3
,
m2
,
m1
BUTTERFLY2
m0
,
m3
,
m2
,
m1
movaps
[
outq
+
16
],
m0
SPILL
0
,
9
BUTTERFLY2
m6
,
m3
,
m2
,
m1
movaps
[
outq
+
32
],
m6
SPILL
6
,
10
movaps
m0
,
[
outq
+
48
]
UNSPILL
0
,
11
BUTTERFLY2
m0
,
m3
,
m2
,
m1
movaps
[
outq
+
48
],
m0
SPILL
0
,
11
BUTTERFLY2
m4
,
m3
,
m2
,
m1
BUTTERFLY2
m7
,
m3
,
m2
,
m1
movaps
m6
,
[
outq
+
96
]
UNSPILL
6
,
14
BUTTERFLY2
m6
,
m3
,
m2
,
m1
movaps
m0
,
[
outq
+
112
]
UNSPILL
0
,
15
BUTTERFLY2
m0
,
m3
,
m2
,
m1
; pass 5
movaps
m2
,
[
ps_cos_vec
+
160
]
shufps
m3
,
m3
,
0xcc
BUTTERFLY3
m5
,
m3
,
m2
,
m1
movaps
[
outq
+
0
],
m5
movaps
m1
,
[
outq
+
16
]
BUTTERFLY3
m1
,
m3
,
m2
,
m5
movaps
[
outq
+
96
],
m1
BUTTERFLY3
m4
,
m3
,
m2
,
m5
movaps
[
outq
+
64
],
m4
BUTTERFLY3
m7
,
m3
,
m2
,
m5
movaps
[
outq
+
80
],
m7
movaps
m5
,
[
outq
+
32
]
BUTTERFLY3
m5
,
m3
,
m2
,
m7
movaps
[
outq
+
32
],
m5
movaps
m4
,
[
outq
+
48
]
BUTTERFLY3
m4
,
m3
,
m2
,
m7
movaps
[
outq
+
48
],
m4
BUTTERFLY3
m6
,
m3
,
m2
,
m7
movaps
[
outq
+
16
],
m6
BUTTERFLY3
m0
,
m3
,
m2
,
m7
movaps
[
outq
+
112
],
m0
; pass 6, no SIMD...
PASS6_AND_PERMUTE
PASS5
PASS6
RET
libavcodec/x86/fmtconvert.asm
浏览文件 @
422b2362
...
...
@@ -95,13 +95,6 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
%macro BUTTERFLYPS 3
movaps
m
%
3
,
m
%
1
unpcklps
m
%
1
,
m
%
2
unpckhps
m
%
3
,
m
%
2
SWAP
%
2
,
%
3
%endmacro
%macro FLOAT_INTERLEAVE6 2
cglobal
float_interleave6_
%
1
,
2
,
7
,
%
2
,
ds
t
,
src
,
src1
,
src2
,
src3
,
src4
,
src5
%ifdef ARCH_X86_64
...
...
@@ -130,9 +123,9 @@ cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5
movaps
m4
,
[
srcq
+
src4q
]
movaps
m5
,
[
srcq
+
src5q
]
BUTTERFLYPS
0
,
1
,
6
BUTTERFLYPS
2
,
3
,
6
BUTTERFLYPS
4
,
5
,
6
S
BUTTERFLYPS
0
,
1
,
6
S
BUTTERFLYPS
2
,
3
,
6
S
BUTTERFLYPS
4
,
5
,
6
movaps
m6
,
m4
shufps
m4
,
m0
,
0xe4
...
...
libavcodec/x86/x86util.asm
浏览文件 @
422b2362
...
...
@@ -41,6 +41,13 @@
SWAP
%
2
,
%
4
,
%
3
%endmacro
%macro SBUTTERFLYPS 3
movaps
m
%
3
,
m
%
1
unpcklps
m
%
1
,
m
%
2
unpckhps
m
%
3
,
m
%
2
SWAP
%
2
,
%
3
%endmacro
%macro TRANSPOSE4x4B 5
SBUTTERFLY
bw
,
%
1
,
%
2
,
%
5
SBUTTERFLY
bw
,
%
3
,
%
4
,
%
5
...
...
@@ -74,6 +81,19 @@
SWAP
%
2
,
%
3
%endmacro
; identical behavior to TRANSPOSE4x4D, but using SSE1 float ops
%macro TRANSPOSE4x4PS 5
SBUTTERFLYPS
%
1
,
%
2
,
%
5
SBUTTERFLYPS
%
3
,
%
4
,
%
5
movaps
m
%
5
,
m
%
1
movlhps
m
%
1
,
m
%
3
movhlps
m
%
3
,
m
%
5
movaps
m
%
5
,
m
%
2
movlhps
m
%
2
,
m
%
4
movhlps
m
%
4
,
m
%
5
SWAP
%
2
,
%
3
%endmacro
%macro TRANSPOSE8x8W 9-11
%ifdef ARCH_X86_64
SBUTTERFLY
wd
,
%
1
,
%
2
,
%
9
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录