match.asm 10.5 KB
Newer Older
M
Mark Adler 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408

; match.asm -- Pentium-Pro optimized version of longest_match()
;
; Updated for zlib 1.1.3 and converted to MASM 6.1x
; Copyright (C) 2000 Dan Higdon <hdan@kinesoft.com>
;                    and Chuck Walbourn <chuckw@kinesoft.com>
; Corrections by Cosmin Truta <cosmint@cs.ubbcluj.ro>
;
; This is free software; you can redistribute it and/or modify it
; under the terms of the GNU General Public License.

; Based on match.S
; Written for zlib 1.1.2
; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>

	.686P
	.MODEL	FLAT

;===========================================================================
; EQUATES
;===========================================================================

MAX_MATCH	EQU 258
MIN_MATCH	EQU 3
MIN_LOOKAHEAD	EQU (MAX_MATCH + MIN_MATCH + 1)
MAX_MATCH_8	EQU ((MAX_MATCH + 7) AND (NOT 7))

;===========================================================================
; STRUCTURES
;===========================================================================

; This STRUCT assumes a 4-byte alignment

DEFLATE_STATE	STRUCT
ds_strm			dd ?
ds_status		dd ?
ds_pending_buf		dd ?
ds_pending_buf_size	dd ?
ds_pending_out		dd ?
ds_pending		dd ?
ds_wrap			dd ?
ds_data_type		db ?
ds_method		db ?
			db ?	; padding
			db ?	; padding
ds_last_flush		dd ?
ds_w_size		dd ?	; used
ds_w_bits		dd ?
ds_w_mask		dd ?	; used
ds_window		dd ?	; used
ds_window_size		dd ?
ds_prev			dd ?	; used
ds_head			dd ?
ds_ins_h		dd ?
ds_hash_size		dd ?
ds_hash_bits		dd ?
ds_hash_mask		dd ?
ds_hash_shift		dd ?
ds_block_start		dd ?
ds_match_length		dd ?	; used
ds_prev_match		dd ?	; used
ds_match_available	dd ?
ds_strstart		dd ?	; used
ds_match_start		dd ?	; used
ds_lookahead		dd ?	; used
ds_prev_length		dd ?	; used
ds_max_chain_length	dd ?	; used
ds_max_laxy_match	dd ?
ds_level		dd ?
ds_strategy		dd ?
ds_good_match		dd ?	; used
ds_nice_match		dd ?	; used

; Don't need anymore of the struct for match
DEFLATE_STATE	ENDS

;===========================================================================
; CODE
;===========================================================================
_TEXT	SEGMENT

;---------------------------------------------------------------------------
; match_init
;---------------------------------------------------------------------------
	ALIGN	4
PUBLIC	_match_init
_match_init	PROC
	; no initialization needed
	ret
_match_init	ENDP

;---------------------------------------------------------------------------
; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
;---------------------------------------------------------------------------
	ALIGN	4

PUBLIC	_longest_match
_longest_match	PROC

; Since this code uses EBP for a scratch register, the stack frame must
; be manually constructed and referenced relative to the ESP register.

; Stack image
; Variables
chainlenwmask	=  0	; high word: current chain len
			; low word: s->wmask
window		=  4	; local copy of s->window
windowbestlen	=  8	; s->window + bestlen
scanend		= 12	; last two bytes of string
scanstart	= 16	; first two bytes of string
scanalign	= 20	; dword-misalignment of string
nicematch	= 24	; a good enough match size
bestlen		= 28	; size of best match so far
scan		= 32	; ptr to string wanting match
varsize		= 36	; number of bytes (also offset to last saved register)

; Saved Registers (actually pushed into place)
ebx_save	= 36
edi_save	= 40
esi_save	= 44
ebp_save	= 48

; Parameters
retaddr		= 52
deflatestate	= 56
curmatch	= 60

; Save registers that the compiler may be using
	push	ebp
	push	edi
	push	esi
	push	ebx

; Allocate local variable space
	sub	esp,varsize

; Retrieve the function arguments. ecx will hold cur_match
; throughout the entire function. edx will hold the pointer to the
; deflate_state structure during the function's setup (before
; entering the main loop).

	mov	edx, [esp+deflatestate]
ASSUME	edx:PTR DEFLATE_STATE

	mov	ecx, [esp+curmatch]

; uInt wmask = s->w_mask;
; unsigned chain_length = s->max_chain_length;
; if (s->prev_length >= s->good_match) {
;     chain_length >>= 2;
; }

	mov	eax, [edx].ds_prev_length
	mov	ebx, [edx].ds_good_match
	cmp	eax, ebx
	mov	eax, [edx].ds_w_mask
	mov	ebx, [edx].ds_max_chain_length
	jl	SHORT LastMatchGood
	shr	ebx, 2
LastMatchGood:

; chainlen is decremented once beforehand so that the function can
; use the sign flag instead of the zero flag for the exit test.
; It is then shifted into the high word, to make room for the wmask
; value, which it will always accompany.

	dec	ebx
	shl	ebx, 16
	or	ebx, eax
	mov	[esp+chainlenwmask], ebx

; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;

	mov	eax, [edx].ds_nice_match
	mov	ebx, [edx].ds_lookahead
	cmp	ebx, eax
	jl	SHORT LookaheadLess
	mov	ebx, eax
LookaheadLess:
	mov	[esp+nicematch], ebx

;/* register Bytef *scan = s->window + s->strstart;                     */

	mov	esi, [edx].ds_window
	mov	[esp+window], esi
	mov	ebp, [edx].ds_strstart
	lea	edi, [esi+ebp]
	mov	[esp+scan],edi

;/* Determine how many bytes the scan ptr is off from being             */
;/* dword-aligned.                                                      */

	mov	eax, edi
	neg	eax
	and	eax, 3
	mov	[esp+scanalign], eax

;/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ?                      */
;/*     s->strstart - (IPos)MAX_DIST(s) : NIL;                          */

	mov	eax, [edx].ds_w_size
	sub	eax, MIN_LOOKAHEAD
	sub	ebp, eax
	jg	SHORT LimitPositive
	xor	ebp, ebp
LimitPositive:

;/* int best_len = s->prev_length;                                      */

	mov	eax, [edx].ds_prev_length
	mov	[esp+bestlen], eax

;/* Store the sum of s->window + best_len in %esi locally, and in %esi. */

	add	esi, eax
	mov	[esp+windowbestlen], esi

;/* register ush scan_start = *(ushf*)scan;                             */
;/* register ush scan_end   = *(ushf*)(scan+best_len-1);                */
;/* Posf *prev = s->prev;                                               */

	movzx	ebx, WORD PTR[edi]
	mov	[esp+scanstart], ebx
	movzx	ebx, WORD PTR[eax+edi-1]
	mov	[esp+scanend], ebx
	mov	edi, [edx].ds_prev

;/* Jump into the main loop.                                            */

	mov	edx, [esp+chainlenwmask]
	jmp	SHORT LoopEntry

;/* do {
; *     match = s->window + cur_match;
; *     if (*(ushf*)(match+best_len-1) != scan_end ||
; *         *(ushf*)match != scan_start) continue;
; *     [...]
; * } while ((cur_match = prev[cur_match & wmask]) > limit
; *          && --chain_length != 0);
; *
; * Here is the inner loop of the function. The function will spend the
; * majority of its time in this loop, and majority of that time will
; * be spent in the first ten instructions.
; *
; * Within this loop:
; * %ebx = scanend
; * %ecx = curmatch
; * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
; * %esi = windowbestlen - i.e., (window + bestlen)
; * %edi = prev
; * %ebp = limit
; */

	ALIGN	4
LookupLoop:
	and	ecx, edx
	movzx	ecx, WORD PTR[edi+ecx*2]
	cmp	ecx, ebp
	jbe	LeaveNow
	sub	edx, 000010000H
	js	LeaveNow

LoopEntry:
	movzx	eax, WORD PTR[esi+ecx-1]
	cmp	eax, ebx
	jnz	SHORT LookupLoop

	mov	eax, [esp+window]
	movzx	eax, WORD PTR[eax+ecx]
	cmp	eax, [esp+scanstart]
	jnz	SHORT LookupLoop

;/* Store the current value of chainlen.                                */

	mov	[esp+chainlenwmask], edx

;/* Point %edi to the string under scrutiny, and %esi to the string we  */
;/* are hoping to match it up with. In actuality, %esi and %edi are     */
;/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is     */
;/* initialized to -(MAX_MATCH_8 - scanalign).                          */

	mov	esi, [esp+window]
	mov	edi, [esp+scan]
	add	esi, ecx
	mov	eax, [esp+scanalign]
	mov	edx, -MAX_MATCH_8
	lea	edi, [edi+eax+MAX_MATCH_8]
	lea	esi, [esi+eax+MAX_MATCH_8]

;/* Test the strings for equality, 8 bytes at a time. At the end,
; * adjust %edx so that it is offset to the exact byte that mismatched.
; *
; * We already know at this point that the first three bytes of the
; * strings match each other, and they can be safely passed over before
; * starting the compare loop. So what this code does is skip over 0-3
; * bytes, as much as necessary in order to dword-align the %edi
; * pointer. (%esi will still be misaligned three times out of four.)
; *
; * It should be confessed that this loop usually does not represent
; * much of the total running time. Replacing it with a more
; * straightforward "rep cmpsb" would not drastically degrade
; * performance.
; */

LoopCmps:
	mov	eax, DWORD PTR[esi+edx]
	xor	eax, DWORD PTR[edi+edx]
	jnz	SHORT LeaveLoopCmps

	mov	eax, DWORD PTR[esi+edx+4]
	xor	eax, DWORD PTR[edi+edx+4]
	jnz	SHORT LeaveLoopCmps4

	add	edx, 8
	jnz	SHORT LoopCmps
	jmp	LenMaximum
	ALIGN	4

LeaveLoopCmps4:
	add	edx, 4

LeaveLoopCmps:
	test	eax, 00000FFFFH
	jnz	SHORT LenLower

	add	edx, 2
	shr	eax, 16

LenLower:
	sub	al, 1
	adc	edx, 0

;/* Calculate the length of the match. If it is longer than MAX_MATCH,  */
;/* then automatically accept it as the best possible match and leave.  */

	lea	eax, [edi+edx]
	mov	edi, [esp+scan]
	sub	eax, edi
	cmp	eax, MAX_MATCH
	jge	SHORT LenMaximum

;/* If the length of the match is not longer than the best match we     */
;/* have so far, then forget it and return to the lookup loop.          */

	mov	edx, [esp+deflatestate]
	mov	ebx, [esp+bestlen]
	cmp	eax, ebx
	jg	SHORT LongerMatch
	mov	esi, [esp+windowbestlen]
	mov	edi, [edx].ds_prev
	mov	ebx, [esp+scanend]
	mov	edx, [esp+chainlenwmask]
	jmp	LookupLoop
	ALIGN	4

;/*         s->match_start = cur_match;                                 */
;/*         best_len = len;                                             */
;/*         if (len >= nice_match) break;                               */
;/*         scan_end = *(ushf*)(scan+best_len-1);                       */

LongerMatch:
	mov	ebx, [esp+nicematch]
	mov	[esp+bestlen], eax
	mov	[edx].ds_match_start, ecx
	cmp	eax, ebx
	jge	SHORT LeaveNow
	mov	esi, [esp+window]
	add	esi, eax
	mov	[esp+windowbestlen], esi
	movzx	ebx, WORD PTR[edi+eax-1]
	mov	edi, [edx].ds_prev
	mov	[esp+scanend], ebx
	mov	edx, [esp+chainlenwmask]
	jmp	LookupLoop
	ALIGN	4

;/* Accept the current string, with the maximum possible length.        */

LenMaximum:
	mov	edx, [esp+deflatestate]
	mov	DWORD PTR[esp+bestlen], MAX_MATCH
	mov	[edx].ds_match_start, ecx

;/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len;          */
;/* return s->lookahead;                                                */

LeaveNow:
	mov	edx, [esp+deflatestate]
	mov	ebx, [esp+bestlen]
	mov	eax, [edx].ds_lookahead
	cmp	ebx, eax
	jg	SHORT LookaheadRet
	mov	eax, ebx
LookaheadRet:

; Restore the stack and return from whence we came.

	add	esp, varsize
	pop	ebx
	pop	esi
	pop	edi
	pop	ebp
	ret

_longest_match	ENDP

_TEXT	ENDS
END