Merge pull request #114269 from microsoft/alex/fuzzy-score-improvements

Fuzzy score improvements

Merge pull request #114269 from microsoft/alex/fuzzy-score-improvements
Fuzzy score improvements
c4d5b055 · Johannes Rieken · GitHub · d4f993de · cc8c9a22 · c4d5b055
3 changed files
--- a/src/vs/base/common/filters.ts
+++ b/src/vs/base/common/filters.ts
@@ -418,20 +418,28 @@ const _maxLen = 128;

 function initTable() {
 	const table: number[][] = [];
-	const row: number[] = [0];
-	for (let i = 1; i <= _maxLen; i++) {
-		row.push(-i);
+	const row: number[] = [];
+	for (let i = 0; i <= _maxLen; i++) {
+		row[i] = 0;
 	}
 	for (let i = 0; i <= _maxLen; i++) {
-		const thisRow = row.slice(0);
-		thisRow[0] = -i;
-		table.push(thisRow);
+		table.push(row.slice(0));
 	}
 	return table;
 }

+function initArr(maxLen: number) {
+	const row: number[] = [];
+	for (let i = 0; i <= maxLen; i++) {
+		row[i] = 0;
+	}
+	return row;
+}
+
+const _minWordMatchPos = initArr(2 * _maxLen); // min word position for a certain pattern position
+const _maxWordMatchPos = initArr(2 * _maxLen); // max word position for a certain pattern position
+const _diag = initTable(); // the length of a contiguous diagonal match
 const _table = initTable();
-const _scores = initTable();
 const _arrows = <Arrow[][]>initTable();
 const _debug = false;

@@ -460,7 +468,7 @@ function printTables(pattern: string, patternStart: number, word: string, wordSt
 	word = word.substr(wordStart);
 	console.log(printTable(_table, pattern, pattern.length, word, word.length));
 	console.log(printTable(_arrows, pattern, pattern.length, word, word.length));
-	console.log(printTable(_scores, pattern, pattern.length, word, word.length));
+	console.log(printTable(_diag, pattern, pattern.length, word, word.length));
 }

 function isSeparatorAtPos(value: string, index: number): boolean {
@@ -511,9 +519,13 @@ function isUpperCaseAtPos(pos: number, word: string, wordLow: string): boolean {
 	return word[pos] !== wordLow[pos];
 }

-export function isPatternInWord(patternLow: string, patternPos: number, patternLen: number, wordLow: string, wordPos: number, wordLen: number): boolean {
+export function isPatternInWord(patternLow: string, patternPos: number, patternLen: number, wordLow: string, wordPos: number, wordLen: number, fillMinWordPosArr = false): boolean {
 	while (patternPos < patternLen && wordPos < wordLen) {
 		if (patternLow[patternPos] === wordLow[wordPos]) {
+			if (fillMinWordPosArr) {
+				// Remember the min word position for each pattern position
+				_minWordMatchPos[patternPos] = wordPos;
+			}
 			patternPos += 1;
 		}
 		wordPos += 1;
@@ -521,7 +533,7 @@ export function isPatternInWord(patternLow: string, patternPos: number, patternL
 	return patternPos === patternLen; // pattern must be exhausted
 }

-const enum Arrow { Top = 0b1, Diag = 0b10, Left = 0b100 }
+const enum Arrow { Diag = 1, Left = 2, LeftLeft = 3 }

 /**
 * A tuple of three values.
@@ -558,10 +570,24 @@ export function fuzzyScore(pattern: string, patternLow: string, patternStart: nu
 	// Run a simple check if the characters of pattern occur
 	// (in order) at all in word. If that isn't the case we
 	// stop because no match will be possible
-	if (!isPatternInWord(patternLow, patternStart, patternLen, wordLow, wordStart, wordLen)) {
+	if (!isPatternInWord(patternLow, patternStart, patternLen, wordLow, wordStart, wordLen, true)) {
 		return undefined;
 	}

+	// Find the max matching word position for each pattern position
+	// NOTE: the min matching word position was filled in above, in the `isPatternInWord` call
+	{
+		let patternPos = patternLen - 1;
+		let wordPos = wordLen - 1;
+		while (patternPos >= patternStart && wordPos >= wordStart) {
+			if (patternLow[patternPos] === wordLow[wordPos]) {
+				_maxWordMatchPos[patternPos] = wordPos;
+				patternPos--;
+			}
+			wordPos--;
+		}
+	}
+
 	let row: number = 1;
 	let column: number = 1;
 	let patternPos = patternStart;
@@ -572,45 +598,80 @@ export function fuzzyScore(pattern: string, patternLow: string, patternStart: nu
 	// There will be a match, fill in tables
 	for (row = 1, patternPos = patternStart; patternPos < patternLen; row++, patternPos++) {

-		for (column = 1, wordPos = wordStart; wordPos < wordLen; column++, wordPos++) {
+		// Reduce search space to possible matching word positions and to possible access from next row
+		const minWordMatchPos = _minWordMatchPos[patternPos];
+		const maxWordMatchPos = _maxWordMatchPos[patternPos];
+		const nextMaxWordMatchPos = (patternPos + 1 < patternLen ? _maxWordMatchPos[patternPos + 1] : wordLen);
+
+		for (column = minWordMatchPos - wordStart + 1, wordPos = minWordMatchPos; wordPos < nextMaxWordMatchPos; column++, wordPos++) {

-			const score = _doScore(pattern, patternLow, patternPos, patternStart, word, wordLow, wordPos);
+			const score = (wordPos > maxWordMatchPos ? -1 : _doScore(pattern, patternLow, patternPos, patternStart, word, wordLow, wordPos));

 			if (patternPos === patternStart && score > 1) {
 				hasStrongFirstMatch = true;
 			}

-			_scores[row][column] = score;
+			const canComeDiag = (score > 0);
+			let diagScore = 0;
+			if (canComeDiag) {
+				diagScore = score;

-			const diag = _table[row - 1][column - 1] + (score > 1 ? 1 : score);
-			const top = _table[row - 1][column] + -1;
-			const left = _table[row][column - 1] + -1;
+				// Having a gap in the word match is penalized less if the gap occurs around natural boundaries e.g. aA, _a, .a
+				const isNaturalGapLocation = (
+					isUpperCaseAtPos(wordPos, word, wordLow)
+					|| isSeparatorAtPos(wordLow, wordPos - 1)
+					|| isWhitespaceAtPos(wordLow, wordPos - 1)
+				);

-			if (left >= top) {
-				// left or diag
-				if (left > diag) {
-					_table[row][column] = left;
-					_arrows[row][column] = Arrow.Left;
-				} else if (left === diag) {
-					_table[row][column] = left;
-					_arrows[row][column] = Arrow.Left | Arrow.Diag;
+				if (row === 1) {
+					// first character in pattern
+					if (column > 1) {
+						// the first pattern character would match a word character that is not at the word start
+						// so introduce a penalty to account for the gap preceding this match
+						diagScore += -5 + (isNaturalGapLocation ? 2 : 0);
+					}
 				} else {
-					_table[row][column] = diag;
-					_arrows[row][column] = Arrow.Diag;
+					// column is guaranteed to be > 1 because we must have consumed at least one word character with the first row
+					diagScore += _table[row - 1][column - 1];
+					if (_diag[row - 1][column - 1] === 0) {
+						// this would be the beginning of a new match (i.e. there would be a gap before this location)
+						diagScore += (isNaturalGapLocation ? 2 : 0);
+					} else {
+						// this is part of a contiguous match, so give it a slight bonus, but do so only if it would not be a preferred gap location
+						diagScore += (isNaturalGapLocation ? 0 : 1);
+					}
 				}
-			} else {
-				// top or diag
-				if (top > diag) {
-					_table[row][column] = top;
-					_arrows[row][column] = Arrow.Top;
-				} else if (top === diag) {
-					_table[row][column] = top;
-					_arrows[row][column] = Arrow.Top | Arrow.Diag;
-				} else {
-					_table[row][column] = diag;
-					_arrows[row][column] = Arrow.Diag;
+
+				if (wordPos + 1 === wordLen) {
+					// we always penalize gaps, but this gives unfair advantages to a match that would match the last character in the word
+					// so pretend there is a gap after the last character in the word to normalize things
+					diagScore += -5 + (isNaturalGapLocation ? 2 : 0);
 				}
 			}
+
+			const canComeLeft = (wordPos > minWordMatchPos);
+			const leftScore = (canComeLeft ? _table[row][column - 1] + (_diag[row][column - 1] > 0 ? -5 : 0) : 0); // penalty for a gap start
+
+			const canComeLeftLeft = (wordPos > minWordMatchPos + 1 && _diag[row][column - 1] > 0);
+			const leftLeftScore = (canComeLeftLeft ? _table[row][column - 2] + (_diag[row][column - 2] > 0 ? -5 : 0) : 0); // penalty for a gap start
+
+			if (canComeLeftLeft && (!canComeLeft || leftLeftScore >= leftScore) && (!canComeDiag || leftLeftScore >= diagScore)) {
+				// always prefer choosing left left to jump over a diagonal because that means a match is earlier in the word
+				_table[row][column] = leftLeftScore;
+				_arrows[row][column] = Arrow.LeftLeft;
+				_diag[row][column] = 0;
+			} else if (canComeLeft && (!canComeDiag || leftScore >= diagScore)) {
+				// always prefer choosing left since that means a match is earlier in the word
+				_table[row][column] = leftScore;
+				_arrows[row][column] = Arrow.Left;
+				_diag[row][column] = 0;
+			} else if (canComeDiag) {
+				_table[row][column] = diagScore;
+				_arrows[row][column] = Arrow.Diag;
+				_diag[row][column] = _diag[row - 1][column - 1] + 1;
+			} else {
+				throw new Error(`not possible`);
+			}
 		}
 	}

@@ -622,17 +683,68 @@ export function fuzzyScore(pattern: string, patternLow: string, patternStart: nu
 		return undefined;
 	}

-	_matchesCount = 0;
-	_topScore = -100;
-	_wordStart = wordStart;
-	_firstMatchCanBeWeak = firstMatchCanBeWeak;
+	row--;
+	column--;

-	_findAllMatches2(row - 1, column - 1, patternLen === wordLen ? 1 : 0, 0, false);
-	if (_matchesCount === 0) {
-		return undefined;
+	const topScore = _table[row][column];
+	let matches = 0;
+	let backwardsDiagLength = 0;
+	let maxMatchColumn = 0;
+
+	while (row >= 1) {
+		// Find the column where we go diagonally up
+		let diagColumn = column;
+		do {
+			const arrow = _arrows[row][diagColumn];
+			if (arrow === Arrow.LeftLeft) {
+				diagColumn = diagColumn - 2;
+			} else if (arrow === Arrow.Left) {
+				diagColumn = diagColumn - 1;
+			} else {
+				// found the diagonal
+				break;
+			}
+		} while (diagColumn >= 1);
+
+		// Overturn the "forwards" decision if keeping the "backwards" diagonal would give a better match
+		if (
+			backwardsDiagLength > 1 // only if we would have a contiguous match of 3 characters
+			&& patternLow[patternStart + row - 1] === wordLow[wordStart + column - 1] // only if we can do a contiguous match diagonally
+			&& !isUpperCaseAtPos(diagColumn + wordStart - 1, word, wordLow) // only if the diagonal chosen by the "forwards" pass is not at an uppercase character
+			&& backwardsDiagLength + 1 > _diag[row][diagColumn] // only if our contiguous match would be longer than the "forwards" contiguous match
+		) {
+			diagColumn = column;
+		}
+
+		if (diagColumn === column) {
+			// this is a contiguous match
+			backwardsDiagLength++;
+		} else {
+			backwardsDiagLength = 1;
+		}
+
+		if (!maxMatchColumn) {
+			// remember the last matched column
+			maxMatchColumn = diagColumn;
+		}
+
+		row--;
+		column = diagColumn - 1;
+		matches += 2 ** (column + wordStart);
+	}
+
+	let finalScore = topScore;
+	if (wordLen === patternLen) {
+		// the word matches the pattern with all characters!
+		// giving the score a total match boost (to come up ahead of other words)
+		finalScore += 2;
 	}

-	return [_topScore, _topMatch2, wordStart];
+	// Add 1 penalty for each skipped character in the word
+	const skippedCharsCount = maxMatchColumn - patternLen;
+	finalScore -= skippedCharsCount;
+
+	return [finalScore, matches, wordStart];
 }

 function _doScore(pattern: string, patternLow: string, patternPos: number, patternStart: number, word: string, wordLow: string, wordPos: number) {
@@ -670,94 +782,6 @@ function _doScore(pattern: string, patternLow: string, patternPos: number, patte
 	}
 }

-let _matchesCount: number = 0;
-let _topMatch2: number = 0;
-let _topScore: number = 0;
-let _wordStart: number = 0;
-let _firstMatchCanBeWeak: boolean = false;
-
-function _findAllMatches2(row: number, column: number, total: number, matches: number, lastMatched: boolean): void {
-
-	if (_matchesCount >= 10 || total < -25) {
-		// stop when having already 10 results, or
-		// when a potential alignment as already 5 gaps
-		return;
-	}
-
-	let simpleMatchCount = 0;
-
-	while (row > 0 && column > 0) {
-
-		const score = _scores[row][column];
-		const arrow = _arrows[row][column];
-
-		if (arrow === Arrow.Left) {
-			// left -> no match, skip a word character
-			column -= 1;
-			if (lastMatched) {
-				total -= 5; // new gap penalty
-			} else if (matches !== 0) {
-				total -= 1; // gap penalty after first match
-			}
-			lastMatched = false;
-			simpleMatchCount = 0;
-
-		} else if (arrow & Arrow.Diag) {
-
-			if (arrow & Arrow.Left) {
-				// left
-				_findAllMatches2(
-					row,
-					column - 1,
-					matches !== 0 ? total - 1 : total, // gap penalty after first match
-					matches,
-					lastMatched
-				);
-			}
-
-			// diag
-			total += score;
-			row -= 1;
-			column -= 1;
-			lastMatched = true;
-
-			// match -> set a 1 at the word pos
-			matches += 2 ** (column + _wordStart);
-
-			// count simple matches and boost a row of
-			// simple matches when they yield in a
-			// strong match.
-			if (score === 1) {
-				simpleMatchCount += 1;
-
-				if (row === 0 && !_firstMatchCanBeWeak) {
-					// when the first match is a weak
-					// match we discard it
-					return undefined;
-				}
-
-			} else {
-				// boost
-				total += 1 + (simpleMatchCount * (score - 1));
-				simpleMatchCount = 0;
-			}
-
-		} else {
-			return undefined;
-		}
-	}
-
-	total -= column >= 3 ? 9 : column * 3; // late start penalty
-
-	// dynamically keep track of the current top score
-	// and insert the current best score at head, the rest at tail
-	_matchesCount += 1;
-	if (total > _topScore) {
-		_topScore = total;
-		_topMatch2 = matches;
-	}
-}
-
 //#endregion



--- a/src/vs/base/test/common/filters.perf.test.ts
+++ b/src/vs/base/test/common/filters.perf.test.ts
@@ -3,7 +3,7 @@
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/
 import * as filters from 'vs/base/common/filters';
-import { data } from './filters.perf.data';
+import { data } from 'vs/base/test/common/filters.perf.data';

 const patterns = ['cci', 'ida', 'pos', 'CCI', 'enbled', 'callback', 'gGame', 'cons', 'zyx', 'aBc'];


--- a/src/vs/base/test/common/filters.test.ts
+++ b/src/vs/base/test/common/filters.test.ts
@@ -285,7 +285,7 @@ suite('Filters', () => {
 		assertMatches('LLLL', 'SVisualLoggerLogsList', undefined, fuzzyScore);
 		assertMatches('TEdit', 'TextEdit', '^Text^E^d^i^t', fuzzyScore);
 		assertMatches('TEdit', 'TextEditor', '^Text^E^d^i^tor', fuzzyScore);
-		assertMatches('TEdit', 'Textedit', '^T^exte^d^i^t', fuzzyScore);
+		assertMatches('TEdit', 'Textedit', '^Text^e^d^i^t', fuzzyScore);
 		assertMatches('TEdit', 'text_edit', '^text_^e^d^i^t', fuzzyScore);
 		assertMatches('TEditDit', 'TextEditorDecorationType', '^Text^E^d^i^tor^Decorat^ion^Type', fuzzyScore);
 		assertMatches('TEdit', 'TextEditorDecorationType', '^Text^E^d^i^torDecorationType', fuzzyScore);