Add support for Unicode-aware regular expressions in Monaco Monarch language definitions

4b00bed1 · Daniel Kelling · a1ed3861 · 4b00bed1 · 4b00bed1 · 4b00bed1
5 changed files
--- a/src/vs/editor/standalone/common/monarch/monarchCommon.ts
+++ b/src/vs/editor/standalone/common/monarch/monarchCommon.ts
@@ -24,6 +24,7 @@ export interface ILexerMin {
 	languageId: string;
 	noThrow: boolean;
 	ignoreCase: boolean;
+	unicode: boolean;
 	usesEmbedded: boolean;
 	defaultToken: string;
 	stateNames: { [stateName: string]: any; };
@@ -34,6 +35,7 @@ export interface ILexer extends ILexerMin {
 	maxStack: number;
 	start: string | null;
 	ignoreCase: boolean;
+	unicode: boolean;
 	tokenPostfix: string;

 	tokenizer: { [stateName: string]: IRule[]; };

--- a/src/vs/editor/standalone/common/monarch/monarchCompile.ts
+++ b/src/vs/editor/standalone/common/monarch/monarchCompile.ts
@@ -79,7 +79,7 @@ function createKeywordMatcher(arr: string[], caseInsensitive: boolean = false):
 // Lexer helpers

 /**
- * Compiles a regular expression string, adding the 'i' flag if 'ignoreCase' is set.
+ * Compiles a regular expression string, adding the 'i' flag if 'ignoreCase' is set, and the 'u' flag if 'unicode' is set.
 * Also replaces @\w+ or sequences with the content of the specified attribute
 */
 function compileRegExp(lexer: monarchCommon.ILexerMin, str: string): RegExp {
@@ -103,7 +103,8 @@ function compileRegExp(lexer: monarchCommon.ILexerMin, str: string): RegExp {
 		});
 	}

-	return new RegExp(str, (lexer.ignoreCase ? 'i' : ''));
+	let flags = (lexer.ignoreCase ? 'i' : '') + (lexer.unicode ? 'u' : '');
+	return new RegExp(str, flags);
 }

 /**
@@ -400,6 +401,7 @@ export function compile(languageId: string, json: IMonarchLanguage): monarchComm
 	// Set standard fields: be defensive about types
 	lexer.start = (typeof json.start === 'string' ? json.start : null);
 	lexer.ignoreCase = bool(json.ignoreCase, false);
+	lexer.unicode = bool(json.unicode, false);

 	lexer.tokenPostfix = string(json.tokenPostfix, '.' + lexer.languageId);
 	lexer.defaultToken = string(json.defaultToken, 'source');
@@ -410,6 +412,7 @@ export function compile(languageId: string, json: IMonarchLanguage): monarchComm
 	let lexerMin: monarchCommon.ILexerMin = <any>json;
 	lexerMin.languageId = languageId;
 	lexerMin.ignoreCase = lexer.ignoreCase;
+	lexerMin.unicode = lexer.unicode;
 	lexerMin.noThrow = lexer.noThrow;
 	lexerMin.usesEmbedded = lexer.usesEmbedded;
 	lexerMin.stateNames = json.tokenizer;

--- a/src/vs/editor/standalone/common/monarch/monarchLexer.ts
+++ b/src/vs/editor/standalone/common/monarch/monarchLexer.ts
@@ -497,7 +497,8 @@ export class MonarchTokenizer implements modes.ITokenizationSupport {
 			let regex = rule.regex;
 			let regexSource = rule.regex.source;
 			if (regexSource.substr(0, 4) === '^(?:' && regexSource.substr(regexSource.length - 1, 1) === ')') {
-				regex = new RegExp(regexSource.substr(4, regexSource.length - 5), regex.ignoreCase ? 'i' : '');
+				let flags = (regex.ignoreCase ? 'i' : '') + (regex.unicode ? 'u' : '');
+				regex = new RegExp(regexSource.substr(4, regexSource.length - 5), flags);
 			}

 			let result = line.search(regex);

--- a/src/vs/editor/standalone/common/monarch/monarchTypes.ts
+++ b/src/vs/editor/standalone/common/monarch/monarchTypes.ts
@@ -21,6 +21,10 @@ export interface IMonarchLanguage {
 	 * is the language case insensitive?
 	 */
 	ignoreCase?: boolean;
+	/**
+	 * is the language unicode-aware? (i.e., /\u{1D306}/)
+	 */
+	unicode?: boolean;
 	/**
 	 * if no match in the tokenizer assign this token class (default 'source')
 	 */

--- a/src/vs/monaco.d.ts
+++ b/src/vs/monaco.d.ts
@@ -6286,6 +6286,10 @@ declare namespace monaco.languages {
 		 * is the language case insensitive?
 		 */
 		ignoreCase?: boolean;
+		/**
+		 * is the language unicode-aware? (i.e., /\u{1D306}/)
+		 */
+		unicode?: boolean;
 		/**
 		 * if no match in the tokenizer assign this token class (default 'source')
 		 */