MMIR Framework API 7.0.0-beta1


/**
 * The GrammarConverter object initializes the grammar for processing
 * <em>natural language text</em>, e.g. from the voice recognition.
 *
 * @class
 * @name GrammarConverter
 * @memberOf mmir.grammar
 *
 * @requires util/loadFile
 * @requires util/isArray
 * @requires positionUtils
 *
 * @example
 * var GrammarConverter = mmir.require('mmirf/grammarConverter');
 * var gc = new GrammarConverter();
 */
define(['mmirf/util/isArray', 'mmirf/util/loadFile', 'mmirf/positionUtils'], function(isArray, loadFile, posUtil){



/**
 * @ignore
 *
 * @constructs mmir.grammar.GrammarConverter
 */
function GrammarConverter(){

	//source string of the regular expression for detecting encoded chars (see mask/unmask functions)
	this.enc_regexp_str = "~~([0-9|A-F|a-f]{4})~~";

	//grammar definition text (see getGrammarDef() / setGrammarDef())
	this.grammar_definition = "";
	//compiled JavaScript grammar source code (see getGrammarSource() / setGrammarSource())
	this.js_grammar_definition = "";
	//the JSON grammar (see loadGrammar() / setStopWords())
	this.json_grammar_definition = null;

	//RegExp objects for removing stopwords: these are created by parseStopWords(),
	//  i.e. they stay undefined until the stopwords were parsed
	//  (explicitly initialized, so that the fields are always present on the instance)
	this.stop_words_regexp = void(0);
	this.stop_words_regexp_enc = void(0);

	//default setting for masking value Strings in JSON values (see maskJSON() / unmaskJSON)
	this.maskValues = true;
	//default setting for masking property-name Strings in JSON values (see maskJSON() / unmaskJSON)
	// WARNING: this is actually EXPERIMENTAL; it should be set to false, since JS/CC may not be able to handle masked ID names...
	this.maskNames = false;

	//if execution of the grammar is asynchronously done (i.e. result is delivered using a callback)
	this.is_async = false;

	//list of processing steps:
	// {
	//   name: 'processing step ID',
	//   pre: function(input, pos){...},    //OPTIONAL function for pre-processing
	//   post: function(result, pos){...}}  //OPTIONAL function for post-processing
	// }
	this.procList = [];
	//reserved field name in position-objects, that holds the ordered list of applied processing steps (see preproc())
	this.pos_order_field = '_order';
	//register the default processing steps in procList
	this.initDefaultProc();
};

/**
 * Load the JSON grammar from a URL and store it in field json_grammar_definition.
 *
 * @param {Function} [successCallback] invoked as successCallback(grammarConverter, xhr) after the grammar was stored
 * @param {Function} [errorCallback] invoked as errorCallback(grammarConverter) on failure;
 * 				if omitted, the error is printed via console.error
 * @param {String} grammarUrl the URL of the JSON grammar (passed on to loadResource())
 * @param {Boolean} [doLoadSynchronously] passed on to loadResource()
 */
GrammarConverter.prototype.loadGrammar = function(successCallback, errorCallback, grammarUrl, doLoadSynchronously){

	var converter = this;

	//store the loaded grammar, then notify the caller (if requested)
	function onLoaded(data, _status, xhr){
		converter.json_grammar_definition = data;
		if (typeof successCallback != "function") {
			return;
		}
		successCallback.call(this, converter, xhr);
	}

	//notify the caller, or (without callback) print the error
	function onFailed(_xhr, _status, data){
		if (typeof errorCallback == "function") {
			errorCallback.call(this, converter);
			return;
		}
		console.error("failed to load the grammar! error: "+ JSON.stringify(data));
	}

	this.loadResource(onLoaded, onFailed, grammarUrl, doLoadSynchronously);
};

/**
 * Load a JSON resource via loadFile().
 *
 * @param {Function} successCallback passed to loadFile() as success-callback
 * @param {Function} errorCallback passed to loadFile() as error-callback;
 * 				if the URL is missing, it is invoked directly as errorCallback(grammarConverter)
 * @param {String} resourceUrl the URL of the resource
 * @param {Boolean} [doLoadSynchronously]
 * 				the resource is only requested asynchronously, if this is exactly FALSE
 * 				(i.e. if omitted, the "async" option is set to FALSE)
 */
GrammarConverter.prototype.loadResource = function(successCallback, errorCallback, resourceUrl, doLoadSynchronously){

	if(!resourceUrl){
		//nothing to load: report the error and abort
		console.error('GrammarConverter.loadResource: missing URL!');
		if(errorCallback){
			errorCallback.call(this, this);
		}
		return;
	}

	loadFile({
		async: doLoadSynchronously === false,
		dataType: 'json',
		url: resourceUrl,
		success: successCallback,
		error: errorCallback
	});
};

/**
 * Set the stopword list of the JSON grammar and (re-)create the RegExp(s) for stopword removal.
 *
 * If there is no JSON grammar yet, an empty one is created.
 *
 * @param {Array<String>} stopWordArray the list of stopwords
 */
GrammarConverter.prototype.setStopWords = function(stopWordArray){

	this.json_grammar_definition = this.json_grammar_definition || {};

	//parsing the stopwords requires the masked representation of the list
	var maskedList = this.maskJSON(stopWordArray);
	this.json_grammar_definition.stopwords = maskedList;

	this.parseStopWords();

	//store the list un-masked again: using the unmask-function ensures that
	//  masking/unmasking is reversible (if it is not, the stored list will reflect the error)
	var restoredList = this.unmaskJSON(this.json_grammar_definition.stopwords);
	this.json_grammar_definition.stopwords = restoredList;
};

/**
 * Get the stopword list from the JSON grammar.
 *
 * The list is read from field "stopwords"; if that is missing/empty, the
 * deprecated field "stop_word" is used instead (and a warning is printed, if it contains a value).
 *
 * @returns {Array<String>} the stopword list, or NULL if there is no JSON grammar
 * 				(may be undefined, if the grammar does not contain a stopword list)
 */
GrammarConverter.prototype.getStopWords = function(){

	var grammar = this.json_grammar_definition;
	if(!grammar){
		return null;
	}

	var current = grammar.stopwords;
	if(current){
		return current;
	}

	//fallback: deprecated field name
	var legacy = grammar.stop_word;
	if(legacy){
		console.warn('GrammarConverter.getStopWords: using deprecated field stop_word for stopword-list, should use field stopwords instead!');
	}
	return legacy;
};

/**
 * HELPER creates a copy of the stopword list and encodes all non-ASCII chars to their unicode
 *        representation (e.g. for safe storage of the stringified stopword list, even if the
 *        file-encoding does not support non-ASCII letters).
 *
 * The copy is created, because the list is handed to recodeJSON() for recoding, and
 * the stored stopword list should not be modified by this function.
 *
 * @returns {Array<String>} a copy of the stopword list, from the current JSON grammar,
 * 							recoded via recodeJSON() using maskAsUnicode
 * 							(or empty list, if no stopword list is present)
 */
GrammarConverter.prototype.getEncodedStopwords = function(){

	var stopwords = this.getStopWords();

	//recode a (shallow) copy: non-ASCII chars are stored with their Unicode representation
	return stopwords? this.recodeJSON(stopwords.slice(), this.maskAsUnicode) : [];
};

//this is the original / main implementation for creating the RegExp for stopword removal
/**
 * Create the RegExp(s) for stopword removal from the current stopword list (see getStopWords()):
 *
 *  * field stop_words_regexp: matches the "normal" stopwords (using word-boundaries \b);
 *    if there are no such stopwords, it only matches the empty string (i.e. removes nothing)
 *  * field stop_words_regexp_enc: matches stopwords that begin or end with an encoded char;
 *    it is set to undefined, if there are no such stopwords
 *
 * A missing stopword list is treated as empty list, and empty stopword entries are ignored.
 *
 * NOTE the stopwords are inserted as-is into the RegExp source (i.e. they are not escaped).
 */
GrammarConverter.prototype.parseStopWords = function(){

	//create RegExp for stop words:
	//NOTE getStopWords() returns null/undefined, if there is no grammar or no stopword list
	var json_stop_words = this.getStopWords() || [];

	//FIX for encoded chars: if a word begins or ends with an encoded char, \b cannot detect the word's boundaries
	//	-> FIX if we encounter such words, create a separate RegExpr that uses
	//         whitespaces & START-/END-expression for detecting word-boundaries, i.e. something like: (\s|^)(~~ ... words ... ~~)(\s|$)
	//
	//  NOTE: the word-boundaries expression \b seems to have no effect in case of non-ASCII chars in general
	//        (e.g. for Japanese characters / words)
	//        .... so we would need to use this alternative mechanism (e.g. using whitespaces & START-/END-expr.)
	//        even if these characters were not encoded!
	//
	//NOTE the testers must NOT use the global flag: with flag "g", test() continues at the lastIndex
	//     of the previous match, i.e. the result for a word would depend on the previously tested word
	var encStartTester = new RegExp("^" + this.enc_regexp_str      ,"m");
	var encEndTester   = new RegExp(      this.enc_regexp_str + "$","m");
	var isEncWord = function(str){
		return encStartTester.test(str) || encEndTester.test(str);
	};

	var normal_words = [];
	var enc_words = [];
	var stop_word;
	for(var index = 0, size = json_stop_words.length; index < size; ++index){

		stop_word = json_stop_words[index];

		//ignore empty entries: an empty alternative would match at every word-boundary
		//  (and, due to the trailing \s?, would remove the whitespace that follows a word)
		if(stop_word === ''){
			continue;
		}

		//special treatment for words that begin/end with encoded chars:
		if(isEncWord(stop_word)){
			enc_words.push(stop_word);
		} else {
			normal_words.push(stop_word);
		}
	}

	var stop_words;
	if(normal_words.length > 0){

		stop_words =
			"\\b("								//starting at a word-boundary (-> ignore within-word matches)

				+	normal_words.join("|")		//... OR-matching for each stopword

				+ ")"
				+ "\\b"							//... ending with a word-boundary -> avoid "cutting out" matching partial strings
												//    e.g. without \b: '(in)\s?' would match (and cut out all matches) within "winning" -> "wng"

				+ "\\s?";						//... and optionally: one white-character that follows the stopword
	}
	else {
		//for empty stopword definition: match empty string
		//  (basically: remove nothing)
		stop_words = '^$';
	}
	this.stop_words_regexp = new RegExp(stop_words,"igm");	//RegExp options:
															// ignore-case (i),
															// match globally i.e. all occurrences in the String (g),
															// do not stop at line breaks (m)


	//only create RegExp for special stopwords, if we actually have at least 1 of those
	// (otherwise reset the field, so that no RegExp from a previously parsed stopword list remains active)
	//NOTE for replacement, we need to use a space-char (i.e. replace these with spaces, not empty strings: str.replace(..., ' '); )
	if(enc_words.length > 0){
		this.stop_words_regexp_enc = new RegExp("(\\s|^)(" + enc_words.join("|") + ")(\\s|$)","igm");
	} else {
		this.stop_words_regexp_enc = void(0);
	}

};

/**
 * Get the RegExp for removing "normal" stopwords.
 *
 * If the RegExp does not exist yet, it is created first via parseStopWords().
 *
 * @returns {RegExp} the value of field stop_words_regexp
 */
GrammarConverter.prototype.getStopWordsRegExpr = function(){

	if(this.stop_words_regexp){
		return this.stop_words_regexp;
	}

	//not created yet: parse the stopwords now
	this.parseStopWords();
	return this.stop_words_regexp;
};

/**
 * Get the RegExp for stopwords that start or end with encoded chars (i.e. masked non-ASCII chars).
 *
 * If the "normal" stopword RegExp does not exist yet, the stopwords are parsed first via parseStopWords().
 *
 * This RegExp may be NULL/undefined, if no stopwords exist, that begin/end with encoded chars
 * i.e. you need to check for NULL, before trying to use this RegExpr.
 *
 * @returns {RegExp} the value of field stop_words_regexp_enc (may be NULL/undefined)
 *
 * @example
 *
 *  var cleanedStr = someStr;
 *
 *  //remove stopwords that begin/end with encoded chars:
 *  var encRegExpr = gc.getStopWordsEncRegExpr();
 *  if(encRegExpr){
 *  	//NOTE replace these stopwords with spaces (not with empty String as below, ie. with "normal" stopwords)
 *  	cleanedStr = cleanedStr.replace(encRegExpr, ' ');
 *  }
 *
 *  //remove normal stopwords:
 *  cleanedStr = cleanedStr.replace(gc.getStopWordsRegExpr(), '');
 */
GrammarConverter.prototype.getStopWordsEncRegExpr = function(){

	//the special RegExp gets created along with the "normal" stopword RegExp
	var isParsed = !!this.stop_words_regexp;
	if(!isParsed){
		this.parseStopWords();
	}

	return this.stop_words_regexp_enc;
};

/**
 * Get grammar definition text.
 *
 * This is the "source code" input for the grammar compiler
 * (i.e. syntax for jison, PEG.js or JS/CC).
 *
 * The grammar definition text is generated from the JSON grammar.
 *
 * @returns {String} the grammar definition in compiler-specific syntax, i.e. the value of
 * 			field grammar_definition (the constructor initializes this field with an empty String)
 *
 * @see #setGrammarDef
 */
GrammarConverter.prototype.getGrammarDef = function(){
	return  this.grammar_definition;
};

/**
 * Sets the grammar definition text.
 *
 * This function should only be used during compilation of the JSON grammar
 * to the executable grammar.
 *
 * NOTE: Setting this "manually" will have no effect on the executable grammar.
 *
 * @see #getGrammarDef
 * @protected
 *
 * @param {String} rawGrammarSyntax
 * 		the grammar definition in compiler-specific syntax
 */
GrammarConverter.prototype.setGrammarDef = function(rawGrammarSyntax){
	//only stores the text (in field grammar_definition): nothing is compiled or validated here
	this.grammar_definition = rawGrammarSyntax;
};

/**
 * Get the compiled JavaScript grammar source code.
 *
 * This is the output of the grammar compiler (with additional
 * JavaScript "framing" in {@link mmir.SemanticInterpreter#createGrammar}).
 *
 * This needs to be eval'ed before it can be executed (eval() will add
 * the corresponding executable grammar to SemanticInterpreter).
 *
 * @returns {String} the compiled, JavaScript grammar source code, i.e. the value of
 * 			field js_grammar_definition (the constructor initializes this field with an empty String)
 *
 * @see #setGrammarSource
 */
GrammarConverter.prototype.getGrammarSource = function(){
	return  this.js_grammar_definition;
};

/**
 * Set the compiled JavaScript grammar source code.
 *
 * The source code is only stored (in field js_grammar_definition), i.e. it is not
 * evaluated or validated by this function.
 *
 * @param {String} src_code
 * 		the compiled, JavaScript grammar source code
 *
 * @see #getGrammarSource
 */
GrammarConverter.prototype.setGrammarSource = function(src_code){
	this.js_grammar_definition = src_code;
};

/**
 * Set the executable grammar function.
 *
 * The grammar function takes a String argument: the text that should be parsed.
 *                            a Function argument: the callback for the result.
 *                            where the callback itself takes 1 argument for the result: <code>callback(result)</code>
 *
 * The returned result depends on the JSON definition of the grammar:
 * <code>func(inputText, resultCallback)</code>
 *
 *
 * @param {Function} func
 * 			the executable grammar function: <code>func(string, object, function(object)) : object</code>
 * @param {Boolean} [isAsync] OPTIONAL
 * 					set to TRUE, if execution is asynchronously done.
 * 					DEFAULT: FALSE
 *
 * @see #executeGrammar
 * @see #isAsyncExec
 */
GrammarConverter.prototype.setGrammarFunction = function(func, isAsync){
	//coerce to boolean, i.e. an omitted (or any other falsy) value results in FALSE
	this.is_async = !!isAsync;
	//set the function as field on this instance (i.e. it takes precedence over the stub on the prototype)
	this.executeGrammar = func;
};

/**
 * Check if the grammar is executed asynchronously, i.e. if the result of
 * executeGrammar() is delivered via its callback-argument instead of its return value.
 *
 * @returns {Boolean} the value of field is_async
 * 			(initialized with FALSE by the constructor, and set by setGrammarFunction())
 */
GrammarConverter.prototype.isAsyncExec = function(){
	return this.is_async;
};

/**
 * Remove the stopwords from a string, and remove whitespace at its beginning and end.
 *
 * Processing order:
 *  1. if the RegExp for stopwords with encoded chars exists (see getStopWordsEncRegExpr()):
 *     its matches are replaced with a single space
 *  2. matches of the "normal" stopword RegExp (see getStopWordsRegExpr()) are replaced with the empty string
 *  3. whitespace at the end and at the beginning of the string is removed
 *     (if <code>computePositions</code> is set, these removals are tracked as positions, too)
 *
 * @param {String} thePhrase
 * 				the string from which to remove stopwords (and trim()'ed)
 * @param {Boolean} [computePositions] OPTIONAL
 * 				if truthy, the positions of the modifications are tracked and returned along with the string
 * 				DEFAULT: false
 *
 * @returns {String|{text: String, pos: ARRAY<Position>}}
 * 				the string where stopwords were removed, or if <code>computePositions</code> was <code>true</code>
 * 				a result object where the positions at which stopwords were removed will be available as an array:
 * 				<pre>
 * 				{
 * 					text: STRING, // the string with removed stopwords
 * 					pos: [POSITION] // array of positions for removed stopwords (sorted by i): {i: NUMBER, len: NUMBER, mlen: NUMBER}
 * 				}
 * 				</pre>
 * 				where POSITION is an object with
 * 				<pre>
 * 				{
 * 					i: NUMBER, // the index of the match: its offset within the string that was processed,
 * 					           // shifted by the length-differences (mlen - len) of previously tracked
 * 					           // modifications (i.e. intended as offset with regard to thePhrase)
 * 					len: NUMBER, // the length of the replacement (i.e. 1 for stopwords with encoded chars, otherwise 0)
 * 					mlen: NUMBER // the length of the matched (i.e. removed) sub-string
 * 				}
 * 				</pre>
 */
GrammarConverter.prototype.removeStopwords = function(thePhrase, computePositions){

	var stop_words_regexp = this.getStopWordsRegExpr();

	var str = thePhrase;

	//positions are only tracked, if requested (otherwise the list stays undefined)
	var positions = computePositions? [] : void(0);

	var replStr,//<- replacement string used in removeFunc
		appendPos,//<- controls if position-info should be appended or prepended to position-list
		replOffset,//<- global offset (i.e. offset with regard to input string thePhrase)
		iCalc,//<- helper index for calculating offset in modified strings
		calcPos,//<- helper function for calculating offset in modified strings
		replPositions,//<- helper/temporary positions-array for calculating offset in modified strings
		removeFunc;//<- replacement-function that also tracks the positions that were modified (via argument positions)

	if(computePositions){

		//initialize helpers for tracking positions

		replOffset = 0;
		iCalc = 0;
		appendPos = true;

		removeFunc = function(){//HELPER for matched stopwords: log its position and remove it

			//arguments of a replace-function: (match, [group1, group2, ...], offset, string)
			// i.e. the offset of the match is the 2nd to last argument
			var argLen = arguments.length;
			var match = arguments[0];
			var offset = arguments[argLen-2];

			var index = calcPos(offset);

//				//FIXM DEBUG
//				var word = argLen === 4? arguments[1] : (argLen === 6? arguments[2] : 'WHITESPACE');
//				var start = index;
//				var end = start + match.length;
//				var isError = word !== 'WHITESPACE'? thePhrase.substring(start, end).trim() !== word : !/\s+/.test(thePhrase.substring(start, end));
//				console[isError? 'error' : 'log']('matched "'+match+'" -> found stopword "'+word+'" from '+start+' to '+end+ ' -> "'+thePhrase.substring(start, end)+'"');
////				console.log('    stopword-removal: ', arguments);
//				//FIXM DEBUG END

			//mlen: length of the matched text, len: length of its replacement
			if(appendPos){
				positions.push({i: index, mlen: match.length, len: replStr.length});
			} else {
				positions.unshift({i: index, mlen: match.length, len: replStr.length});
			}

			return replStr;
		};

		//HELPER convert an offset within the current (i.e. already modified) string by adding the
		//       length-differences of the previously tracked modifications (replPositions) that are located before it
		//NOTE this helper is stateful (iCalc, replOffset), i.e. it relies on being invoked with
		//     ascending offsets, until its state is reset
		calcPos = function(offset){

			if(!replPositions){
				//no previous modifications: the offset already refers to the input string
				return offset;
			}

			var pos;
			for(var size = replPositions.length; iCalc < size; ++iCalc){
				pos = replPositions[iCalc];
				if(pos.i > offset + replOffset){
					break;
				}
				replOffset += pos.mlen - pos.len;
			}

			return offset + replOffset;
		};
	}

	//1st step: stopwords that begin/end with encoded chars are replaced by a space
	var encoded_stop_words_regexp = this.getStopWordsEncRegExpr();
	replStr = ' ';
	if(encoded_stop_words_regexp){

//		console.log('_______STOPWORD-rem-enc: "'+str+'"');//FIXM DEBUG
		str = str.replace(this.stop_words_regexp_enc, computePositions? removeFunc : replStr);

		if(computePositions){
			//update helper variables for calculating global offset (after string was modified):
			replOffset = 0;
			iCalc = 0;
			replPositions = positions.slice(0);
		}
	}

//	console.log('_______STOPWORD-rem: "'+str+'"');//FIXM DEBUG

	//2nd step: "normal" stopwords are replaced by the empty string
	replStr = '';
	str = str.replace(stop_words_regexp, computePositions? removeFunc : replStr);

	if(computePositions){
		positions.sort(function(a,b){return a.i - b.i;});//<- positions may not be ordered, if encoded_stop_words_regexp was applied
		//update helper variables for calculating global offset (after string was modified):
		replOffset = 0;
		iCalc = 0;
		replPositions = positions.slice(0);
	}

	//3rd step: trim the string
	if(computePositions){

		//trim with tracking of positions
//		console.log('_______STOPWORD-rem-ws: "'+str+'"');//FIXM DEBUG

		replStr = '';
		str = str.replace(/\s+$/, removeFunc);//<- trim at end

		positions.sort(function(a,b){return a.i - b.i;});//<- positions may not be ordered, if words were removed from the end of the string

		//update helper variables for calculating global offset (after string was modified):
		replOffset = 0;
		iCalc = 0;
		replPositions = positions.slice(0);

		appendPos = false;//<- prepending "start-trimming"-position may not be accurate, but should be "nearly" correct (w.r.t. to ordering by index pos.i)

		str = str.replace(/^\s+/, removeFunc);//<- trim at beginning

		positions.sort(function(a,b){return a.i - b.i;});//<- positions may not be ordered, if words were removed from the beginning of the string


//		console.log('_______STOPWORD-positions: "'+JSON.stringify(positions)+'"');//FIXM DEBUG

	} else {
		str = str.trim();
	}

//	console.log(JSON.stringify(str));//FIXM DEBUG

	return computePositions? {text: str, pos: positions} : str;
};

/**
 * Apply pre-processing to the string, before applying the grammar:
 *  * escape (i.e. "mask") non-ASCI characters
 *  * remove stopwords
 *
 * {@link #addProc} can be used to add additional pre-/post-processing steps
 *
 * @param {String} thePhrase
 * @param {PlainObject} [pos] OPTIONAL
 * 				in/out argument: if given, the pre-processor will add fields with information
 * 								 on how the input string <code>thePhrase</code> was modified
 * 				By default the position information for escaped characters and removed stopwords will be added to
 * 				<code>pos.escape</code> (see {@link #maskString} for more details)
 * 				<code>pos.stopwords</code> (see {@link #removeStopwords} for more details)
 * 				And the field <code>pos._order</code> will contain the ordered list of pre-processing steps that where applied
 * 				i.e. the enries correspond to the field names, e.g. by default the list would contain <code>['escape', 'stopwords']</code>
 * @param {Array<ProcessingStep>} [processingSteps] OPTIONAL
 * 				if given, use <code>processingSteps</code> instead of (field) <code>procList</code>
 * 				NOTE positional argument (i.e. must specify <code>pos</code> too)
 *
 *
 * @returns {String} the pre-processed string
 *
 * @see #addProc
 * @see #removeProc
 * @see #getProcIndex
 * @see #procList
 */
GrammarConverter.prototype.preproc = function(thePhrase, pos, processingSteps){

	var steps = processingSteps || this.procList;
	var orderField = this.pos_order_field;
	var current = thePhrase;
	var step, idx, count;

	for(idx = 0, count = steps.length; idx < count; ++idx){

		step = steps[idx];
		if(!step.pre){
			//step has no pre-processing function
			continue;
		}

		//NOTE the result of a step is the input for the next step
		current = step.pre.call(this, current, pos);

		if(!pos || typeof current !== 'object'){
			//no position information requested, or none returned by the step
			continue;
		}

		//log the step in the list of applied steps ...
		if(pos[orderField]){
			pos[orderField].push(step.name);
		} else {
			pos[orderField] = [step.name];
		}
		//... and store its position information by its name
		pos[step.name] = current.pos;
	}

	if(typeof current !== 'object'){
		return current;
	}

	//result-object of the last step: update the collected positions & extract the processed text
	if(pos){
		posUtil.recalcProcPos(pos);
	}
	return current.text;
};

/**
 * Post-processes the result from the applied grammar:
 *  * un-masks non-ASCI characters
 *
 * {@link #addProc} can be used to add additional pre-/post-processing steps
 *
 * @param {SemanticResult} procResult
 * @param {Positions} pos
 * 				the position information (i.e. modifications) of the pre-processing steps
 * @param {Array<ProcessingStep>} [processingSteps] OPTIONAL
 * 				if given, use <code>processingSteps</code> instead of (field) <code>procList</code>
 * 				NOTE positional argument (i.e. must specify <code>pos</code> too)
 *
 * @see #addProc
 * @see #removeProc
 * @see #getProcIndex
 * @see #procList
 */
GrammarConverter.prototype.postproc = function(procResult, pos, processingSteps){

	var steps = processingSteps || this.procList;
	var result = procResult;
	var remaining = steps.length;
	var step;

	//apply the post-processing functions in reverse order (w.r.t. the order of the processing steps)
	while(remaining-- > 0){
		step = steps[remaining];
		if(step.post){
			result = step.post.call(this, result, pos);
		}
	}

	return result;
};

/**
 * Add a pre-/post-processing step, that is run before/after {@link #executeGrammar}
 * (see {@link #preproc} and {@link #postproc}).
 *
 * @param  {ProcessingStep} proc the processing step:
 * 						<pre>
 * 						{
 * 							//the name of the processing step
 * 							name: string,
 * 							//OPTIONAL pre-processing function: pre(input: string | Positions, isCalcPos: boolean)
 * 							pre: Function,
 * 							//OPTIONAL post-processing function: post(result: any, pos: Positions)
 * 							post: Function
 * 						}
 * 						</pre>
 * @param  {Boolean|Number} [isPrepend] OPTIONAL
 * 						if omitted (or FALSY, but not a number): append <code>proc</code> to the processing steps
 * 						if number: insert <code>proc</code> at this index into the processing steps-list
 * 						           (NOTE this includes index <code>0</code>)
 * 						if TRUE: prepend <code>proc</code> to the processing steps
 *
 * @throws {Error} if the name of the processing step equals the reserved field name
 * 						for the list of applied steps (i.e. field <code>pos_order_field</code>)
 *
 * @see #removeProc
 * @see #getProcIndex
 * @see #procList
 * @see mmir.grammar.stemmer
 * @example
 * //positionUtils:
 * var posUtil = mmir.require('mmirf/positionUtils');
 * //stemming function
 * var stemFunc = ...;
 * //add stemming function for pre-processing as first step
 * grammarConverter.addProc({
 *  name: 'stem',
 *  pre: posUtil.createWordPosPreProc(stemFunc, this)
 * }, true);
 */
GrammarConverter.prototype.addProc = function(proc, isPrepend){

	var reservedName = this.pos_order_field;
	if(proc.name === reservedName){
		throw new Error('processing step must not be named "'+reservedName+'"');
	}

	var steps = this.procList;

	//insert at specific index
	if(typeof isPrepend === 'number'){
		steps.splice(isPrepend, 0, proc);
		return;
	}

	//insert at beginning or at end of the list
	if(isPrepend){
		steps.unshift(proc);
	} else {
		steps.push(proc);
	}
};

/**
 * Remove a processing step by its index (within {@link #procList}) or by its name.
 *
 * NOTE: if multiple processing steps with the same name exist, only the first one is removed
 *       (i.e. the one that is found by {@link #getProcIndex})
 *
 * @param  {Number|String} proc the name or index of the processing step that should be removed
 * @return {ProcessingStep} the removed processing step, or undefined,
 *                          if there was no matching processing step
 *
 * @see #addProc
 * @see #getProcIndex
 * @see #procList
 */
GrammarConverter.prototype.removeProc = function(proc){

	var isIndex = typeof proc === 'number';
	var position = isIndex? proc : this.getProcIndex(proc);

	//lookup by name did not find a processing step
	if(!isIndex && position === -1){
		return void(0);
	}

	//NOTE if there is no entry at the position, the list of removed entries is empty, i.e. the result is undefined
	return this.procList.splice(position, 1)[0];
};

/**
 * Get the index of a processing step (within {@link #procList}) by its name.
 *
 * NOTE: if multiple processing steps with the same name exist, the index of the first one
 *       (at or after <code>startIndex</code>) is returned
 *
 * @param  {String} procName the name of the processing step
 * @param  {Number} [startIndex] OPTIONAL start index for searching (DEFAULT: 0)
 * @return {Number} the index of the processing step, or -1, if there is no such processing step
 *
 * @see #addProc
 * @see #removeProc
 * @see #procList
 */
GrammarConverter.prototype.getProcIndex = function(procName, startIndex){

	//NOTE omitted/falsy start index results in 0 (which is also the correct value, if it actually was 0)
	var position = startIndex || 0;
	var count = this.procList.length;

	while(position < count){
		if(this.procList[position].name === procName){
			return position;
		}
		++position;
	}

	return -1;
};

/**
 * initialize default pre- and post-processing steps:
 *
 * * "escape": escape/unescape special characters (see {@link #maskString}, {@link #unmaskString})<br/>
 * * "stopwords": remove stopwords (see {@link #removeStopwords})
 *
 * @private
 * @see #addProc
 * @see #removeProc
 * @see #getProcIndex
 * @see #procList
 */
GrammarConverter.prototype.initDefaultProc = function(){

	//1st step "escape":
	// pre-processing masks the input string, post-processing un-masks the result
	var escapeStep = {
		name: 'escape',
		pre: posUtil.createPosPreProc(this.maskString, this),
		post: function(procResult, _pos){
			return this.unmaskJSON(procResult);
		}
	};
	this.addProc(escapeStep);

	//2nd step "stopwords":
	// pre-processing removes the stopwords (there is no post-processing for this step)
	var stopwordStep = {
		name: 'stopwords',
		pre: posUtil.createPosPreProc(this.removeStopwords, this)
	};
	this.addProc(stopwordStep);
};

/**
 * Execute the grammar.
 *
 * NOTE: do not use directly, but {@link mmir.SemanticInterpreter#interpret} instead,
 * 		since that function applies some pre- and post-processing to the text (stopword removal
 * 		en-/decoding of special characters etc.).
 *
 * @param {String} text
 * 			the text String that should be parse.
 * @param {Object} [options]
 * 			additional parsing options (some grammar engines may support further options)
 * 				options.debug: BOOLEAN enable printing debug information
 * 				options.trace: BOOLEAN | FUNCTION enable printing verbose/tracing information (may not be supported by the grammar engine)
 * @param {Function} [callback]
 * 			if #isAsyncExec is TRUE, then executeGrammar will have no return value, but instead the result
 * 			of the grammar execution is delivered by the <code>callback</code>:
 * 			<pre>function callback(result){ ... }</pre>
 * 			(see also description of <code>return</code> value below)
 * @returns {Object}
 * 			the result of the grammar execution:
 * 			<pre>{phrase: STRING, phrases: OBJECT[], semantic: OBJECT}</pre>
 *
 * 			The property <code>phrase</code> contains the <code>text</code> which was matched (with removed stopwords).
 *
 * 			The property <code>phrases</code> contains the matched <tt>TOKENS</tt> and <tt>UTTERANCES</tt> from
 * 			the JSON definition of the grammar as properties as arrays
 *          (e.g. for 1 matched TOKEN "token": <code>{token: ["the matched text"]}</code>).
 *
 *          The returned property <code>semantic</code> depends on the JSON definition of the grammar.
 *
 *          NOTE: if #isAsyncExec is TRUE, then there will be no return value, but instead the callback
 *                is invoked with the return value.
 *
 */
GrammarConverter.prototype.executeGrammar = function(text, options, callback){
	//default stub on the prototype: it only prints a warning, i.e. it returns nothing and does NOT invoke the callback;
	// the actual grammar function is set on the instance via setGrammarFunction() (which assigns field executeGrammar)
	console.warn('GrammarConverter.executeGrammar: this is only a stub. No grammar implementation set yet, ignoring executeGrammar() with arguments', text, options, callback);
};

//TODO move masking/recoding functions to separate utility module?

/**
 * Masks non-ASCII characters (and control characters) in a string.
 *
 * The characters are masked by replacing them with
 * <code>~~XXXX~~</code>
 * where <code>XXXX</code> is the four digit HEX number of the character code
 * (i.e. of the UTF-16 code unit, as returned by <code>charCodeAt()</code>).
 *
 * Characters in the range <code>' '</code> (space) to <code>'~'</code> and tabs are left unchanged.
 * Line breaks (<code>\r\n</code>, <code>\n\r</code>, <code>\r</code>, <code>\n</code>) are each
 * replaced by a single <code>\n</code> (NOTE these replacements are not tracked in the position information).
 *
 * <p>
 * NOTE that this function is <em>stable</em> with regard to
 * multiple executions:
 *
 * If the function is invoked on the returned String again, the
 * returned String will be the same / unchanged, i.e.
 * maskings (i.e. "~~XXXX~~") will not be masked again.
 * </p>
 * <p>
 * NOTE: currently, the masking pattern cannot be escaped,
 * 		 i.e. if the original String contains a substring
 * 		 that matches the masking pattern, it cannot
 * 		 be escaped, so that the unmask-function
 * 		 will leave it untouched.
 * </p>
 *
 * @param {String} str
 * 				the String to process
 * @param {Boolean} [computePositions] OPTIONAL
 * 				DEFAULT: false
 * 				NOTE if a String is given for this argument, it is interpreted as argument
 * 				     <code>prefix</code> (and the following argument as <code>postfix</code>)
 * @param {String} [prefix] OPTIONAL
 * 				an alternative prefix used for masking, i.e instead of <code>~~</code>
 * 				(ignored, if argument has other type than <code>string</code>)
 * @param {String} [postfix] OPTIONAL
 * 				an alternative postfix used for masking, i.e instead of <code>~~</code>
 * 				(ignored, if argument has other type than <code>string</code>)
 * @returns {String|{text: String, pos: ARRAY<Position>}}
 * 				the masked string, or if <code>computePositions</code> was <code>true</code>
 * 				a result object with
 * 				<pre>
 * 				{
 * 					text: STRING, // the masked string
 * 					pos: [POSITION] // array of masking-positions: {i: NUMBER, len: NUMBER, mlen: NUMBER}
 * 				}
 * 				</pre>
 * 				where POSITION is an object with
 * 				<pre>
 * 				{
 * 					i: NUMBER, // the index of the masked character within the input string
 * 					len: NUMBER, // the length of the masking, i.e. length of prefix + 4 + length of postfix
 * 					mlen: NUMBER // the length of the sub-string that was masked (i.e. always 1 character)
 * 				}
 * 				</pre>
 */
GrammarConverter.prototype.maskString = function (str, computePositions, prefix, postfix) {

	//shift arguments, if invoked as maskString(str, prefix, postfix)
	if(typeof computePositions === 'string'){
		postfix = prefix;
		prefix = computePositions;
		computePositions = false;
	}

	var escStart = typeof prefix  === 'string'? prefix  : '~~';
	var escEnd   = typeof postfix === 'string'? postfix : '~~';

	var chunks = [];
	var positions, maskedLen;//<- will only be used, if computePositions is set
	if(computePositions){
		maskedLen = escStart.length + 4 + escEnd.length;
		positions = [];
	}

	var isLineBreak = function(c){
		return c === '\r' || c === '\n';
	};

	//HEX number of the character code, left-padded with zeros to 4 digits
	var toHex4 = function(c){
		var hex = c.charCodeAt(0).toString(16).toUpperCase();
		while(hex.length < 4){
			hex = '0' + hex;
		}
		return hex;
	};

	var index = 0;
	var current;
	var lookahead = str.charAt(index);
	while(index < str.length){

		current = lookahead;
		lookahead = str.charAt(index + 1);

		if(isLineBreak(current)){
			//handle NEWLINE: treat the 2-char line breaks \r\n and \n\r as one line break
			if(isLineBreak(lookahead) && lookahead !== current){
				++index;
				lookahead = str.charAt(index + 1);
			}
			chunks.push('\n');
		}
		else if(current === '\t' || (current >= ' ' && current <= '~')){
			//handle tabs & normal chars
			chunks.push(current);
		}
		else {
			//handle NON-ASCII (and control chars)
			if(computePositions){
				positions.push({i: index, len: maskedLen, mlen: current.length});
			}
			chunks.push(escStart, toHex4(current), escEnd);
		}

		++index;
	}

	var masked = chunks.join('');
	return computePositions? {text: masked, pos: positions} : masked;
};

/**
 * HELPER that delegates to #maskString for encoding characters as Unicode
 * escape sequences, i.e. <code>\uXXXX</code> where XXXX is the Unicode HEX number.
 *
 * SHORTCUT for calling <code>maskString(str, computePositions, '\\u', '')</code>,
 * i.e. the escape sequence is started with <code>\u</code> and has no end marker.
 *
 * @param {String} str the string for unicode masking
 * @param {Boolean} [computePositions] OPTIONAL
 * 				passed through to <code>maskString</code>: if truthy, position
 * 				information for the masked characters is returned, too
 * 				DEFAULT: false
 * @returns {String|{text: String, pos: Array<Position>}}
 * 				the unicode-masked string, or if <code>computePositions</code> was <code>true</code>
 * 				a result object with
 * 				<pre>
 * 				{
 * 					text: STRING, // the masked string
 * 					pos: [POSITION] // array of masking-positions: {i: NUMBER, len: NUMBER, mlen: NUMBER}
 * 				}
 * 				</pre>
 * 				where POSITION is an object with
 * 				<pre>
 * 				{
 * 					i: NUMBER, // the position in the original string
 * 					len: NUMBER, // the modified length, i.e. the length of the masking string
 * 					mlen: NUMBER // the original length, i.e. the length of the sub-string that got masked
 * 				}
 * 				</pre>
 *
 * @example
 * //for Japanese "下さい" ("please")
 * maskAsUnicode("下さい") // -> "\u4E0B\u3055\u3044"
 *
 * //... and using default masking:
 * maskString("下さい") // -> "~~4E0B~~~~3055~~~~3044~~"
 */
GrammarConverter.prototype.maskAsUnicode = function (str, computePositions) {
	//escape sequence starts with "\u" and has no closing marker
	var escapePrefix = '\\u';
	var escapeSuffix = '';
	return this.maskString(str, computePositions, escapePrefix, escapeSuffix);
};

/**
 * Unmasks <i>masked unicode characters</i> in a string.
 *
 * Masked unicode characters are assumed to have the pattern:
 * <code>~~XXXX~~</code>
 * where <code>XXXX</code> is the four digit unicode HEX number.
 *
 * <p>
 * NOTE that this function is <em>stable</em> with regard to
 * multiple executions, <b>IF</b> the original String <tt>str</tt> did not
 * contain a sub-string that conforms to the encoding pattern
 * (see remark for {@link #maskString}):
 *
 * If the function is invoked on the returned String again, the
 * returned String will be the same, i.e. unchanged.
 * </p>
 *
 * <p>
 * Matches whose first group is not a pure HEX number (e.g. <code>~~||||~~</code>,
 * which the default pattern accepts) are left untouched in the result.
 * </p>
 *
 * @param {String} str
 * 				the string that (possibly) contains masked characters
 * @param {Boolean} [computePositions] OPTIONAL
 * 				if truthy, a result object with position information is returned
 * 				DEFAULT: false
 * @param {RegExp} [detector] OPTIONAL
 * 				an alternative detector-RegExp (may also be given as 2nd argument):
 * 				the RegExp must contain at least one grouping which detects a unicode number (HEX),
 * 				e.g. default detector is <code>~~([0-9|A-F|a-f]{4})~~</code> (note the grouping
 * 				for detecting a 4-digit HEX number within the brackets).
 * 				If the detector is not global, a global copy is used for scanning;
 * 				a global detector is used as is, with its <code>lastIndex</code> reset to 0
 * 				before scanning.
 * @returns {String|{text: String, pos: Array<Position>}}
 * 				the unmasked string, or if <code>computePositions</code> was <code>true</code>
 * 				a result object with
 * 				<pre>
 * 				{
 * 					text: STRING, // the unmasked string
 * 					pos: [POSITION] // array of unmasking-positions: {i: NUMBER, len: NUMBER, mlen: NUMBER}
 * 				}
 * 				</pre>
 * 				where POSITION is an object with
 * 				<pre>
 * 				{
 * 					i: NUMBER, // the index within the original (i.e. masked) string
 * 					len: NUMBER, // the modified length, i.e. the length of the unmasked character
 * 					mlen: NUMBER // the original length, i.e. the length of the masked sub-string that got unmasked
 * 				}
 * 				</pre>
 */
GrammarConverter.prototype.unmaskString = function (str, computePositions, detector) {
	var match, mlen, hex, ch, positions, source = str, result = [], pos = 0, i, len = str.length;

	//shift arguments if necessary
	if(typeof computePositions === 'object'){
		detector = computePositions;
		computePositions = false;
	}

	if(computePositions){
		positions = [];
	}

	//for verifying that the detected group really is a HEX number
	// (non-global RegExp, i.e. test() is stateless)
	var REGEXPR_HEX = /^[0-9A-Fa-f]+$/;

	//RegExpr for: ~~XXXX~~
	// where XXXX is the unicode HEX number: ~~([0-9|A-F|a-f]{4})~~
	var REGEXPR_ESC;
	if(detector){
		if(detector.global){
			REGEXPR_ESC = detector;
		} else {
			//a non-global RegExp never advances its lastIndex, i.e. exec() would
			//return the first match forever -> scan with a global copy instead
			var flags = typeof detector.flags === 'string'? detector.flags : (detector.ignoreCase? 'i' : '') + (detector.multiline? 'm' : '');
			REGEXPR_ESC = new RegExp(detector.source, flags + 'g');
		}
		//always start scanning at the beginning (lastIndex may be left over from previous usage)
		REGEXPR_ESC.lastIndex = 0;
	} else {
		REGEXPR_ESC = new RegExp( this.enc_regexp_str, "igm");
	}

	while((match = REGEXPR_ESC.exec(source))){

		i =  match.index;
		mlen = match[0].length;

		if(mlen === 0){
			//empty match does not advance lastIndex: step forward manually to avoid endless loop
			++REGEXPR_ESC.lastIndex;
			continue;
		}

		hex = match[1];
		if(typeof hex !== 'string' || !REGEXPR_HEX.test(hex)){
			//not a HEX number: do not unmask, the text gets copied unchanged with the next chunk
			continue;
		}

		//add previous:
		if(i > pos){
			result.push(source.substring(pos, i));
		}

		//add matched ESC as UNICODE:
		ch = String.fromCharCode(  parseInt(hex, 16) );
		result.push(ch);

		//update position:
		pos = i + mlen;

		if(computePositions){
			//store position information for the unmasking:
			// i: position in original string
			// len: modified length of the string, i.e. the length of the unmasked string
			// mlen: original length of the string, i.e. the length of the masked string, that will get unmasked
			positions.push({i: i, len: ch.length, mlen: mlen});
		}
	}

	//add remaining text after the last unmasked sequence
	if(pos < len){
		result.push(source.substring(pos));
	}

	if(computePositions){
		return {text: result.join(''), pos: positions};
	}
	return result.join('');
};


/**
 * Masks the Strings of a JSON-like object:
 * delegates to #recodeJSON, using #maskString as recoding function.
 *
 * @param {Object} json
 * 				the JSON-like object
 * @param {Boolean} [isMaskValues] OPTIONAL
 * 				passed through to <code>recodeJSON</code>
 * @param {Boolean} [isMaskNames] OPTIONAL
 * 				passed through to <code>recodeJSON</code>
 * @returns {Object} the result of <code>recodeJSON</code>
 */
GrammarConverter.prototype.maskJSON = function (json, isMaskValues, isMaskNames) {
	var recodeFunc = this.maskString;
	return this.recodeJSON(json, recodeFunc, isMaskValues, isMaskNames);
};

/**
 * Unmasks the Strings of a JSON-like object:
 * delegates to #recodeJSON, using #unmaskString as recoding function.
 *
 * @param {Object} json
 * 				the JSON-like object
 * @param {Boolean} [isMaskValues] OPTIONAL
 * 				passed through to <code>recodeJSON</code>
 * @param {Boolean} [isMaskNames] OPTIONAL
 * 				passed through to <code>recodeJSON</code>
 * @returns {Object} the result of <code>recodeJSON</code>
 */
GrammarConverter.prototype.unmaskJSON = function (json, isMaskValues, isMaskNames) {
	var recodeFunc = this.unmaskString;
	return this.recodeJSON(json, recodeFunc, isMaskValues, isMaskNames);
};

/**
 * Recodes Strings of a JSON-like object.
 *
 * NOTE: the recoding happens "in-place", i.e. the given object itself is modified.
 *       Only String values are recoded; all other values (numbers, booleans,
 *       <code>null</code> etc.) are left as they are.
 *
 * @function
 * @param {Object} json
 * 					the JSON-like object (i.e. PlainObject)
 *
 * @param {Function} recodeFunc
 * 								the "recoding" function for modifying String values:
 * 								 must accept a String argument and return a String
 * 									<code>String recodeFunc(String)</code>.
 * 								For String values, the function is invoked with <code>true</code> as
 * 								2nd argument (<code>computePositions</code>) and may then return
 * 								a result object <code>{text: String, pos: Array}</code> instead of a String.
 * 								The <tt>recodeFunc</tt> function is invoked in context of the GrammarConverter object.
 * 								Example: this.maskString().
 * 								See {@link #maskString}.
 *
 * @param {Boolean} [isMaskValues] OPTIONAL
 * 								 if true, the object's property String values will be processed
 * 								 NOTE: in case this parameter is specified, then <code>recodeFunc</code> must
 * 									   also be specified!
 * 								 DEFAULT: uses property {@link #maskValues}
 * @param {Boolean} [isMaskNames]  OPTIONAL
 * 								 if true, the property names will be processed
 * 								 NOTE: in case this parameter is specified, then <code>recodeFunc</code> and
 * 									   <code>isMaskValues</code> must also be specified!
 * 								 DEFAULT: uses property {@link #maskNames}
 *
 * @returns {Object} the recoded JSON object
 * 					(if <code>json</code> itself is a String and values are processed, the
 * 					 result of <code>recodeFunc(json, true)</code> is returned as is)
 *
 * @requires util/isArray
 */
GrammarConverter.prototype.recodeJSON = (function (isArray) {//<- NOTE this is only the initializer (i.e. see returned function below)

	/**
	 * HELPER safe access to hasOwnProperty (works even if the data object
	 *        has a property with that name or has no prototype)
	 *
	 * @private
	 */
	var hasOwn = Object.prototype.hasOwnProperty;

	/**
	 * HELPER for sorting position objects (ascending by the index of their target token)
	 *
	 * @private
	 */
	var sortPosFunc = function(pos1, pos2){
		return pos1.target.i - pos2.target.i;
	};

	/**
	 * HELPER for setting a recoded string value
	 *
	 * @param {StringResult|String} recodedVal
	 * 				the recoding-result: either a String or
	 * 				<pre>{text: STRING, pos: ARRAY<POSITION>}</pre>
	 *
	 * 				If it is neither, nothing will be done
	 *
	 * @param {String} origVal
	 * 				the original string value (i.e. "un-recoded");
	 * 				if this is not a String, nothing will be done
	 *
	 * @param {Object} obj
	 * 				the parent-object for the recoded string property
	 *
	 * @param {String} pname
	 * 				the property name in the parent-object for the recoded string property
	 *
	 * @param {Array<Position>} [recodedPositions] OPTIONAL
	 * 				if present, the modification information of the recoding will be added to the array
	 * 				The elements of the array:
	 * 				<pre>
	 * 				{
	 * 					target: Token, // the token that was modified/recoded
	 * 					mlen: NUMBER   // the length of the un-modified string (i.e. before recoding)
	 * 				}
	 * 				</pre>
	 * 				where Token:
	 * 				<pre>
	 * 				{
	 * 					i: NUMBER, // the index of the token w.r.t. to the input string
	 * 					tok: STRING, // the (recoded/modified) token
	 * 				}
	 * 				</pre>
	 * @private
	 */
	var setRecodedVal = function(recodedVal, origVal, obj, pname, recodedPositions){

		//only String values get recoded: for all other values (null, numbers, nested
		// objects/arrays that were already processed in-place) there is nothing to set.
		// This also avoids mistaking a nested object that has a "text" property for a recoding-result.
		if(typeof origVal !== 'string'){
			return;
		}

		var recVal;
		if(typeof recodedVal === 'string'){
			recVal = recodedVal;
		} else if(recodedVal && typeof recodedVal.text === 'string'){
			recVal = recodedVal.text;
		}

		//only set, if there was a recoding:
		if(typeof recVal !== 'string'){
			return;
		}

		if(origVal !== recVal){
			//set recoded value
			obj[pname] = recVal;
		}

		//special treatment for token-objects, i.e.
		// {
		//	tok: STRING,
		//	i: NUMBER
		// }
		//
		// -> store some information for recalculating the index, in case tokens were recoded
		if(recodedPositions && pname === 'tok' && typeof obj.i === 'number'){
			recodedPositions.push({target: obj, mlen: origVal.length});
		}
	};

	/**
	 * HELPER for adjusting the index-information in token-objects of an SemanticResult
	 *        (w.r.t. recoded tokens).
	 *
	 * @param {Array} recodedPositions
	 * 			the list with modification information w.r.t. the tokens (as created by setRecodedVal)
	 *
	 * @see #setRecodedVal
	 * @private
	 */
	var recalculatePos = function(recodedPositions){
		if(recodedPositions && recodedPositions.length > 0){

			recodedPositions.sort(sortPosFunc);

			var repos, token;
			//accumulated difference in length (original minus recoded) of all preceding tokens
			var offset = 0;
			for(var i=0, size = recodedPositions.length; i < size; ++i){
				repos = recodedPositions[i];
				token = repos.target;
				token.i -= offset;
				offset += repos.mlen - token.tok.length;
			}
		}
	};

	/**
	 * Recursive processing for an object / recoding a JSON-like object.
	 * NOTE: the recoding happens "in-place", i.e. the object itself is modified
	 *
	 * See doc of recodeJSON() for details w.r.t. the arguments
	 *
	 * NOTE: argument recodedPositions is an internal (OPTIONAL) parameter
	 *       that is used when recoding SemanticResult objects (applied grammar)
	 *
	 * @returns {PlainObject} the object where its string-values are recoded
	 * @private
	 */
	var processJSON = function(obj, recodeFunc, isMaskValues, isMaskNames, recodedPositions){

		//different treatments for: STRING, ARRAY, OBJECT types (and 'REST' type, i.e. all others)
		if(typeof obj === 'string' && isMaskValues){
			//STRING: encode the string
			return recodeFunc.call(this, obj, true);
		}

		if( isArray(obj) ) {
			//ARRAY: process all entries:
			for(var i=0, size = obj.length; i < size; ++i){

				var entry = obj[i];

				var recodedEntry = processJSON.call(this, entry, recodeFunc, isMaskValues, isMaskNames, recodedPositions);
				setRecodedVal(recodedEntry, entry, obj, i, recodedPositions);
			}

			return obj;
		}

		if(obj === null || typeof obj !== 'object') {//NOTE null is typeof object!
			//REST: nothing to recode
			return obj;
		}

		//OBJECT: process all the object's properties (but only, if they are not inherited)

		//take a snapshot of the property names first, since recoding
		// property names adds/removes properties while processing
		var names = [], p;
		for(p in obj){
			if(hasOwn.call(obj, p)){
				names.push(p);
			}
		}

		for(var n=0, nsize = names.length; n < nsize; ++n){

			p = names[n];
			var pv = obj[p];

			//special treatment for token-lists, i.e. elements like:
			//
			// phrases: [
			//	 {
			//		tok: STRING | ARRAY<TOK>,
			//		type: STRING,
			//		i: NUMBER
			//	 },
			//	 ...
			// ]
			//
			// -> create list for storing some information for recalculating the index, in case tokens were recoded
			var isCalcPos = false;
			if(!recodedPositions && p === 'phrases' && typeof pv === 'object' && pv){
				isCalcPos = true;
				recodedPositions = [];
			}

			var pvn = processJSON.call(this, pv, recodeFunc, isMaskValues, isMaskNames, recodedPositions);
			setRecodedVal(pvn, pv, obj, p, recodedPositions);

			if(isCalcPos){
				recalculatePos(recodedPositions);
				recodedPositions = void(0);
			}

			//if the property-name should also be encoded:
			if(isMaskNames){

				//NOTE recodeFunc is invoked without position-computation here, i.e. it may
				//     return a plain String (but a result object is accepted, too)
				var masked = recodeFunc.call(this, p);
				var recodedName;
				if(typeof masked === 'string'){
					recodedName = masked;
				} else if(masked && typeof masked.text === 'string'){
					recodedName = masked.text;
				}

				if(typeof recodedName === 'string' && recodedName !== p){
					obj[recodedName] = obj[p];
					delete obj[p];
				}
			}
		}

		return obj;
	};

	return function (json, recodeFunc, isMaskValues, isMaskNames){
		//evaluate arguments:
		if(typeof isMaskValues === 'undefined'){
			isMaskValues = this.maskValues;
		}
		if(typeof isMaskNames === 'undefined'){
			isMaskNames = this.maskNames;
		}

		return processJSON.call(this, json, recodeFunc, isMaskValues, isMaskNames);
	};

})(isArray);//<- dependency util/isArray


return GrammarConverter;

});//END: define(..., function(){
MMIR Framework

Namespaces

Interfaces

Classes

Modules

Source: semantic/grammarConverter.js