define(['mmirf/util/isArray'],
/**
* Utilities for handling position information in pre-/post-processing
* functions before executing grammars/NLU functions.
*
* The position information is meant to trac the input-words' positions, so
* that the returned grammar/NLU etc. results can be mapped to the input-string
* again, e.g. so that it is possible to map
* <pre>
* ~ "match for token at [3, 8]" -> "sub-string [8,16] in input-string"
* </pre>
*
*
* @class
* @public
* @name PositionUtils
* @memberOf mmir.grammar
* @hideconstructor
*
* @see mmir.grammar.GrammarConverter
* @see mmir.grammar.GrammarConverter#addProc
*
* @example
*
* var posUtil = mmir.require('mmirf/positionUtils');
* posUtil.createWordPosPreProc(someFunction, aGrammarConverterInstance);
* ...
*/
function(isArray){
/**
* HELPER create pre-processing function that handles string|Positions argument
*
* @param {Function} preprocFunc the preprocessing function
* @param {any} ctx context for executing the preprocessing function
*
* @returns {Function} wrapper-function for <code>preprocFunc</code> that handles <code>Positions</code> input arguments
*
* @private
* @memberOf mmir.grammar.PositionUtils
*/
function _createPosPreProc (preProcFunc, ctx){
return function(thePhrase, pos){
var str = thePhrase;
if(typeof str === 'object'){
if(!pos){
pos = str.pos;
}
str = str.text;
}
return preProcFunc.call(ctx, str, !!pos);
}
}
/**
* HELPER create pre-processing function that handles string|Positions argument
* where the pre-processing function handles single "words":
* input string is split by whitespaces, and then processed word by word;
* the position information is automatically generated
*
* @param {Function} wordPreprocFunc the preprocessing function that handles single words
* @param {any} ctx context for executing the preprocessing function
* @param {RegExp} [splitRegExp] regular expression for splitting (~ "tokenizing") words
* DEFAULT: <pre>/\s+/g</pre>
*
* @returns {Function} wrapper-function for <code>wordPreprocFunc</code> that handles <code>Positions</code>
* input arguments and tracks position-modifications for <code>wordPreprocFunc</code>
*
* @private
* @memberOf mmir.grammar.PositionUtils
*/
function _createWordPosPreProc(wordProcFunc, ctx, splitRegExp){
var re = splitRegExp || /\s+/g;
return _createPosPreProc(function(str, pos){
var result, m, i = 0;
re.lastIndex = 0;
while((m = re.exec(str))){
result = doProcWord(wordProcFunc, str, result, pos, i, m.index, m[0], ctx);
i = m.index + m[0].length;
}
if(i > 0 && i < str.length){
result = doProcWord(wordProcFunc, str, result, pos, i, str.length, '', ctx);
} else if(i === 0){
result = wordProcFunc(ctx, str, !!pos);
}
return result;
}, ctx);
}
function doProcWord(wordProcFunc, str, result, pos, prev_i, index, match_str, ctx){
var substr = str.substring(prev_i, index);
var res = wordProcFunc.call(ctx, substr, !!pos);
if(pos){
var wordPos = doCalcPos(substr, res);
if(!result){
result = {text: '', pos: []};
}
result.text += res + match_str;
if (wordPos.length > 0){
wordPos.forEach(function(p){
p.i += prev_i;
result.pos.push(p);
});
}
} else {
result = (result? result : '') + res + match_str;
}
return result;
};
function doCalcPos(origStr, newStr){
var l1 = origStr.length;
var l2 = newStr.length;
if(l1 !== l2){
return [{i: 0, mlen: l1, len: l2}];
}
return [];
}
/**
* HELPER re-calculate the positions for 1-n steps of the pre-processing chain,
* so that positions at step i do refer to the positions of the input-string instead of the pre-processed string from step i-1
*
* NOTE positions are changed "in-place"!
*
* @param {PositionsInfo} pos the positions information as processed by the {@link mmir.grammar.GrammarConverter#preproc} function
*
* @private
* @memberOf mmir.grammar.PositionUtils
*/
function _recalcProcPos(pos){
var order = pos._order;
if(isArray(order)){
var size = order.length;
var curr_i = 0;
var next = function(){
var el;
for(var i = curr_i; i < size; ++i){
el = pos[order[i]];
if(isArray(el) && el.length > 0){
curr_i = i + 1;
return el;
}
}
}
var source = next();
if(source){
var sources = [source], len = 1, target = next(), i;
while(target){
for(i=len-1; i >= 0; --i){
_recalcPos(sources[i], target);
}
sources.push(target);
++len;
target = next();
}
}
}
}
/**
* HELPER re-calculate the positions in <code>targetPos</code> according to <code>sourcePos</code>:
* i.e. re-calculate the positions in <code>targetPos</code> so, as if <code>sourcePos</code> had not been applied.
*
* NOTE positions are changed "in-place" in targetPos
*
* @param {Array<Pos>} sourcePos the positions that should be used for re-calculation (e.g. from pre-processig step i-1)
* @param {Array<Pos>} targetPos the positions that should be changed/adjusted (e.g. from pre-processig step i)
*
* @private
* @memberOf mmir.grammar.PositionUtils
*/
function _recalcPos(sourcePos, targetPos){
// console.log('___________masking-input-pos: '+JSON.stringify(sourcePos));
// console.log('___________stopword-input-pos: '+JSON.stringify(targetPos));
//recalculate target positions w.r.t. reverted source positions:
var offset = 0, mi = 0, msize = sourcePos.length;
var spos, tpos, tposend, mlen, sposi, sposend, revertOffset;
for(var i1=0, size1 = targetPos.length; i1 < size1; ++i1){
tpos = targetPos[i1];
for(; mi < msize; ++mi){
//-> loop over source-positions to calculate offset (i.e. adjustment) for tpos...
spos = sourcePos[mi];
sposi = spos.i + offset;
tposend = tpos.i + tpos.mlen;
if(tposend <= sposi){
//if target-entry ends before source-entry starts:
// we already tried all source-entries that could have effected the target-entry
//-> continue with next target-entry
break;
}
mlen = spos.len - spos.mlen;//<- length difference due to modification
offset += mlen;//<- offset for source-entry strings, after modification was applied
sposend = sposi + spos.len;
if(sposend < tpos.i){
//if source-position ends before target-entry even begins:
// offset needs to be applied to target-entry "in full"
// -> continue with next source-entry position,
// in case "more offset" needs to be applied
continue;
}
if(sposi <= tpos.i){
// -> source-position started before or with target-position...
revertOffset = false;
if(sposi >= tpos.i && sposend <= tposend){
//if source-position occurs completely within target-entry:
//adjust target-modification-length
tpos.mlen = tpos.mlen - mlen;
//... end revert index-adjustment (see below)
revertOffset = true;
} else if(sposend >= tposend){
//if target ends before source -> revert index-adjustment (see below)
revertOffset = true;
}
if(revertOffset){
//need to "pre-adjust" index, since offset was already (in this case falsely) adjusted
tpos.i += mlen;
}
} else {
//... otherwise continue with next target-entry
break;
}
}
tpos.i -= offset;
}
// //FIXM DEBUG
// console.log('__RECONST__stopword-input-pos: '+JSON.stringify(targetPos));
// for(var li = 0, lsize = targetPos.length; li < lsize; ++li){
// var lpos = targetPos[li];
// console.log(' '+JSON.stringify(lpos) + ' "'+thePhrase.substring(lpos.i, lpos.i + lpos.mlen)+'"');
// }
// //FIXM DEBUG END
}
/**
* @memberOf mmir.grammar.PositionUtils
*/
return {
/**
* @copydoc ._createPosPreProc
* @public
* @function
* @memberOf mmir.grammar.PositionUtils
*/
createPosPreProc: _createPosPreProc,
/**
* @copydoc ._createWordPosPreProc
* @public
* @function
* @memberOf mmir.grammar.PositionUtils
*/
createWordPosPreProc: _createWordPosPreProc,
/**
* @copydoc ._recalcProcPos
* @public
* @function
* @memberOf mmir.grammar.PositionUtils
*/
recalcProcPos: _recalcProcPos,
/**
* @copydoc ._recalcPos
* @public
* @function
* @memberOf mmir.grammar.PositionUtils
*/
recalcPos: _recalcPos
}
});