Source: semantic/positionUtils.js

  1. define(['mmirf/util/isArray'],
  2. /**
  3. * Utilities for handling position information in pre-/post-processing
  4. * functions before executing grammars/NLU functions.
  5. *
  6. * The position information is meant to track the input-words' positions, so
  7. * that the returned grammar/NLU etc. results can be mapped to the input-string
  8. * again, e.g. so that it is possible to map
  9. * <pre>
  10. * ~ "match for token at [3, 8]" -> "sub-string [8,16] in input-string"
  11. * </pre>
  12. *
  13. *
  14. * @class
  15. * @public
  16. * @name PositionUtils
  17. * @memberOf mmir.grammar
  18. * @hideconstructor
  19. *
  20. * @see mmir.grammar.GrammarConverter
  21. * @see mmir.grammar.GrammarConverter#addProc
  22. *
  23. * @example
  24. *
  25. * var posUtil = mmir.require('mmirf/positionUtils');
  26. * posUtil.createWordPosPreProc(someFunction, aGrammarConverterInstance);
  27. * ...
  28. */
  29. function(isArray){
  /**
   * HELPER create a pre-processing function that accepts either a plain string
   * or an object with the fields <code>text</code> and <code>pos</code> as input.
   *
   * For an object argument, the wrapper extracts its <code>text</code> field and,
   * if no truthy <code>pos</code> argument was supplied, falls back to the object's
   * <code>pos</code> field. The wrapped function is then invoked as
   * <code>preProcFunc.call(ctx, text, hasPos)</code>, where <code>hasPos</code>
   * is the position argument converted to a boolean.
   *
   * @param {Function} preProcFunc the preprocessing function
   * @param {any} ctx context (i.e. <code>this</code>) for executing the preprocessing function
   *
   * @returns {Function} wrapper-function <code>(thePhrase, pos)</code> for <code>preProcFunc</code>
   *                     that handles <code>Positions</code> input arguments
   *
   * @private
   * @memberOf mmir.grammar.PositionUtils
   */
  function _createPosPreProc (preProcFunc, ctx){
    return function(thePhrase, pos){
      var text = thePhrase;
      var positions = pos;
      if(typeof thePhrase === 'object'){
        // an explicitly passed (truthy) pos argument takes precedence over the object's own field
        positions = pos || thePhrase.pos;
        text = thePhrase.text;
      }
      // the wrapped function only learns WHETHER positions are requested, not the positions themselves
      return preProcFunc.call(ctx, text, !!positions);
    };
  }
  /**
   * HELPER create pre-processing function that handles string|Positions argument
   * where the pre-processing function handles single "words":
   * the input string is split by a separator expression (by default: whitespaces),
   * and then processed word by word; the position information is generated
   * automatically from the length differences between each word and its
   * processed replacement.
   *
   * The separators themselves are copied unchanged into the result.
   *
   * If position information is requested, the wrapper returns an object
   * <code>{text: string, pos: Array}</code>, otherwise the processed string.
   * This also holds for inputs that do not contain any separator (i.e. that
   * consist of a single word).
   *
   * @param {Function} wordProcFunc the preprocessing function that handles single words:
   *                                invoked as <code>wordProcFunc.call(ctx, word, hasPos)</code>
   * @param {any} ctx context (i.e. <code>this</code>) for executing the preprocessing function
   * @param {RegExp} [splitRegExp] regular expression for splitting (~ "tokenizing") words
   *                               DEFAULT: <pre>/\s+/g</pre>
   *                               NOTE if the expression is not global, a global copy of it is used
   *
   * @returns {Function} wrapper-function for <code>wordProcFunc</code> that handles <code>Positions</code>
   *                     input arguments and tracks position-modifications for <code>wordProcFunc</code>
   *
   * @private
   * @memberOf mmir.grammar.PositionUtils
   */
  function _createWordPosPreProc(wordProcFunc, ctx, splitRegExp){
    var re = splitRegExp || /\s+/g;
    if(!re.global){
      // exec() only advances through the input for global expressions:
      // with a non-global separator the loop below would find the same match forever
      var flags = typeof re.flags === 'string'? re.flags : (re.ignoreCase? 'i' : '') + (re.multiline? 'm' : '');
      re = new RegExp(re.source, flags + 'g');
    }
    return _createPosPreProc(function(str, pos){
      var result, m, i = 0;
      // the expression is shared between invocations -> always start searching from the beginning
      re.lastIndex = 0;
      while((m = re.exec(str))){
        // process the word in front of the separator & append the separator unchanged
        result = doProcWord(wordProcFunc, str, result, pos, i, m.index, m[0], ctx);
        i = m.index + m[0].length;
      }
      if(i === 0 || i < str.length){
        // either no separator was found at all (i.e. the input is one single word),
        // or there is a last word remaining after the final separator
        result = doProcWord(wordProcFunc, str, result, pos, i, str.length, '', ctx);
      }
      return result;
    }, ctx);
  }
  /**
   * HELPER process one word of the input string and append it to the result
   * that was accumulated so far.
   *
   * The word is the sub-string <code>[prev_i, index)</code> of <code>str</code>;
   * it is processed via <code>wordProcFunc.call(ctx, word, hasPos)</code> and appended
   * to the result, followed by the (unprocessed) separator <code>match_str</code>.
   *
   * @param {Function} wordProcFunc the preprocessing function that handles single words
   * @param {String} str the complete input string
   * @param {String|{text: String, pos: Array}} [result] the result accumulated so far (if any)
   * @param {any} pos if truthy, position information is tracked and the result is an object
   *                  <code>{text, pos}</code>; otherwise the result is a plain string
   * @param {Number} prev_i start index of the word within <code>str</code>
   * @param {Number} index end index (exclusive) of the word within <code>str</code>
   * @param {String} match_str the separator that follows the word (appended unchanged)
   * @param {any} ctx context (i.e. <code>this</code>) for executing <code>wordProcFunc</code>
   *
   * @returns {String|{text: String, pos: Array}} the updated result
   *
   * @private
   * @memberOf mmir.grammar.PositionUtils
   */
  function doProcWord(wordProcFunc, str, result, pos, prev_i, index, match_str, ctx){
    var word = str.substring(prev_i, index);
    var processed = wordProcFunc.call(ctx, word, !!pos);

    if(!pos){
      // plain string mode: simply concatenate
      return (result || '') + processed + match_str;
    }

    var changes = doCalcPos(word, processed);
    var tracked = result || {text: '', pos: []};
    tracked.text += processed + match_str;
    for(var k = 0; k < changes.length; ++k){
      // positions are calculated relative to the word -> shift them to the word's index in the input string
      changes[k].i += prev_i;
      tracked.pos.push(changes[k]);
    }
    return tracked;
  }
  /**
   * HELPER create the position information for a word that was replaced by
   * its processed version.
   *
   * A position entry is only created if the lengths of the two strings differ;
   * the entry's index <code>i</code> is relative to the word itself (i.e. <code>0</code>).
   *
   * @param {String} origStr the original word
   * @param {String} newStr the processed word
   *
   * @returns {Array<{i: Number, mlen: Number, len: Number}>} a list with one entry (where
   *            <code>mlen</code> is the original and <code>len</code> the new length),
   *            or an empty list if both strings have the same length
   *
   * @private
   * @memberOf mmir.grammar.PositionUtils
   */
  function doCalcPos(origStr, newStr){
    var origLen = origStr.length;
    var newLen = newStr.length;
    return origLen === newLen? [] : [{i: 0, mlen: origLen, len: newLen}];
  }
  /**
   * HELPER re-calculate the positions for 1-n steps of the pre-processing chain,
   * so that positions at step i do refer to the positions of the input-string
   * instead of the pre-processed string from step i-1
   *
   * The names of the steps are read (in order) from <code>pos._order</code>; steps
   * without a non-empty position list in <code>pos</code> are ignored.
   * Each position list is re-calculated against all position lists that precede
   * it, beginning with its direct predecessor and ending with the very first one.
   *
   * Nothing is done if <code>pos._order</code> is not an array.
   *
   * NOTE positions are changed "in-place"!
   *
   * @param {PositionsInfo} pos the positions information as processed by the {@link mmir.grammar.GrammarConverter#preproc} function
   *
   * @private
   * @memberOf mmir.grammar.PositionUtils
   */
  function _recalcProcPos(pos){
    var order = pos._order;
    if(!isArray(order)){
      return;
    }

    // gather the position lists that actually contain entries (in processing order)
    var steps = [];
    for(var k = 0, count = order.length; k < count; ++k){
      var entries = pos[order[k]];
      if(isArray(entries) && entries.length > 0){
        steps.push(entries);
      }
    }

    // the first list already refers to the input string -> start with the second one
    for(var t = 1; t < steps.length; ++t){
      // revert the effects of the preceding steps, most recent step first
      for(var s = t - 1; s >= 0; --s){
        _recalcPos(steps[s], steps[t]);
      }
    }
  }
  /**
   * HELPER re-calculate the positions in <code>targetPos</code> according to <code>sourcePos</code>:
   * i.e. re-calculate the positions in <code>targetPos</code> so, as if <code>sourcePos</code> had not been applied.
   *
   * Each position entry has the fields <code>i</code> (start index), <code>mlen</code>
   * (length of the text before the modification) and <code>len</code> (length of the
   * text after the modification).
   *
   * Both lists are traversed in parallel, front to back: while doing so, the length
   * differences (<code>len - mlen</code>) of the visited source entries are summed up,
   * and that sum is subtracted from the start index of each target entry.
   *
   * NOTE positions are changed "in-place" in targetPos
   *
   * NOTE(review): if a source entry starts after the start index of the current target
   *               entry, but before the target entry's end, the traversal for that target
   *               entry stops AFTER the source entry's length difference was added to the sum,
   *               and the same source entry is visited again for the next target entry
   *               -- verify that such overlapping entries cannot occur (or are handled as intended).
   *
   * @param {Array<Pos>} sourcePos the positions that should be used for re-calculation (e.g. from pre-processing step i-1)
   * @param {Array<Pos>} targetPos the positions that should be changed/adjusted (e.g. from pre-processing step i)
   *
   * @private
   * @memberOf mmir.grammar.PositionUtils
   */
  function _recalcPos(sourcePos, targetPos){
    var shift = 0;//<- summed up length differences of the source entries visited so far
    var srcIdx = 0;
    var srcCount = sourcePos.length;
    var tgtCount = targetPos.length;
    var src, tgt, srcStart, srcEnd, tgtEnd, delta;

    for(var tgtIdx = 0; tgtIdx < tgtCount; ++tgtIdx){

      tgt = targetPos[tgtIdx];

      //visit source entries (resuming where the previous target entry stopped)
      for(; srcIdx < srcCount; ++srcIdx){

        src = sourcePos[srcIdx];
        srcStart = src.i + shift;//<- start index of the source entry after applying the modifications
        tgtEnd = tgt.i + tgt.mlen;

        if(tgtEnd <= srcStart){
          //target entry ends before the source entry starts:
          // all source entries that could have affected the target entry were visited already
          break;
        }

        delta = src.len - src.mlen;//<- length difference due to the modification
        shift += delta;
        srcEnd = srcStart + src.len;

        if(srcEnd < tgt.i){
          //source entry ends before the target entry begins:
          // its length difference applies "in full" -> try next source entry
          continue;
        }

        if(srcStart <= tgt.i){
          //source entry starts before or together with the target entry
          if(srcStart >= tgt.i && srcEnd <= tgtEnd){
            //source entry starts with the target entry and ends within it:
            // adjust the target's modification-length, and compensate the
            // length difference that was already added to the sum above
            tgt.mlen -= delta;
            tgt.i += delta;
          } else if(srcEnd >= tgtEnd){
            //target entry ends before (or together with) the source entry:
            // compensate the length difference that was already added to the sum above
            tgt.i += delta;
          }
          continue;
        }

        //source entry starts after the target entry's start index -> continue with next target entry
        break;
      }

      tgt.i -= shift;
    }
  }
  228. /**
  * The module's public interface: an object that makes the helper functions
  * defined above available under their public names (without leading underscore).
  *
  229. * @memberOf mmir.grammar.PositionUtils
  230. */
  231. return {
  232. /**
  * Creates a wrapper for a pre-processing function, so that it can be invoked
  * with either a string or an object with the fields <code>text</code> and <code>pos</code>.
  *
  233. * @copydoc ._createPosPreProc
  234. * @public
  235. * @function
  236. * @memberOf mmir.grammar.PositionUtils
  237. */
  238. createPosPreProc: _createPosPreProc,
  239. /**
  * Creates a wrapper for a pre-processing function that handles single words:
  * the input is split by a regular expression and processed word by word.
  *
  240. * @copydoc ._createWordPosPreProc
  241. * @public
  242. * @function
  243. * @memberOf mmir.grammar.PositionUtils
  244. */
  245. createWordPosPreProc: _createWordPosPreProc,
  246. /**
  * Re-calculates (in-place) the position lists of the steps that are
  * listed in the <code>_order</code> field of the positions information.
  *
  247. * @copydoc ._recalcProcPos
  248. * @public
  249. * @function
  250. * @memberOf mmir.grammar.PositionUtils
  251. */
  252. recalcProcPos: _recalcProcPos,
  253. /**
  * Re-calculates (in-place) one list of positions with regard to
  * another list of positions.
  *
  254. * @copydoc ._recalcPos
  255. * @public
  256. * @function
  257. * @memberOf mmir.grammar.PositionUtils
  258. */
  259. recalcPos: _recalcPos
  260. }
  261. });