Source: semantic/grammarConverter.js

  1. /**
  2. * The GrammarConverter object initializes the grammar for processing
  3. * <em>natural language text</em>, e.g. from the voice recognition.
  4. *
  5. * @class
  6. * @name GrammarConverter
  7. *
  8. * @requires util/loadFile
  9. * @requires util/isArray
  10. */
  11. define(['mmirf/util/isArray', 'mmirf/util/loadFile'], function(isArray, loadFile){
  /**
   * Creates a new converter instance with its default settings:
   * no grammar definition is loaded yet, value-masking is enabled,
   * name-masking is disabled and grammar execution is marked as synchronous.
   *
   * @ignore
   *
   * @constructs GrammarConverter
   */
  function GrammarConverter(){
    this.variable_prefix = "_$";
    //matches double-quoted strings that start with the variable prefix
    this.variable_regexp = /"(_\$[^\"]*)"/igm;// /"_$([^\"]*)/igm;
    this.entry_token_field = "tok";//must consist of ASCII "word chars", i.e. no whitespaces, numbers etc.
    this.entry_index_field = "i";//must consist of ASCII "word chars", i.e. no whitespaces, numbers etc.
    this.entry_type_field = "type";//must consist of ASCII "word chars", i.e. no whitespaces, numbers etc.
    //regular expression for detecting encoded chars (see mask/unmask functions)
    //FIX: no '|' within the character class: there it is a literal pipe-character (not an alternation),
    //     i.e. the former class [0-9|A-F|a-f] also accepted sequences like "~~0|E4~~"
    this.enc_regexp_str = "~~([0-9A-Fa-f]{4})~~";
    this.jscc_grammar_definition = "";
    this.js_grammar_definition = "";
    this.json_grammar_definition = null;
    //RegExp for removing stopwords: created lazily (see parseStopWords() / getStopWordsRegExpr())
    this.stop_words_regexp = void(0);
    //RegExp for stopwords that begin/end with encoded chars: may stay undefined (see parseStopWords())
    this.stop_words_regexp_enc = void(0);
    //default setting for masking value Strings in JSON values (see maskJSON() / unmaskJSON)
    this.maskValues = true;
    //default setting for masking property-name Strings in JSON values (see maskJSON() / unmaskJSON)
    // WARNING: this is actually EXPERIMENTAL; it should be set to false, since JS/CC may not be able to handle masked ID names...
    this.maskNames = false;
    //if execution of the grammar is asynchronously done (i.e. result is delivered using a callback)
    this.is_async = false;
  }
  /**
   * Loads the JSON grammar definition via {@link #loadResource} and stores
   * the loaded data in <code>json_grammar_definition</code>.
   *
   * @param {Function} [successCallback]
   *        invoked after the grammar was stored: <code>successCallback(grammarConverter, xhr)</code>
   * @param {Function} [errorCallback]
   *        invoked if loading failed: <code>errorCallback(grammarConverter)</code>;
   *        if omitted, the failure is printed to the error console instead
   * @param {String} grammarUrl
   *        the URL of the JSON grammar
   * @param {Boolean} [doLoadSynchronously]
   *        passed on unchanged to {@link #loadResource}
   */
  GrammarConverter.prototype.loadGrammar = function(successCallback, errorCallback, grammarUrl, doLoadSynchronously){
    var converter = this;

    function onLoaded(data, status, xhr){
      converter.json_grammar_definition = data;
      if (typeof successCallback !== "function") {
        return;
      }
      successCallback.call(this, converter, xhr);
    }

    function onFailed(xhr, status, data){
      if (typeof errorCallback === "function") {
        errorCallback.call(this, converter);
        return;
      }
      console.error("failed to load the grammar! error: "+ JSON.stringify(data));
    }

    this.loadResource(onLoaded, onFailed, grammarUrl, doLoadSynchronously);
  };
  /**
   * Loads a JSON resource using the <code>loadFile</code> utility.
   *
   * If no URL is given, an error is printed and the <code>errorCallback</code> (if present)
   * is invoked with this converter as context and as its only argument.
   *
   * @param {Function} successCallback
   *        passed as <code>success</code> option to <code>loadFile</code>
   * @param {Function} errorCallback
   *        passed as <code>error</code> option to <code>loadFile</code>
   * @param {String} resourceUrl
   *        the URL of the resource
   * @param {Boolean} [doLoadSynchronously]
   *        the resource is requested asynchronously only if this is exactly <code>false</code>;
   *        for any other value (including omitted) the request is made with <code>async: false</code>
   */
  GrammarConverter.prototype.loadResource = function(successCallback, errorCallback, resourceUrl, doLoadSynchronously){

    if(resourceUrl){
      loadFile({
        async: doLoadSynchronously === false,
        dataType: 'json',
        url: resourceUrl,
        success: successCallback,
        error: errorCallback
      });
      return;
    }

    //missing URL: report the problem and notify the caller (if possible)
    console.error('GrammarConverter.loadResource: missing URL!');
    if(errorCallback){
      errorCallback.call(this, this);
    }
  };
  /**
   * Sets the stopword list of the JSON grammar and (re-)creates the RegExp(s)
   * for removing stopwords.
   *
   * If no JSON grammar exists yet, an empty one is created.
   *
   * @param {Array<String>} stopWordArray
   *        the list of stopwords
   */
  GrammarConverter.prototype.setStopWords = function(stopWordArray){

    var grammar = this.json_grammar_definition;
    if(!grammar){
      grammar = {};
      this.json_grammar_definition = grammar;
    }

    //the RegExp(s) are created from the masked stopwords
    grammar.stop_word = this.maskJSON(stopWordArray);
    this.parseStopWords();

    //use unmask-function in order to ensure masking/unmasking is reversible
    // (or in case it is not: the error will be held in property stop_word)
    grammar.stop_word = this.unmaskJSON(grammar.stop_word);
  };
  /**
   * Get the stopword list of the current JSON grammar.
   *
   * @returns {Array<String>} the <code>stop_word</code> entry of the JSON grammar,
   *          or <code>null</code> if no JSON grammar is present
   */
  GrammarConverter.prototype.getStopWords = function(){
    var grammar = this.json_grammar_definition;
    return grammar? grammar.stop_word : null;
  };
  /**
   * HELPER creates a copy of the stopword list and recodes it with {@link #maskAsUnicode},
   * i.e. non-ASCII chars are replaced by their Unicode representation
   * (e.g. for safe storage of the stringified stopword list, even if the file-encoding
   * does not support non-ASCII letters).
   *
   * @returns {Array<String>} the recoded copy of the stopword list from the current JSON grammar
   *          (or an empty list, if there is no stopword list)
   */
  GrammarConverter.prototype.getEncodedStopwords = function(){

    var stopwords = this.getStopWords();
    if(stopwords){
      //recode a shallow copy, so that the stored stopword list is left untouched
      // (the recoding is expected to modify the list that is passed in)
      return this.recodeJSON(stopwords.slice(), this.maskAsUnicode);
    }

    return [];
  };
  /**
   * Creates the RegExp(s) for stopword removal from the stopword list of the JSON grammar
   * (this is the original / main implementation for creating the stopword RegExp).
   *
   * Results are stored in
   * <ul>
   *  <li><code>stop_words_regexp</code>: matches the "normal" stopwords, delimited by word-boundaries and
   *      followed by at most one whitespace (matches only empty lines, if there are no such stopwords)</li>
   *  <li><code>stop_words_regexp_enc</code>: matches stopwords that begin or end with an encoded char,
   *      delimited by whitespace or START/END (set to <code>undefined</code>, if there are no such stopwords)</li>
   * </ul>
   *
   * A missing JSON grammar or missing stopword list is treated as an empty stopword list.
   *
   * NOTE: the stopwords are inserted into the RegExp source as they are (i.e. without escaping).
   */
  GrammarConverter.prototype.parseStopWords = function(){

    //create RegExp for stop words:
    var grammar = this.json_grammar_definition;
    //FIX: do not fail, if there is no grammar / no stopword list (yet)
    var json_stop_words = (grammar && grammar.stop_word) || [];
    var size = json_stop_words.length;
    var stop_words = "";

    //FIX for encoded chars: if a word begins or ends with an encoded char, \b cannot detect the word's boundaries
    // -> FIX if we encounter such words, create a separate RegExpr that uses
    //    whitespaces & START-/END-expression for detecting word-boundaries, i.e. something like: (\s|^)(~~ ... words ... ~~)(\s|$)
    //
    // NOTE: the word-boundaries expression \b seems to have no effect in case of non-ASCII chars in general
    //       (e.g. for Japanese characters / words)
    //       .... so we would need to use this alternative mechanism (e.g. using whitespaces & START-/END-expr.)
    //       even if these characters were not encoded!
    //
    //FIX: the tester-RegExps must NOT use the global-flag: RegExp.test() on a global RegExp continues
    //     at lastIndex of the previous successful match, i.e. testing several (different) words
    //     with the same RegExp would miss matches
    var encStartTester = new RegExp("^" + this.enc_regexp_str, "m");
    var encEndTester = new RegExp(this.enc_regexp_str + "$", "m");
    var enc_stop_words = "";
    var isEncWord = function(str){
      return encStartTester.test(str) || encEndTester.test(str);
    };

    //... the RegExp matches each stopword:
    for(var index=0; index < size; ++index){
      var stop_word = json_stop_words[index];

      //special treatment for words that begin/end with encoded chars:
      if(isEncWord(stop_word)){
        if(enc_stop_words.length === 0){
          enc_stop_words = "(\\s|^)(";
        }
        else {
          enc_stop_words += "|";
        }
        enc_stop_words += stop_word;
        continue;
      }

      //... for "normal" stopwords:
      if (stop_words.length > 0){
        stop_words += "|"; //... if there is already a previous stopword-entry: do add OR-matching ...
      }
      stop_words += stop_word; //... add the stopword "stop_word"
    }

    if(stop_words.length > 0){
      stop_words =
        "\\b(" //starting at a word-boundary (-> ignore within-word matches)
        + stop_words
        + ")"
        + "\\b" //... ending with a word-boundary -> avoid "cutting out" matching partial strings
                //    e.g. without \b: '(in)\s?' would match (and cut out all matches) within "winning" -> "wng"
        + "\\s?"; //... and optionally: one white-character that follows the stopword
    }
    else {
      //for empty stopword definition: match empty string
      // (basically: remove nothing)
      stop_words += '^$';
    }

    this.stop_words_regexp = new RegExp(stop_words,"igm"); //RegExp options:
                                                            // ignore-case (i),
                                                            // match globally i.e. all occurrences in the String (g),
                                                            // do not stop at line breaks (m)

    //only create RegExp for special stopwords, if we actually have at least 1 of those:
    //NOTE for replacement, we need to use a space-char (i.e. replace these with spaces, not empty strings: str.replace(..., ' '); )
    if(enc_stop_words.length > 0){
      enc_stop_words += ")(\\s|$)";
      this.stop_words_regexp_enc = new RegExp(enc_stop_words,"igm");
    }
    else {
      //FIX: discard RegExp from a previous invocation (i.e. for a previous stopword list)
      this.stop_words_regexp_enc = void(0);
    }
  };
  /**
   * Get the RegExp for removing "normal" stopwords.
   *
   * The RegExp is created on demand via {@link #parseStopWords}, if it does not exist yet.
   *
   * @returns {RegExp} the value of <code>stop_words_regexp</code>
   */
  GrammarConverter.prototype.getStopWordsRegExpr = function(){
    var regexp = this.stop_words_regexp;
    if(regexp){
      return regexp;
    }
    this.parseStopWords();
    return this.stop_words_regexp;
  };
  /**
   * Get the RegExp for stopwords that start or end with encoded chars (i.e. non-ASCII chars).
   *
   * If the "normal" stopword RegExp does not exist yet, {@link #parseStopWords} is invoked first.
   *
   * This RegExp may be NULL/undefined, if no stopwords exist that begin/end with encoded chars,
   * i.e. you need to check the returned value before trying to use it.
   *
   * Usage:
   * @example
   *
   * //remove normal stopwords:
   * var removedStopwordsStr = someStr.replace( gc.getStopWordsRegExpr(), '');
   *
   * var removedStopwordsStr2 = removedStopwordsStr;
   * if(gc.getStopWordsEncRegExpr()){
   *   //NOTE replace stopwords with spaces (not with empty String as above, i.e. with "normal" stopwords)
   *   removedStopwordsStr2 = removedStopwordsStr.replace( gc.getStopWordsEncRegExpr(), ' ');
   * }
   *
   * @returns {RegExp} the value of <code>stop_words_regexp_enc</code> (may be undefined)
   */
  GrammarConverter.prototype.getStopWordsEncRegExpr = function(){
    if(this.stop_words_regexp){
      return this.stop_words_regexp_enc;
    }
    this.parseStopWords();
    return this.stop_words_regexp_enc;
  };
  /**
   * Get the grammar definition text, i.e. the "source code" input for
   * the grammar compiler (syntax for jison, PEG.js or JS/CC).
   *
   * The grammar definition text is generated from the JSON grammar.
   *
   * @returns {String} the grammar definition in compiler-specific syntax
   *          (as set via {@link #setGrammarDef})
   */
  GrammarConverter.prototype.getGrammarDef = function(){
    var grammarDefinition = this.jscc_grammar_definition;
    return grammarDefinition;
  };
  /**
   * Stores the grammar definition text (the value that {@link #getGrammarDef} returns).
   *
   * This function should only be used during compilation of the JSON grammar
   * to the executable grammar.
   *
   * NOTE: Setting this "manually" will have no effect on the executable grammar.
   *
   * @see #getGrammarDef
   * @protected
   *
   * @param {String} rawGrammarSyntax
   *        the grammar definition in compiler-specific syntax
   */
  GrammarConverter.prototype.setGrammarDef = function(rawGrammarSyntax){
    var grammarDefinition = rawGrammarSyntax;
    this.jscc_grammar_definition = grammarDefinition;
  };
  /**
   * Get the compiled JavaScript grammar source code, i.e. the output of the
   * grammar compiler (with additional JavaScript "framing" in
   * SemanticInterpreter.createGrammar).
   *
   * The code needs to be eval'ed before it can be executed (eval() will add
   * the corresponding executable grammar to SemanticInterpreter).
   *
   * @returns {String} the compiled, JavaScript grammar source code
   *          (as set via {@link #setGrammarSource})
   */
  GrammarConverter.prototype.getGrammarSource = function(){
    var compiledSource = this.js_grammar_definition;
    return compiledSource;
  };
  /**
   * Stores the compiled JavaScript grammar source code
   * (the value that {@link #getGrammarSource} returns).
   *
   * @param {String} src_code
   *        the compiled, JavaScript grammar source code
   */
  GrammarConverter.prototype.setGrammarSource = function(src_code){
    var compiledSource = src_code;
    this.js_grammar_definition = compiledSource;
  };
  /**
   * Set the executable grammar function, i.e. replaces {@link #executeGrammar}
   * of this instance with <code>func</code>.
   *
   * The grammar function takes
   *  a String argument: the text that should be parsed,
   *  a Function argument: the callback for the result,
   *  where the callback itself takes 1 argument for the result: <code>callback(result)</code>
   *
   * The returned result depends on the JSON definition of the grammar:
   * <code>func(inputText, resultCallback)</code>
   *
   * @param {Function} func
   *        the executable grammar function: <code>func(string, object, function(object)) : object</code>
   * @param {Boolean} [isAsync] OPTIONAL
   *        set to TRUE, if execution is asynchronously done
   *        (the value is converted to a Boolean).
   *        DEFAULT: FALSE
   *
   * @see #executeGrammar
   * @see #isAsyncExec
   */
  GrammarConverter.prototype.setGrammarFunction = function(func, isAsync){
    this.is_async = isAsync? true : false;
    this.executeGrammar = func;
  };
  /**
   * Check if the grammar is executed asynchronously
   * (i.e. if the result is delivered using a callback).
   *
   * @returns {Boolean} the value of <code>is_async</code> (see {@link #setGrammarFunction})
   */
  GrammarConverter.prototype.isAsyncExec = function(){
    var isAsync = this.is_async;
    return isAsync;
  };
  /**
   * Removes stopwords from a phrase and trims the result.
   *
   * Processing steps:
   * <ol>
   *  <li>stopwords that begin/end with encoded chars are replaced by one space
   *      (only if a RegExp for these exists, see {@link #getStopWordsEncRegExpr})</li>
   *  <li>"normal" stopwords are removed (see {@link #getStopWordsRegExpr})</li>
   *  <li>whitespace at the end and at the beginning is removed</li>
   * </ol>
   *
   * @param {String} thePhrase
   *        the string from which to remove stopwords (and trim()'ed)
   * @param {Array<Position>} [positions] OPTIONAL
   *        if provided, the positions at which stopwords (or trimmed whitespace) were removed will be added
   *        to this array, where each position-object is comprised of
   *        <pre>
   *        {
   *          i: NUMBER     the index at which the stopword was removed (calculated w.r.t. thePhrase)
   *          mlen: NUMBER  the length of the matched string (i.e. the stopword) that was removed
   *          len: NUMBER   the length of the string that replaced the match (0 or 1)
   *        }
   *        </pre>
   *        the positions will be ordered by occurrence (i.e. by <code>pos.i</code>)
   *
   * @returns {String}
   *        the string where stopwords were removed
   */
  GrammarConverter.prototype.removeStopwords = function(thePhrase, positions){

    var stop_words_regexp = this.getStopWordsRegExpr();

    var str = thePhrase;
    var replStr,//<- replacement string used in removeFunc
      appendPos,//<- controls if position-info should append or prepended to position-list
      replOffset,//<- global offset (i.e. offset with regard to input string thePhrase)
      iCalc,//<- helper index for calculating offset in modified strings
      calcPos,//<- helper function for calculating offset in modified strings
      replPositions,//<- helper/temporary positions-array for calculating offset in modified strings
      removeFunc;//<- replacement-function that also tracks the positions that were modified (via argument positions)

    if(positions){

      //initialize helpers for tracking positions
      replOffset = 0;
      iCalc = 0;
      appendPos = true;

      removeFunc = function(){//HELPER for matched stopwords: log its position and remove it

        //String.replace() invokes this function with (match, [group1, ...,] offset, string)
        // -> the offset of the match is always the second to last argument
        var match = arguments[0];
        var offset = arguments[arguments.length-2];

        var entry = {i: calcPos(offset), mlen: match.length, len: replStr.length};
        if(appendPos){
          positions.push(entry);
        } else {
          positions.unshift(entry);
        }

        return replStr;
      };

      //HELPER converts an offset within the (already modified) string to an offset w.r.t. thePhrase,
      //       by adding the length-differences of all previous modifications that occurred before that offset
      //       (NOTE continues at the previously processed entry, i.e. must be invoked with ascending offsets)
      calcPos = function(offset){

        if(!replPositions){
          return offset;
        }

        var pos;
        for(var size = replPositions.length; iCalc < size; ++iCalc){
          pos = replPositions[iCalc];
          if(pos.i > offset + replOffset){
            break;
          }
          replOffset += pos.mlen - pos.len;
        }
        return offset + replOffset;
      };
    }

    var encoded_stop_words_regexp = this.getStopWordsEncRegExpr();

    //stopwords that begin/end with encoded chars are delimited by whitespace -> replace with a space
    replStr = ' ';
    if(encoded_stop_words_regexp){
      //FIX: use the RegExp that was retrieved via the getter (instead of accessing the field directly)
      str = str.replace(encoded_stop_words_regexp, positions? removeFunc : replStr);

      if(positions){
        //update helper variables for calculating global offset (after string was modified):
        replOffset = 0;
        iCalc = 0;
        replPositions = positions.slice(0);
      }
    }

    //"normal" stopwords are removed without replacement
    //(FIX: removed assignment to undeclared & unused variable replLen, which created an implicit global)
    replStr = '';
    str = str.replace(stop_words_regexp, positions? removeFunc : replStr);

    if(positions){
      positions.sort(function(a,b){return a.i - b.i;});//<- positions may not be ordered, if encoded_stop_words_regexp was applied

      //update helper variables for calculating global offset (after string was modified):
      replOffset = 0;
      iCalc = 0;
      replPositions = positions.slice(0);
    }

    if(positions){

      //trim with tracking of positions
      replStr = '';
      str = str.replace(/\s+$/, removeFunc);//<- trim at end

      positions.sort(function(a,b){return a.i - b.i;});//<- positions may not be ordered, if words were removed from the end of the string

      //update helper variables for calculating global offset (after string was modified):
      replOffset = 0;
      iCalc = 0;
      replPositions = positions.slice(0);
      appendPos = false;//<- prepending "start-trimming"-position may not be accurate, but should be "nearly" correct (w.r.t. to ordering by index pos.i)

      str = str.replace(/^\s+/, removeFunc);//<- trim at beginning

      positions.sort(function(a,b){return a.i - b.i;});//<- positions may not be ordered, if words were removed from the beginning of the string

    } else {

      str = str.trim();
    }

    return str;
  };
  /**
   * Apply pre-processing to the string, before applying the grammar:
   *  * mask non-ASCII characters
   *  * remove stopwords
   *
   * @param {String} thePhrase
   *        the string that should be pre-processed
   * @param {PlainObject} [pos] OPTIONAL
   *        in/out argument: if given, the pre-processor will add fields with information
   *        on how the input string <code>thePhrase</code> was modified.
   *        Namely, the position information for removed stopwords will be added to
   *        <code>pos.stopwords</code> (see {@link #removeStopwords} for more details);
   *        the stopword positions are re-calculated w.r.t. the un-masked input string,
   *        if the masking function supplied position information.
   *
   *        NOTE that this may not work, if custom <code>maskFunc</code> and/or <code>stopwordFunc</code>
   *             are provided as well: the positions can only be (re-)calculated, if these
   *             functions supply their position information.
   *
   *        May be omitted, i.e. <code>preproc(thePhrase, maskFunc, stopwordFunc)</code>.
   *
   * @param {Function} [maskFunc] OPTIONAL
   *        custom function for masking non-ASCII characters:
   *        <pre>maskFunc(inputStr : STRING [, isCalcPosition: BOOLEAN]) : STRING | {str: STRING, pos: ARRAY<POSITION>}</pre>
   *        (invoked with <code>isCalcPosition = !!pos</code>)
   *        DEFAULT: use of <code>this.maskString(thePhrase, !!pos)</code>
   *
   * @param {Function} [stopwordFunc] OPTIONAL
   *        custom function for removing stopwords
   *        <pre>stopwordFunc(inputStr : STRING [, positions: ARRAY]) : STRING</pre>
   *        (invoked with the array <code>pos.stopwords</code> as positions, if <code>pos</code> was given)
   *        DEFAULT: use of <code>this.removeStopwords(str, pos.stopwords)</code>
   *
   *        NOTE that <code>maskFunc</code> must also be specified, if this argument is used
   *
   * @returns {String} the pre-processed string (i.e. the value returned by the stopword function)
   */
  GrammarConverter.prototype.preproc = function(thePhrase, pos, maskFunc, stopwordFunc){

    //shift arguments, if pos was omitted
    if(typeof pos === 'function'){
      stopwordFunc = maskFunc;
      maskFunc = pos;
      pos = void(0);
    }

    //FIX: invoke the custom masking function with its documented 2nd argument (isCalcPosition)
    var str = maskFunc? maskFunc(thePhrase, !!pos) : this.maskString(thePhrase, !!pos);
    var maskedPos;

    //masking may return an object with position information instead of a plain string
    if(typeof str === 'object'){
      if(pos){
        maskedPos = str.pos;
      }
      str = str.str;
    }

    var stopwordPos;
    if(pos){
      stopwordPos = [];
      pos.stopwords = stopwordPos;
    }

    //FIX: invoke the custom stopword function with its documented 2nd argument (positions)
    var result = stopwordFunc? stopwordFunc(str, stopwordPos) : this.removeStopwords(str, stopwordPos);

    if(pos && stopwordPos.length > 0){

      //recalculate stopword positions w.r.t. reverted masking:
      //FIX: if the masking function did not supply position information, there is nothing to revert
      //     (i.e. do not fail when accessing the missing masking positions)
      var offset = 0, mi = 0, msize = maskedPos? maskedPos.length : 0;
      var mpos, spos, sposend, mlen, mposi;
      for(var i1=0, size1 = stopwordPos.length; i1 < size1; ++i1){

        spos = stopwordPos[i1];

        for(; mi < msize; ++mi){

          mpos = maskedPos[mi];
          mposi = mpos.i + offset;
          sposend = spos.i + spos.mlen;

          if(sposend <= mposi){
            //if stopword ends before masking starts:
            // we already tried all maskings that could have affected the stopword
            //-> continue with next stopword
            break;
          }

          mlen = mpos.len - mpos.mlen;//<- length difference due to modification
          offset += mlen;//<- offset for masked strings, after masking was applied (i.e. when stopwords are removed)

          if(mposi < spos.i){
            //if masking-position starts before stopword even begins:
            // the masking can not affect the stopword
            //-> continue with next masking position
            continue;
          }

          if(mposi + mpos.len <= sposend){
            //if masking-position occurs within stopword:
            //adjust stopword-length
            spos.mlen = spos.mlen - mlen;
            //need to "pre-adjust" index, since offset was already (in this case falsely) adjusted
            spos.i += mlen;
          } else {
            //... otherwise continue with next stopword
            break;
          }
        }

        spos.i -= offset;
      }
    }

    return result;
  };
  /**
   * Post-processes the result from the applied grammar:
   *  * un-masks non-ASCII characters, or
   *  * applies the custom recoding function, if one is given
   *
   * @param {SemanticResult} procResult
   *        the result of the grammar execution
   * @param {Function} [recodeFunc]
   *        function that recodes non-ASCII characters (or reverts the recoding):
   *        if present, the result is processed with <code>recodeJSON(procResult, recodeFunc)</code>,
   *        otherwise with <code>unmaskJSON(procResult)</code>
   *
   * @returns {SemanticResult} the value returned by the applied recoding / unmasking
   */
  GrammarConverter.prototype.postproc = function(procResult, recodeFunc){
    return recodeFunc
        ? this.recodeJSON(procResult, recodeFunc)
        //unmask previously masked non-ASCII chars in all Strings of the result:
        : this.unmaskJSON(procResult);
  };
  /**
   * Alternative implementation for removing stopwords:
   * as long as the RegExp from <code>getStopWordsRegExpr_alt()</code> matches, its
   * matches are replaced by a space and the phrase is trimmed.
   *
   * @param {String} thePhrase
   *        the string from which to remove stopwords
   * @returns {String}
   *        the string where stopwords were removed
   */
  GrammarConverter.prototype.removeStopwords_alt = function(thePhrase){

    var pattern = this.getStopWordsRegExpr_alt();
    var phrase = thePhrase;

    //repeat, since replacing/trimming may bring up new matches
    while (phrase.match(pattern) !== null) {
      phrase = phrase.replace(pattern, ' ').trim();
    }

    return phrase;
  };
  /**
   * Execute the grammar.
   *
   * This default implementation is only a stub that prints a warning:
   * the actual implementation is set via {@link #setGrammarFunction}.
   *
   * NOTE: do not use directly, but {@link mmir.SemanticInterpreter.interpret} instead,
   *       since that function applies some pre- and post-processing to the text (stopword removal,
   *       en-/decoding of special characters etc.).
   *
   * @param {String} text
   *        the text String that should be parsed.
   * @param {Object} [options]
   *        additional parsing options (some grammar engines may support further options)
   *          options.debug: BOOLEAN enable printing debug information
   *          options.trace: BOOLEAN | FUNCTION enable printing verbose/tracing information (may not be supported by the grammar engine)
   * @param {Function} [callback]
   *        if #isAsyncExec is TRUE, then executeGrammar will have no return value, but instead the result
   *        of the grammar execution is delivered by the <code>callback</code>:
   *        <pre>function callback(result){ ... }</pre>
   *        (see also description of <code>return</code> value below)
   * @returns {Object}
   *        the result of the grammar execution:
   *        <code>{phrase: STRING, phrases: ARRAY<OBJECT>, semantic: OBJECT}</code>
   *
   *        The property <code>phrase</code> contains the <code>text</code> which was matched (with removed stopwords).
   *
   *        The property <code>phrases</code> contains the matched <tt>TOKENS</tt> and <tt>UTTERANCES</tt> from
   *        the JSON definition of the grammar as properties as arrays
   *        (e.g. for 1 matched TOKEN "token": <code>{token: ["the matched text"]}</code>).
   *
   *        The returned property <code>semantic</code> depends on the JSON definition of the grammar.
   *
   *        NOTE: if #isAsyncExec is TRUE, then there will be no return value, but instead the callback
   *              is invoked with the return value.
   *
   *        NOTE: the stub itself returns nothing and does not invoke the callback.
   */
  GrammarConverter.prototype.executeGrammar = function(text, options, callback){
    var message = 'GrammarConverter.executeGrammar: this is only a stub. No grammar implementation set yet...';
    console.warn(message);
  };
  //TODO move code-wrapper generator functions to separate generator module?

  /**
   * Get code-prefix for wrapping generated, executable grammars.
   *
   * The prefix opens a function-wrapper with parameter <code>global</code>, and declares the variables
   * <code>mmirName</code>, <code>mmir</code>, <code>require</code>, <code>semanticInterpreter</code>
   * and <code>options</code> (containing the file format and execution mode).
   *
   * @param {Number} fileFormatVersion
   *        the file format (see {@link mmir.SemanticInterpreter#getFileVersion})
   * @param {String} execMode
   *        the execution mode for the generated grammar: 'sync' | 'async'
   *
   * @returns {String} the prefix code for generated grammars (i.e. prepend to generated grammar code)
   *
   * @see mmir.parser#STORAGE_CODE_WRAP_PREFIX
   */
  GrammarConverter.prototype.getCodeWrapPrefix = function(fileFormatVersion, execMode){

    var codeLines = [
      '(function(global){',
      'var mmirName = typeof MMIR_CORE_NAME === "string"? MMIR_CORE_NAME : "mmir";',
      'var mmir = global? global[mmirName] : void(0);',
      'var require = mmir && mmir.require? mmir.require : (typeof requirejs !== "undefined"? requirejs : (global? global.require : require));',
      'var semanticInterpreter = require("mmirf/semanticInterpreter");',
      'var options = {fileFormat:' + fileFormatVersion + ',execMode:' + JSON.stringify(execMode) + '};'
    ];

    //each code line is terminated by a line break (including the last one)
    return codeLines.join('\n') + '\n';
  };
  /**
   * Get code-suffix for wrapping generated, executable grammars.
   *
   * The suffix stores the stopwords in <code>options.stopwords</code>, registers the grammar function
   * via <code>semanticInterpreter.addGrammar(grammarId, grammarFunc, options)</code>, returns the
   * grammar function, and closes & invokes the function-wrapper with the global object.
   *
   * @param {Array<string>} encodedStopwords
   *        the list of encoded stopwords (see {@link #getEncodedStopwords})
   * @param {String} grammarFuncName
   *        the (variable's) name of the grammar function that was generated
   *        (and will be used in {@link #executeGrammar});
   *        inserted as code, i.e. without any escaping
   * @param {String} grammarId
   *        the ID for the grammar (e.g. language code) with which the grammar
   *        will be registered with SemanticInterpreter (see {@link mmir.SemanticInterpreter#addGrammar});
   *        inserted as (escaped) string literal
   *
   * @returns {String} the suffix code for generated grammars (i.e. append to generated grammar code)
   *
   * @see mmir.parser#STORAGE_CODE_WRAP_SUFFIX
   */
  GrammarConverter.prototype.getCodeWrapSuffix = function(encodedStopwords, grammarFuncName, grammarId){

    //store stopwords with their Unicode representation (only for non-ASCII chars)
    var stopwordsCode = JSON.stringify(encodedStopwords).replace(/\\\\u/gm,'\\u');//<- revert JSON.stringify encoding for the Unicodes

    //FIX: create the string literal for the ID via JSON.stringify, so that quotes, backslashes,
    //     line breaks etc. within the ID are escaped (instead of breaking the generated code)
    var grammarIdCode = JSON.stringify(String(grammarId));

    return '\noptions.stopwords=' + stopwordsCode + ';\n' +
      //add "self registering" for the grammar-function
      // i.e. register the grammar-function for the ID with the SemanticInterpreter
      'semanticInterpreter.addGrammar(' + grammarIdCode + ', ' + grammarFuncName + ', options);\n\n' +
      'return ' + grammarFuncName + ';\n' +
      '})(typeof window !== "undefined" ? window : typeof self !== "undefined" ? self : typeof global !== "undefined" ? global : this);\n';
  };
  //TODO move masking/recoding functions to separate utility module?

  /**
   * Masks non-ASCII characters in a string.
   *
   * Characters outside the range <code>' '</code> to <code>'~'</code> (except for tabs and line breaks)
   * are masked by replacing them with
   * <code>~~XXXX~~</code>
   * where <code>XXXX</code> is the four digit (upper case) HEX number of the character code.
   *
   * Line breaks are normalized: <code>\r</code>, <code>\n</code>, <code>\r\n</code> and <code>\n\r</code>
   * are each replaced by a single <code>\n</code>.
   *
   * <p>
   * NOTE that this function is <em>stable</em> with regard to
   *      multiple executions:
   *
   *      If the function is invoked on the returned String again, the
   *      returned String will be the same / unchanged, i.e.
   *      maskings (i.e. "~~XXXX~~") will not be masked again.
   * </p>
   * <p>
   * NOTE: currently, the masking pattern cannot be escaped,
   *       i.e. if the original String contains a substring
   *       that matches the masking pattern, it cannot
   *       be escaped, so that the unmask-function
   *       will leave it untouched.
   * </p>
   *
   * @param {String} str
   *        the String to process
   * @param {Boolean} [computePositions] OPTIONAL
   *        if truthy, the positions of the maskings are returned in addition to the masked string
   *        (may be omitted, i.e. <code>maskString(str, prefix, postfix)</code>)
   *        DEFAULT: false
   * @param {String} [prefix] OPTIONAL
   *        an alternative prefix used for masking, i.e instead of <code>~~</code>
   *        (ignored, if argument has other type than <code>string</code>)
   * @param {String} [postfix] OPTIONAL
   *        an alternative postfix used for masking, i.e instead of <code>~~</code>
   *        (ignored, if argument has other type than <code>string</code>)
   * @returns {String|{str: String, pos: ARRAY<Position>}}
   *        the masked string, or if <code>computePositions</code> was <code>true</code>
   *        a result object with
   *        <pre>
   *        {
   *          str: STRING,     // the masked string
   *          pos: [POSITION]  // array of masking-positions: {i: NUMBER, len: NUMBER, mlen: NUMBER}
   *        }
   *        </pre>
   *        where POSITION is an object with
   *        <pre>
   *        {
   *          i: NUMBER,    // the index of the masked character within the input string str
   *          len: NUMBER,  // the length of the masking sequence, i.e. prefix + 4 HEX digits + postfix
   *          mlen: NUMBER  // the length of the sub-string that was masked (always 1, i.e. 1 char)
   *        }
   *        </pre>
   *        NOTE: the normalization of line breaks is not recorded in the positions.
   */
  GrammarConverter.prototype.maskString = function (str, computePositions, prefix, postfix) {

    //shift arguments if necessary
    if(typeof computePositions === 'string'){
      postfix = prefix;
      prefix = computePositions;
      computePositions = false;
    }

    var escStart = typeof prefix === 'string'? prefix : '~~';
    var escEnd = typeof postfix === 'string'? postfix : '~~';

    var buffer = [];//<- the parts of the masked string
    var positions, maskedLength;//<- will only be used, if computePositions is set
    if(computePositions){
      maskedLength = escStart.length + 4 + escEnd.length;
      positions = [];
    }

    var cursor = 0;//<- index of the lookahead character
    var current, hex;
    var lookahead = str.charAt(cursor);

    while(cursor < str.length){

      current = lookahead;
      lookahead = str.charAt(++cursor);

      if(current === '\r' || current === '\n'){
        //line breaks: treat 2-character line breaks (CR LF or LF CR) as one line break
        if(lookahead === (current === '\r'? '\n' : '\r')){
          lookahead = str.charAt(++cursor);
        }
        buffer.push('\n');
      }
      else if(current === '\t' || (current >= ' ' && current <= '~')){
        //tabs & printable ASCII chars are not modified
        buffer.push(current);
      }
      else {
        //all other characters get masked
        if(computePositions){
          //current character is located directly before the lookahead character
          positions.push({i: cursor - 1, len: maskedLength, mlen: current.length});
        }
        hex = current.charCodeAt(0).toString(16).toUpperCase();
        //pad the HEX number with leading zeros to 4 digits
        buffer.push(escStart, '0000'.substring(hex.length), hex, escEnd);
      }
    }

    var masked = buffer.join('');
    return computePositions? {str: masked, pos: positions} : masked;
  };
  /**
   * HELPER that encodes the non-ASCII chars of a String as Unicode escape sequences,
   * i.e. as <code>\uXXXX</code> where XXXX is the Unicode HEX number of the char.
   *
   * Delegates to {@link #maskString}, using <code>\u</code> as masking-prefix and
   * an empty String as masking-postfix, i.e. this is a
   * SHORTCUT for calling <code>maskString(str, '\\u', '')</code>.
   *
   * @example
   * //for Japanese "下さい" ("please")
   * maskAsUnicode("下さい") -> "\u4E0B\u3055\u3044"
   *
   * //... whereas the default masking yields:
   * maskString("下さい") -> "~~4E0B~~~~3055~~~~3044~~"
   *
   * @param {String} str
   *            the String that should be masked
   * @returns {String} the result of <code>maskString(str, '\\u', '')</code>
   */
  GrammarConverter.prototype.maskAsUnicode = function (str) {
    //the masked sequence starts with a (literal) backslash + "u" ...
    var unicodePrefix = '\\u';
    //... and has no closing marker
    var unicodePostfix = '';
    return this.maskString(str, unicodePrefix, unicodePostfix);
  };
  /**
   * Unmasks <i>masked unicode characters</i> in a string.
   *
   * Masked unicode characters are assumed to have the pattern:
   * <code>~~XXXX~~</code>
   * where <code>XXXX</code> is the four digit unicode HEX number.
   *
   * <p>
   * NOTE that this function is <em>stable</em> with regard to
   * multiple executions, <b>IF</b> the original String <tt>str</tt> did not
   * contain a sub-string that conforms to the encoding pattern
   * (see remark for {@link #maskString}):
   *
   * If the function is invoked on the returned String again, the
   * returned String will be the same, i.e. unchanged.
   * </p>
   *
   * @param {String} str
   *            the (masked) String that should be unmasked
   * @param {Boolean} [computePositions] OPTIONAL
   *            if <code>true</code>, the position information for the unmasked
   *            sequences is returned in addition to the unmasked String.
   *            May be omitted, i.e. <code>unmaskString(str, detector)</code> is supported.
   *            DEFAULT: false
   * @param {RegExp} [detector] OPTIONAL
   *            an alternative detector-RegExp:
   *            the RegExp must contain at least one grouping which detects a unicode number (HEX),
   *            e.g. default detector is <code>~~([0-9|A-F|a-f]{4})~~</code> (note the grouping
   *            for detecting a 4-digit HEX number within the brackets).
   *            The detector may or may not have the global-flag set; matching always
   *            starts at the beginning of <code>str</code>.
   *            Matches that are empty, or for which the first grouping is not a HEX number,
   *            are ignored (i.e. the text is left unchanged).
   * @returns {String|{str: String, pos: ARRAY<Position>}}
   *            the unmasked string, or if <code>computePositions</code> was <code>true</code>
   *            a result object with
   *            <pre>
   *            {
   *              str: STRING,      // the unmasked string
   *              pos: [POSITION]   // array of unmasking-positions: {i: NUMBER, len: NUMBER, mlen: NUMBER}
   *            }
   *            </pre>
   *            where POSITION is an object with
   *            <pre>
   *            {
   *              i: NUMBER,        // the index of the masked sequence within the input string <code>str</code>
   *              len: NUMBER,      // the length after unmasking (i.e. of the unmasked character)
   *              mlen: NUMBER      // the length before unmasking (i.e. of the masked sequence within <code>str</code>)
   *            }
   *            </pre>
   */
  GrammarConverter.prototype.unmaskString = function (str, computePositions, detector) {

    var match, mlen, ch, code, positions, source = str, result = [], pos = 0, i, len = str.length;

    //shift arguments if necessary, i.e. invoked as unmaskString(str, detector)
    if(typeof computePositions === 'object'){
      detector = computePositions;
      computePositions = false;
    }

    if(computePositions){
      positions = [];
    }

    //RegExpr for: ~~XXXX~~
    // where XXXX is the unicode HEX number: ~~([0-9|A-F|a-f]{4})~~
    var REGEXPR_ESC;
    if(detector){
      if(detector.global){
        REGEXPR_ESC = detector;
      } else {
        //exec() only advances through the string for global RegExps:
        // a non-global detector would return the first match over and over again (-> endless loop)
        var flags = typeof detector.flags === 'string'? detector.flags : (detector.ignoreCase? 'i' : '') + (detector.multiline? 'm' : '');
        REGEXPR_ESC = new RegExp(detector.source, flags + 'g');
      }
      //a (re-used) detector may carry a stale position from a previous execution
      REGEXPR_ESC.lastIndex = 0;
    } else {
      REGEXPR_ESC = new RegExp( this.enc_regexp_str, "igm");
    }

    while( (match = REGEXPR_ESC.exec(source)) ){

      i = match.index;
      mlen = match[0].length;

      if(mlen === 0){
        //empty match: nothing to unmask, but the RegExp must be advanced manually (otherwise: endless loop)
        ++REGEXPR_ESC.lastIndex;
        continue;
      }

      code = parseInt(match[1], 16);
      if(isNaN(code)){
        //no (valid) HEX number in the first grouping: leave the text as it is
        continue;
      }

      //add previous:
      if(i > pos){
        result.push(source.substring(pos, i));
      }

      //add matched ESC as UNICODE:
      ch = String.fromCharCode(code);
      result.push(ch);

      //update position:
      pos = i + mlen;

      if(computePositions){
        //store position information for the unmasking:
        // i: position in the input (i.e. masked) string
        // len: modified length of the string, i.e. the length of the unmasked string
        // mlen: original length of the string, i.e. the length of the masked string, that got unmasked
        positions.push({i: i, len: ch.length, mlen: mlen});
      }
    }

    //add the remaining text after the last masked sequence
    if(pos < len){
      result.push(source.substring(pos));
    }

    if(computePositions){
      return {str: result.join(''), pos: positions};
    }
    return result.join('');
  };
  /**
   * Masks the Strings of a JSON-like object:
   * SHORTCUT for calling <code>recodeJSON(json, this.maskString, isMaskValues, isMaskNames)</code>.
   *
   * @param {Object} json
   *            the JSON-like object
   * @param {Boolean} [isMaskValues] OPTIONAL
   *            passed on to {@link #recodeJSON}
   * @param {Boolean} [isMaskNames] OPTIONAL
   *            passed on to {@link #recodeJSON}
   * @returns {Object} the result of {@link #recodeJSON}
   */
  GrammarConverter.prototype.maskJSON = function (json, isMaskValues, isMaskNames) {
    //use the masking function as "recoder"
    var recoder = this.maskString;
    return this.recodeJSON(json, recoder, isMaskValues, isMaskNames);
  };
  /**
   * Unmasks the Strings of a JSON-like object:
   * SHORTCUT for calling <code>recodeJSON(json, this.unmaskString, isMaskValues, isMaskNames)</code>.
   *
   * @param {Object} json
   *            the JSON-like object
   * @param {Boolean} [isMaskValues] OPTIONAL
   *            passed on to {@link #recodeJSON}
   * @param {Boolean} [isMaskNames] OPTIONAL
   *            passed on to {@link #recodeJSON}
   * @returns {Object} the result of {@link #recodeJSON}
   */
  GrammarConverter.prototype.unmaskJSON = function (json, isMaskValues, isMaskNames) {
    //use the unmasking function as "recoder"
    var recoder = this.unmaskString;
    return this.recodeJSON(json, recoder, isMaskValues, isMaskNames);
  };
  /**
   * Recodes Strings of a JSON-like object.
   *
   * NOTE: the recoding happens "in-place", i.e. the object itself is modified.
   *
   * If the object has a property <code>phrases</code> (with an object value), the contained
   * token-objects <code>{tok: STRING, i: NUMBER}</code> are treated specially:
   * after recoding the <code>tok</code> Strings, the index <code>i</code> of the tokens is
   * adjusted w.r.t. the length-changes of the (recoded) tokens that precede them.
   *
   * @function
   * @param {Object} json
   *            the JSON-like object (i.e. PlainObject).
   *            NOTE: if <code>json</code> itself is a String (and String values are recoded),
   *                  the result of <code>recodeFunc(json, true)</code> is returned as is.
   *
   * @param {Function} recodeFunc
   *            the "recoding" function for modifying String values:
   *            it is invoked as <code>recodeFunc(String, true)</code> for String values, and
   *            as <code>recodeFunc(String)</code> for property names, and must return
   *            either the recoded String, or a result object <code>{str: String}</code>
   *            that contains the recoded String.
   *            The function is invoked in context of the GrammarConverter object.
   *            Example: this.maskString.
   *            See {@link #maskString}.
   *
   * @param {Boolean} [isMaskValues] OPTIONAL
   *            if true, the object's property String values will be processed
   *            NOTE: in case this parameter is specified, then <code>recodeFunc</code> must
   *                  also be specified!
   *            DEFAULT: uses property {@link #maskValues}
   * @param {Boolean} [isMaskNames] OPTIONAL
   *            if true, the property names will be processed
   *            NOTE: in case this parameter is specified, then <code>recodeFunc</code> and
   *                  <code>isMaskValues</code> must also be specified!
   *            DEFAULT: uses property {@link #maskNames}
   *
   * @returns {Object} the recoded JSON object
   *
   * @requires util/isArray
   */
  GrammarConverter.prototype.recodeJSON = (function (isArray) {//<- NOTE this is only the initializer (i.e. see returned function below)

    /**
     * HELPER for sorting position objects (ascending by the index of their token)
     *
     * @private
     */
    var sortPosFunc = function(pos1, pos2){
      return pos1.target.i - pos2.target.i;
    };

    /**
     * HELPER for extracting the recoded String from the result of a recoding function
     *
     * @param {StringResult|String} recoded
     *            the recoding-result, either a String or <pre>{str: STRING, pos: ARRAY<POSITION>}</pre>
     * @returns {String} the recoded String, or undefined if there is none
     * @private
     */
    var getRecodedStr = function(recoded){
      if(typeof recoded === 'string'){
        return recoded;
      }
      //NOTE need to check for null, since null is typeof object
      if(recoded && typeof recoded.str === 'string'){
        return recoded.str;
      }
      return void(0);
    };

    /**
     * HELPER for setting a recoded string value
     *
     * @param {StringResult|String} recodedVal
     *            the recoding-result:
     *            <pre>{str: STRING, pos: ARRAY<POSITION>}</pre>
     *
     *            If it does not contain a String, nothing will be done
     *
     * @param {any} origVal
     *            the original value (i.e. "un-recoded"):
     *            if it is not a String, nothing will be done
     *
     * @param {Object} obj
     *            the parent-object for the recoded string property
     *
     * @param {String} pname
     *            the property name in the parent-object for the recoded string property
     *
     * @param {Array<Position>} [recodedPositions] OPTIONAL
     *            if present, the modification information of the recoding will be added to the array
     *            The elements of the array:
     *            <pre>
     *            {
     *              target: Token,    // the token that was modified/recoded
     *              mlen: NUMBER      // the length of the un-modified string (i.e. before recoding)
     *            }
     *            </pre>
     *            where Token:
     *            <pre>
     *            {
     *              i: NUMBER,        // the index of the token w.r.t. to the input string
     *              tok: STRING,      // the (recoded/modified) token
     *            }
     *            </pre>
     * @private
     */
    var setRecodedVal = function(recodedVal, origVal, obj, pname, recodedPositions){

      //only String values get recoded:
      // for all other values (null, numbers, nested objects/arrays ...) there is nothing to set;
      // especially, a nested object with an own "str" property must not be mistaken for a recoding-result
      if(typeof origVal !== 'string'){
        return;
      }

      var recVal = getRecodedStr(recodedVal);

      //only set, if there was a recoding:
      if(typeof recVal !== 'string'){
        return;
      }

      if(origVal !== recVal){
        //set recoded value
        obj[pname] = recVal;
      }

      //special treatment for token-objects, i.e.
      // {
      //   tok: STRING,
      //   i: NUMBER
      // }
      //
      // -> store some information for recalculating the index, in case tokens were recoded
      if(pname === 'tok' && typeof obj.i === 'number' && recodedPositions){
        recodedPositions.push({target: obj, mlen: origVal.length});
      }
    };

    /**
     * HELPER for adjusting the index-information in token-objects of an SemanticResult
     * (w.r.t. recoded tokens).
     *
     * @param {Array} recodedPositions
     *            the list with modification information w.r.t. the tokens (as created by setRecodedVal)
     *
     * @see #setRecodedVal
     * @private
     */
    var recalculatePos = function(recodedPositions){

      if(recodedPositions && recodedPositions.length > 0){

        recodedPositions.sort(sortPosFunc);

        var repos, token;
        //accumulated length-difference of all preceding tokens
        var offset = 0;
        for(var i=0, size = recodedPositions.length; i < size; ++i){
          repos = recodedPositions[i];
          token = repos.target;
          token.i -= offset;
          offset += repos.mlen - token.tok.length;
        }
      }
    };

    /**
     * Recursive processing for an object / recoding a JSON-like object.
     * NOTE: the recoding happens "in-place", i.e. the object itself is modified
     *
     * See doc of recodeJSON() for details w.r.t. the arguments
     *
     * NOTE: argument recodedPositions is an internal (OPTIONAL) parameter
     *       that is used when recoding SemanticResult objects (applied grammar)
     *
     * @returns {PlainObject} the object where its string-values are recoded
     * @private
     */
    var processJSON = function(obj, recodeFunc, isMaskValues, isMaskNames, recodedPositions){

      //different treatments for: STRING, ARRAY, OBJECT types (and 'REST' type, i.e. all others)
      if(typeof obj === 'string' && isMaskValues){
        //STRING: encode the string
        return recodeFunc.call(this, obj, true);
      }
      else if( isArray(obj) ) {

        //ARRAY: process all entries:
        for(var i=0, size = obj.length; i < size; ++i){
          var entry = obj[i];
          var recodedEntry = processJSON.call(this, entry, recodeFunc, isMaskValues, isMaskNames, recodedPositions);
          setRecodedVal(recodedEntry, entry, obj, i, recodedPositions);
        }
        return obj;
      }
      else if(obj === null) {//NOTE null is typeof object!
        return null;
      }
      else if(typeof obj === 'object') {

        //OBJECT: process all the object's properties (but only, if they are not inherited)
        //NOTE uses a snapshot of the property names, since properties may get renamed during processing
        var names = Object.keys(obj);
        for(var j=0, nsize = names.length; j < nsize; ++j){

          var p = names[j];
          var pv = obj[p];

          //special treatment for token-lists, i.e. elements like:
          //
          // phrases: {
          //   token1:[
          //     {
          //       tok: STRING,
          //       i: NUMBER
          //     },
          //     ...
          //   ]
          //   token2:
          //     ...
          // }
          //
          // -> create list for storing some information for recalculating the index, in case tokens were recoded
          var isCalcPos = !recodedPositions && p === 'phrases' && typeof pv === 'object' && pv !== null;
          var childPositions = isCalcPos? [] : recodedPositions;

          var pvn = processJSON.call(this, pv, recodeFunc, isMaskValues, isMaskNames, childPositions);
          setRecodedVal(pvn, pv, obj, p, childPositions);

          if(isCalcPos){
            recalculatePos(childPositions);
          }

          //if the property-name should also be encoded:
          if(isMaskNames){

            //NOTE the recoding function may return the String itself, or a result object {str: STRING}
            var maskedName = getRecodedStr(recodeFunc.call(this, p));
            if(typeof maskedName === 'string' && maskedName !== p){
              obj[maskedName] = obj[p];
              delete obj[p];
            }
          }
        }
        return obj;
      }
      else {
        //REST (numbers, booleans, un-processed strings ...): leave unchanged
        return obj;
      }
    };

    return function (json, recodeFunc, isMaskValues, isMaskNames){

      //evaluate arguments:
      if(typeof isMaskValues === 'undefined'){
        isMaskValues = this.maskValues;
      }
      if(typeof isMaskNames === 'undefined'){
        isMaskNames = this.maskNames;
      }

      return processJSON.call(this, json, recodeFunc, isMaskValues, isMaskNames);
    };

  })(isArray);//<- dependency util/isArray
  1102. return GrammarConverter;
  1103. });//END: define(..., function(){