// Source wiki page: User:Stevage/ANTLR
/*
 * ANTLR 3 combined grammar (output=AST) for a subset of MediaWiki markup:
 * redirects, headers, lists, space-indented blocks, internal/external/magic
 * links, inline images, bold/italics and <nowiki>/<pre> handling.
 *
 * Review notes for this revision:
 *  - One construct per line again, so that "//" comments no longer comment
 *    out the tokens/rules that follow them.
 *  - The duplicate IMG entry in tokens{} was removed.
 *  - spaceline and image_caption match "inline_text?" and now also rewrite
 *    with "inline_text?", so an absent inline_text no longer leaves the
 *    rewrite referencing an empty element.
 *  - NOWIKI_ACTUAL / NOWIKI_OFF_ACTUAL literals restored; NOWIKI_OFF now also
 *    clears in_noparse (NOWIKI sets it, and PRE_CLOSE already clears it).
 *  - Boolean "|" in semantic predicates replaced by "||".
 */
grammar headerline10; /* Bugs: at start of article */

options {output=AST;}

tokens {
    ARTICLE;
    START;
    INTERNAL_LINK;
    ENTITY;
    RD;
    H1;
    H2;
    H3;
    H4;
    H5;
    H6;
    HR;
    P;
    IMG;
    NBSP;
    PROTOCOL;
    TEXT;               // all subnodes should be concatenated
    BALD_URL;
    EXTERNAL_LINK;
    IMG_OPTION;
    IMG_OPTION_THUMBNAIL;
    ITALICS;
    BOLD;
    B_ON;
    B_OFF;
    BOLD_ITALICS;
    I_ON;
    I_OFF;
    BR;
    UL;
    OL;
    DL;
    LI;
    DD;
    DT;
    ISBN;
    RFC;
    PMID;
    PRE;
    NBSP160;            // a real nbsp (U+00A0)
}

// Parser state. The flags are toggled by rule @init/finally blocks and read
// by the gated semantic predicates of the "literal_*" rules further down.
@members {
    String _mw_image_namespace = "image";
    boolean in_external_link=false;
    boolean in_header=false;
    boolean in_internal_link_caption=false;
    boolean in_defined_term=false;
    boolean text_bold=false;
    boolean text_italics=false;
    boolean literal_whitespace=false;
    int caption_levels = 0;
    int text_levels =0;

    // True when the text of the next token equals mw, ignoring case.
    boolean textis(String mw) {
        return input.LT(1).getText().equalsIgnoreCase(mw);
    }
}

// Lexer state: in_noparse is set while inside <nowiki> or <pre>.
@lexer::members {
    boolean in_nowiki = false;
    boolean in_noparse = false;
    boolean in_pre = false;
    boolean in_html = false;
    boolean in_listprefix = false;
}

start   : (redirect | article) -> ^(START redirect? article?);

//////////////////////////////////////////////////////////////////////

redirect
    : REDIRECT SPACE+ internal_link (ws? ((article)=>article)?)
        -> ^(RD internal_link article?)
    ;

// Only recognized at the very first character of the input.
REDIRECT
    : {this.getCharPositionInLine()==0 && this.getLine()==1}? => '#REDIRECT'
    ;

//////////////////////////////////////////////////////////////////////

article
    : (N*) (line N paragraph_separator)*
        -> ^(ARTICLE (line paragraph_separator)* )
    ;

paragraph_separator
    : pn*
    ;

pn  : N close_bold_italics -> close_bold_italics BR
    ;

// Matches nothing; the @after action resets the bold/italics state.
close_bold_italics
@after {text_bold=false; text_italics = false;}
    :
    /*
      {text_bold==true && text_italics==true}? => -> B_OFF I_OFF
     |{text_bold==false && text_italics==true}? => -> I_OFF
     |{text_bold==true && text_italics==false}? => -> B_OFF
    */
    ;

line
    : /* (simple_text) => paragraph^ | */
      (headerline) => headerline^
    | (listmarker) => listline^
    | (hrline) => hrline^
    | (spaceline) => spaceline^
    | paragraph^
    ;

////////////////////////// Lists ////////////////////////////////

listline
    : bullet_list_item
    | ordered_list_item
    | indent_list_item
    | definition_item
    ;

listprefix
    : (listmarker)+
    ;

bullet_list_item
    : ASTERISK
      ( (listmarker) => listline -> ^(UL listline)
      | inline_text -> ^(UL inline_text)
      | -> ^(UL)
      )
    ;

ordered_list_item
    : HASH
      ( (listmarker) => listline -> ^(OL listline)
      | inline_text -> ^(OL inline_text)
      | -> ^(OL)
      )
    ;

indent_list_item
    : COLON
      ( (listmarker) => listline -> ^(DD listline)
      | ws? inline_text -> ^(DD inline_text)
      | -> ^(DD)
      )
    ;

definition_item
@init {in_defined_term = true;}
    : SEMICOLON ws?
      ( term=inline_text {in_defined_term=false;}
        ( (COLON) => COLON ws? def=inline_text -> ^(DT $term ^(DD $def))
        | (N COLON) => N indent_list_item -> ^(DT $term indent_list_item)
        | -> ^(DT $term)
        )
      | -> ^(DT)
      )
    ;
    finally {in_defined_term = false;}

listmarker
    : HASH | ASTERISK | COLON | SEMICOLON
    ;

/////////////////////////// Space blocks ///////////////////

// inline_text is optional on the left-hand side, so the rewrite references
// it as optional as well.
spaceline
@init {literal_whitespace = true;}
    : SPACE printing_ws? inline_text? -> ^(PRE printing_ws? inline_text?)
    ;
    finally {literal_whitespace = false;}

////////////////////////// Headers /////////////////////////////////

headerline
    : /* {this.in_header = true;}*/
      ( (header6) => header6^
      | (header5) => header5^
      | (header4) => header4^
      | (header3) => header3^
      | (header2) => header2^
      | (header1) => header1^
      )
      /* {this.in_header = false;} */
    ;

header6
    : EQUALS EQUALS EQUALS EQUALS EQUALS EQUALS a+=EQUALS* header_simple_text b+=EQUALS* EQUALS EQUALS EQUALS EQUALS EQUALS EQUALS
        -> ^(H6 $a* header_simple_text $b*)
    ;

header5
    : EQUALS EQUALS EQUALS EQUALS EQUALS a+=EQUALS* header_simple_text b+=EQUALS* EQUALS EQUALS EQUALS EQUALS EQUALS
        -> ^(H5 $a* header_simple_text $b*)
    ;

header4
    : EQUALS EQUALS EQUALS EQUALS a+=EQUALS* header_simple_text b+=EQUALS* EQUALS EQUALS EQUALS EQUALS
        -> ^(H4 $a* header_simple_text $b*)
    ;

header3
    : EQUALS EQUALS EQUALS a+=EQUALS* header_simple_text b+=EQUALS* EQUALS EQUALS EQUALS
        -> ^(H3 $a* header_simple_text $b*)
    ;

header2
    : EQUALS EQUALS a+=EQUALS* header_simple_text b+=EQUALS* EQUALS EQUALS
        -> ^(H2 $a* header_simple_text $b*)
    ;

header1
    : EQUALS a+=EQUALS* header_simple_text b+=EQUALS* EQUALS
        -> ^(H1 $a* header_simple_text $b*)
    ;

hrline
    : HYPHEN HYPHEN HYPHEN HYPHEN HYPHEN* -> ^(HR)
    ;

///////////////////////////////////////////////////////////////////////

internal_link
    : link_start pagename (PIPE link_caption)? link_end ((letters)=>link_trail)?
        -> ^(INTERNAL_LINK pagename ^(TEXT link_caption? link_trail?))
    ;

link_caption
@init {this.caption_levels++; in_internal_link_caption = true;}
    : simple_text
    ;
    finally {this.caption_levels--; in_internal_link_caption = false;}

link_trail
    : letters
    ;

////////////////////////////////////////////////////////////////////////
////////////////////////// Images //////////////////////////////////////////

imageinline
    : (link_start image_namespace) =>
      link_start image_namespace COLON ws? imagename ( PIPE optionorcaption )* link_end
        -> ^(IMG ^(TEXT image_namespace COLON imagename) optionorcaption*)
    ;

imagename
    : pagename ws? DOT ws? imageextension
    ;

/* Future passes/actions etc can readily retrieve the extension text, so just validate for now? */
imageextension
    : {textis("jpeg") || textis("jpg") || textis("png") || textis("svg") || textis("gif") || textis("bmp")}? letters
    ;

optionorcaption
    : (imagemodeautothumb (PIPE | link_end)) => imagemodeautothumb /* move it up here as it's so common */
    | (SPACE | punctuation) => image_caption
    | (imageoption (PIPE | link_end)) => imageoption
    | image_caption
    ;

// inline_text is optional on the left-hand side, so the rewrite references
// it as optional as well.
image_caption
@init {this.caption_levels++;}
    : inline_text? -> ^(TEXT inline_text?)
    ;
    finally {this.caption_levels-- ;}

imageoption
    : imagemodemanualthumb
    | imagemodeautothumb
    | imagemodeframe
    | imagemodeframeless
    //| imagemodepage /* something weird about this one but I don't know what. */
    | imagemodeupright
    | imagemodeborder
    | imagesizeparameter
    | imagealignleft
    | imagealigncenter
    | imagealignright
    | imagealignnone
    | imagevalignbaseline
    | imagevalignsub
    | imagevalignsuper
    | imagevaligntop
    | imagevaligntexttop
    | imagevalignmiddle
    | imagevalignbottom
    | imagevaligntextbottom
    ;

imagemodemanualthumb:   mw_img_manualthumb;
imagemodeautothumb:     mw_img_thumbnail;
imagemodeframe:         mw_img_frame;
imagemodeframeless:     mw_img_frameless;
imagemodepage:          mw_img_page;
imagemodeupright:       mw_img_upright;
imagemodeborder:        mw_img_border;
imagesizeparameter:     positive_int mw_img_width;
imagealignleft:         mw_img_left ;
imagealigncenter:       mw_img_center ;
imagealignright:        mw_img_right ;
imagealignnone:         mw_img_none;
imagevalignbaseline:    mw_img_baseline ;
imagevalignsub:         mw_img_sub;
imagevalignsuper:       mw_img_super;
imagevaligntop:         mw_img_top;
imagevaligntexttop:     mw_img_text_top;
imagevalignmiddle:      mw_img_middle;
imagevalignbottom:      mw_img_bottom;
imagevaligntextbottom:  mw_img_text_bottom;

/* default settings: */
/* Hmm, user-definable grammar seems to be a bad idea. Assume that the
   img_manualthumb is always something followed by the name. */
mw_img_manualthumb
    : {textis("thumbnail") || textis("thumb")}? mwletters EQUALS imagename
        -> ^(IMG_OPTION_THUMBNAIL imagename)
    ;
mw_img_thumbnail
    : {textis("thumbnail") || textis("thumb")}? mwletters
        -> ^(IMG_OPTION_THUMBNAIL)
    ;
mw_img_frame        : {textis("framed") || textis("enframed") || textis("frame")}? mwletters; //'framed' | 'enframed' | 'frame';
mw_img_frameless    : {textis("frameless")}? mwletters;
mw_img_page         : {textis("page")}? mwletters (SPACE | EQUALS) mwletters; //'page=$1' | 'page $1' ; /*??? (where is this used?);*/
mw_img_upright      : {textis("upright")}? mwletters EQUALS? positive_int?; //'upright' ( '='? POSITIVE_INT)?;
mw_img_border       : {textis("border")}? mwletters;
mw_img_width        : {textis("px")}? mwletters;
mw_img_baseline     : {textis("baseline")}? mwletters;
mw_img_sub          : {textis("sub")}? mwletters;
mw_img_super        : {textis("super") || textis("sup")}? mwletters;
mw_img_top          : {textis("top")}? mwletters;
mw_img_text_top     : {textis("text-top")}? mwletters;
mw_img_middle       : {textis("middle")}? mwletters;
mw_img_bottom       : {textis("bottom")}? mwletters;
mw_img_text_bottom  : {textis("text-bottom")}? mwletters;
mw_img_left         : {textis("left")}? mwletters;
mw_img_center       : {textis("center") || textis("centre")}? mwletters;
mw_img_right        : {textis("right")}? mwletters;
mw_img_none         : {textis("none")}? mwletters;

image_namespace     : {textis(_mw_image_namespace)}? mwletters;

///////////////////////////// external links ///////////////////////////////

external_link
    : bald_url -> ^(EXTERNAL_LINK ^(TEXT bald_url) ^(TEXT bald_url)) //attempt to use url as caption
    | explicit_url -> ^(EXTERNAL_LINK explicit_url)
    ;

bald_url
    : protocol COLON SLASH SLASH letters DOT letters   // http://foo.com (minimum)
      ((DOT letters)=>DOT letters)*                    // .lom.wom ...
      ((SLASH letters)=>SLASH letters)*                // /thing/other/docs ...
      (SLASH)=>SLASH?                                  // /
    ;

explicit_url
    : LEFT_BRACKET bald_url (ws external_link_caption)? RIGHT_BRACKET
        -> ^(TEXT bald_url) ^(TEXT external_link_caption?)
    ;

protocol
    : {textis("ftp") || textis("http")}? letters
    ;

external_link_caption
@init {this.in_external_link=true;}
    : simple_text
    ;
    finally {this.in_external_link=false;}

//////////////////////////// magic links ////////////////////////////////

magic_link
    : isbn_link | pmid_link | rfc_link
    ;

accidental_magic_link
    : isbn_accidental | pmid_accidental | rfc_accidental
    ;

isbn_link:          ISBN_LINK -> ^(ISBN ISBN_LINK);
isbn_accidental:    ISBN_LINK -> ^(TEXT ISBN_LINK); // the TEXT node is possibly superfluous?
rfc_link:           RFC_LINK -> ^(RFC RFC_LINK);
rfc_accidental:     RFC_LINK -> ^(TEXT RFC_LINK);
pmid_link:          PMID_LINK -> ^(PMID PMID_LINK);
pmid_accidental:    PMID_LINK -> ^(TEXT PMID_LINK);

////////////////////////////////////////////////////////////////////////

paragraph
    : ws? /* !!!! */ inline_text -> ^(P inline_text)
    ;

inline_text
@init { text_levels++; }
    : (
        /*(complex_inline_elem | simple_inline_elem) =>*/
        /* (complex_inline_elem |simple_inline_elem ) ws?*/
        // [[http://foo.com]] has to be treated as: [, [http;//foo.com], ]
        ( (LEFT_BRACKET LEFT_BRACKET LEFT_BRACKET) => literal_left_bracket // try and save it some time on [[[foo]]]?
        | (literal_left_bracket explicit_url) => literal_left_bracket
        | /*(imageinline) =>*/ imageinline
        | (external_link) => external_link
        | (internal_link) => internal_link
        | (magic_link) => magic_link
        | pre_block
        | (simple_inline_elem) =>simple_inline_elem
        )
        ((nbsp_before_punctuation) => nbsp_before_punctuation)*
        ((ws) =>printing_ws)?
      )+
    ;
    finally { text_levels --;}

simple_text
@init { text_levels++; }
    : ( (simple_inline_elem) => simple_inline_elem
        // (nbsp_before_punctuation)?
        ((printing_ws) => printing_ws)? //ws?
      )+
    ;
    finally { text_levels --; }

simple_inline_elem
    : ( (accidental_magic_link) => accidental_magic_link
      | punctuation_before_nbsp
      | (APOSTROPHES) => bold_and_italics
      | ((nbsp_before_punctuation) => nbsp_before_punctuation)+
      | really_basic_elem
      )
    ;

pre_block
    : PRE_OPEN pre_block_body PRE_CLOSE -> ^(PRE pre_block_body)
    ;

pre_block_body
    : (pre_ws* really_basic_elem*)+
    ;

really_basic_elem
    : ( html_dangerous
      | punctuation /* if punctuation+, risk of swallowing too many characters: [[[foo.jpg]]] needs to swallow just one */
      | letters
      | digits
      )
    ;

/*textline: simple_text -> ^(P simple_text);*/

///////////////////////////////////////////////////////////////////////////

// Each alternative is selected by the length of the apostrophe run and the
// current bold/italics state, and flips that state in its action.
bold_and_italics
    : {textis("''") && text_italics}? => APOSTROPHES {text_italics=false;} -> ^(I_OFF)
    | {textis("''") && !text_italics}? => APOSTROPHES {text_italics=true;} -> ^(I_ON)
    | {textis("'''") && text_bold}? => APOSTROPHES {text_bold=false;} -> ^(B_OFF)
    | {textis("'''") && !text_bold}? => APOSTROPHES {text_bold=true;} -> ^(B_ON)
    | {textis("''''") && text_bold}? => APOSTROPHES {text_bold=false;} -> APOSTROPHE ^(B_OFF)
    | {textis("''''") && !text_bold}? => APOSTROPHES {text_bold=true;} -> APOSTROPHE ^(B_ON)
    | {textis("'''''") && text_bold && text_italics}? => APOSTROPHES {text_bold=false; text_italics=false; } -> ^(B_OFF) ^(I_OFF)
    | {textis("'''''") && text_bold && !text_italics}? => APOSTROPHES {text_bold=false; text_italics=true; } -> ^(B_OFF) ^(I_ON)
    | {textis("'''''") && !text_bold && text_italics}? => APOSTROPHES {text_bold=true; text_italics=false; } -> ^(B_ON) ^(I_OFF)
    | {textis("'''''") && !text_bold && !text_italics}? =>APOSTROPHES {text_bold=true; text_italics=true; } -> ^(B_ON) ^(I_ON)
    // Hopefully we never get more than 6 or less than 2. The lexer should take care of that.
    ;

////////////////////////Nbsp punctuation/////////////////////////////////

nbsp_before_punctuation
    : SPACE
      ( '»' -> NBSP160 '»'
      | QUESTION -> NBSP160 QUESTION
      | COLON -> NBSP160 COLON
      | SEMICOLON -> NBSP160 SEMICOLON
      | '!' -> NBSP160 '!'
      | '%' -> NBSP160 '%'
      )
    ;
// SPACE x=('»' | QUESTION | COLON | SEMICOLON | '!' | '%') -> NBSP160 $x;
/*{input.LA(2) == '?'}? => */
// (SPACE ('»' | '?' | COLON | SEMICOLON | '!' | '%')) => SPACE -> NBSP160;

punctuation_before_nbsp
    : '«' SPACE -> '«' NBSP160
    ;
//«»

//////////////////////////////////////////////////////////////////////////

pagename
    : pagename_elem
      ( (pagename_elem) => pagename_elem
      | (SPACE) => SPACE
      )*
    ;

pagename_elem
    : (letters | accidental_magic_link/* | DIGITS | DOT | UNDERSCORE | HYPHEN | OPEN_PAREN | CLOSE_PAREN*/)
    ;

/////////////////////////////////// Very basic types ///////////////////////////////////////

/* Currently doesn't support equals during a header title...*/
header_simple_text
@init {this.in_header=true;}
    : inline_text /* Pretty much everything seems to be tolerated in headings. (!) */
    ;
    finally {this.in_header=false;}

// any need for accidental_magic_link?
mwletters
    : letters (letters | HYPHEN | UNDERSCORE | (digits)=>positive_int)*
    ;

/////////////////////////////////// Semi-literals, literal sets etc ///////////////////////////

punctuation
    : DOT | digits | COMMA | OPEN_PAREN | CLOSE_PAREN | HASH | HYPHEN | ASTERISK
    | UNDERSCORE | SLASH | SEMICOLON | APOSTROPHE | QUESTION
    | literal_left_bracket | literal_right_bracket | literal_pipe
    | literal_equals | literal_colon
    | '!' | '@' | '$' | '%' | '^' | '`' | '~' | '\\' | '«' | '»'
    ;
//  |/*LINK_START |*/ /*LINK_END | */UNKNOWN ;

html_dangerous
    : LT -> ^(ENTITY LT)
    | GT -> ^(ENTITY GT)
    | AMP -> ^(ENTITY AMP)
    ;

letters
    : (LETTERS)
    ;

positive_int
    : digits /* needs to be refined to remove 0s at start */
    ;

literal_link_end
    : {caption_levels==0}? => link_end
    ;

// Strangely enough, a literal pipe has to be allowed in an internal link caption: [[foo|bar|wa]]
// It would be good if this behaviour were proscribed to allow for future options
literal_pipe
    : {caption_levels==0 || in_internal_link_caption}? => PIPE
    ;

/* Three ways of getting a literal right bracket:
   1) You're neither in an external nor internal link: foo]
   2) You're in an internal link, and not followed by another right bracket: [[Boop|here] see?]]
   3) You're in a nowiki block: [http://square.bracket.com The <nowiki>]</nowiki> foundation.]
*/
literal_right_bracket
    : {!in_external_link && (caption_levels == 0 || input.LA(2)!= RIGHT_BRACKET)}? => RIGHT_BRACKET
    | NOWIKI_RIGHT_BRACKET
    ;
/* {!in_external_link}? => RIGHT_BRACKET | NOWIKI_RIGHT_BRACKET;*/

literal_left_bracket
    : LEFT_BRACKET | NOWIKI_LEFT_BRACKET /* Dodgy - doesn't really know whether it's a literal left bracket or not */
    ;

literal_colon
    : {!in_defined_term || text_levels > 1}? => COLON /* ;foo:blah is special. ;foo[blah|bl:ah] is not special. TODO: make sure this doesn't break namespaces in defs */
    | NOWIKI_COLON
    ;

// Only supports a single =. So no ==foo==blah==.
literal_equals
    : {!in_header || input.LA(2) != EQUALS}? => EQUALS
    ;

link_start
    : LEFT_BRACKET LEFT_BRACKET
    ;

link_end
    : RIGHT_BRACKET RIGHT_BRACKET
    ;

// TODO: apparently image captions always treat spaces literally...
printing_ws
    : {literal_whitespace && text_levels <= 1}? => (pre_ws) => pre_ws
    | ws -> NBSP
    ;

digits
    : digit+
    ;

pre_ws
    : pre_ws_elem+
    ;

pre_ws_elem
    : SPACE -> SPACE
    | NOWIKI ->
    | NOWIKI_OFF ->
    ;

ws  : (SPACE | NOWIKI! | NOWIKI_OFF!)+
    ;

//-----------------------------------------------------

/* ISBN magic links. Care will be needed to make sure they're treated as
   literals wherever they aren't supported. */
// Broken example: [http://ISBN 1234567890] - current parser does correctly. But does it matter?
ISBN_LINK
    : {!this.in_noparse}? =>
      // Parser.php l081, ~DIGIT is actually regexp \b
      ( (ISBN_LINK_ACTUAL ~DIGIT) => ISBN_LINK_ACTUAL
      | LETTER { $type=LETTERS; }
      )
    ;

fragment
ISBN_LINK_ACTUAL
    : 'ISBN' ' '+ ('97' ('8' | '9'))?
      ((' ' | '-')? '0'..'9')
      ((' ' | '-')? '0'..'9')
      ((' ' | '-')? '0'..'9')
      ((' ' | '-')? '0'..'9')
      ((' ' | '-')? '0'..'9')
      ((' ' | '-')? '0'..'9')
      ((' ' | '-')? '0'..'9')
      ((' ' | '-')? '0'..'9')
      ((' ' | '-')? '0'..'9')
      ((' ' | '-')? ('0'..'9' | 'X' | 'x'))
    ;

RFC_LINK
    : {!this.in_noparse}? =>
      ( (RFC_LINK_ACTUAL) => RFC_LINK_ACTUAL
      | LETTER { $type=LETTERS; }
      )
    ;

fragment
RFC_LINK_ACTUAL
    : 'RFC' ' '+ ('0'..'9')+
    ;

PMID_LINK
    : {!this.in_noparse}? =>
      ( (PMID_LINK_ACTUAL) => PMID_LINK_ACTUAL
      | LETTER { $type=LETTERS; }
      )
    ;

fragment
PMID_LINK_ACTUAL
    : 'PMID' ' '+ ('0'..'9')+
    ;

///////////// nowiki /////////////

// Opening tag: switches the lexer into no-parse mode. Any other '<' is
// re-typed as LT.
NOWIKI
    : {!this.in_noparse}? =>
      ( (NOWIKI_ACTUAL) => NOWIKI_ACTUAL { this.in_nowiki=true; this.in_noparse=true;}
      | '<' { $type=LT; }
      )
    ;

fragment
NOWIKI_ACTUAL
    : '<nowiki>'
    ;

// Closing tag: leaves no-parse mode again (in_noparse is cleared here,
// mirroring what PRE_CLOSE does for <pre>).
NOWIKI_OFF
    : {this.in_nowiki}? =>
      ( (NOWIKI_OFF_ACTUAL) => NOWIKI_OFF_ACTUAL { this.in_nowiki=false; this.in_noparse=false; }
      | '<' { $type=LT; }
      )
    ;

fragment
NOWIKI_OFF_ACTUAL
    : '</nowiki>'
    ;

/////////// pre ///////////
// Opening <pre> tag. Only recognized outside no-parse mode; on a match the
// lexer enters pre/no-parse mode. Any other '<' is re-typed as LT.
PRE_OPEN: {!this.in_noparse}? =>
	( (PRE_OPEN_ACTUAL) => PRE_OPEN_ACTUAL { this.in_pre=true; this.in_noparse=true;}
	| '<' { $type=LT; }
	);

// The tag text itself (the literal had been reduced to a bare line break).
fragment
PRE_OPEN_ACTUAL: '<pre>' ;

// Closing </pre> tag. Only recognized while in_pre is set; on a match the
// lexer leaves pre/no-parse mode. Any other '<' is re-typed as LT.
PRE_CLOSE: {this.in_pre}? =>
	( (PRE_CLOSE_ACTUAL) => PRE_CLOSE_ACTUAL { this.in_pre=false; this.in_noparse=false; }
	| '<' { $type=LT; }
	);

fragment
PRE_CLOSE_ACTUAL: '</pre>' ;
// Characters that html_dangerous wraps in ENTITY nodes.
LT
	: '<'
	;
GT
	: '>'
	;
AMP
	: '&'
	;
//NOWIKI : {!this.in_nowiki}? => '<' 'nowiki' >'{ this.in_nowiki=true;} /*->*/ ; /* doesn't render as anything in particular */
//NOWIKI_OFF: {this.in_nowiki}? => '</nowiki>'{ this.in_nowiki=false;} /*->*/ ;
/*
NOWIKI: {
	!this.in_nowiki && input.LA(1)=='<' && input.LA(2)=='n' && input.LA(3)=='o' && input.LA(4)=='w' && input.LA(5)=='i' && input.LA(6)=='k' && input.LA(7)=='i' && input.LA(8)=='>'
}? => '<nowiki>' { this.in_nowiki=true;};
*/
/*NOWIKI_OFF: {
	this.in_nowiki && input.LA(1)=='<' && input.LA(2)=='/' && input.LA(3)=='n' && input.LA(4)=='o' && input.LA(5)=='w' && input.LA(6)=='i' && input.LA(7)=='k' && input.LA(8)=='i' && input.LA(9)=='>'
}? => '</nowiki>' { this.in_nowiki=false;};
*/
/* NOWIKI : {!this.in_nowiki}? => LT 'nowiki' GT { this.in_nowiki=true;} ; NOWIKI_OFF: {this.in_nowiki}? => LT '/nowiki' GT { this.in_nowiki=false;};*/
/* Can't make them tokens because of stupid [[1]] */
/*
LINK_START: {!this.in_nowiki}? => '[[';
LINK_END: {!this.in_nowiki}? => ']]';
*/
// Pipe character; the gated predicate only allows it while in_noparse is false.
PIPE
	: {!this.in_noparse}? => '|'
	;
/*PRESPACE: { !this.in_nowiki && this.getCharPositionInLine()==0 }? => ' ';
SPACE: {!(!this.in_nowiki && this.getCharPositionInLine()==0)}? => ' '+;*/
// A single space per token (runs of spaces are assembled by parser rules).
SPACE       : ' ';

// Single-character punctuation tokens with no lexer-state gating.
DOT         : '.';
EQUALS      : '=';
UNDERSCORE  : '_';
HYPHEN      : '-';
COMMA       : ',';
OPEN_PAREN  : '(';
CLOSE_PAREN : ')';
SEMICOLON   : ';';
QUESTION    : '?';
/* It's a literal apostrophe if either the next character is *not* an
   apostrophe, or the next 5 characters *are* apostrophes. Yummy.
   (Inside nowiki/pre, i.e. while in_noparse is set, every apostrophe is
   literal.) */
// The character literal '\'' had lost its escaped quote and closing quote;
// it is restored here and in the predicates below.
fragment
APOS : '\'';

APOSTROPHE : {
	input.LA(1)=='\'' && (
		this.in_noparse || (
			input.LA(2)!='\'' ||
			input.LA(3)=='\'' && input.LA(4)=='\'' && input.LA(5)=='\'' && input.LA(6)=='\''
		)
	)
}? => APOS;

/* It's a swarm of apostrophes if it is not the case that this and the next
   five characters are apostrophes, and there are at least two, and we're not
   in a nowiki. */
APOSTROPHES : {
	!this.in_noparse && input.LA(1)=='\'' && !(
		input.LA(2)=='\'' && input.LA(3)=='\'' && input.LA(4)=='\'' && input.LA(5)=='\'' && input.LA(6)=='\''
	)
}? => APOS APOS+ ;
// Brackets and colon each have two token types for the same character:
// the plain one while in_noparse is false, the NOWIKI_ one while it is true.
LEFT_BRACKET
	: {!this.in_noparse}? => '['
	;
NOWIKI_LEFT_BRACKET
	: {this.in_noparse}? => '['
	;
RIGHT_BRACKET
	: {!this.in_noparse}? => ']'
	;
NOWIKI_RIGHT_BRACKET
	: {this.in_noparse}? => ']'
	;
COLON
	: {!this.in_noparse}? => ':'
	;
NOWIKI_COLON
	: {this.in_noparse}? => ':'
	;

// These are only produced while in_noparse is false; no NOWIKI_ variant is
// defined for them here.
HASH
	: {!this.in_noparse}? => '#'
	;
ASTERISK
	: {!this.in_noparse}? => '*'
	;
SLASH
	: {!this.in_noparse}? => '/'
	;
//fragment
//DIGIT: D0 | D1 | D2 | D3 | D4 | D5 | D6 | D7 | D8 | D9;
// One decimal digit per token. The rule had been folded onto the "//" line
// above it and was therefore commented out together with "digit".
DIGIT: '0'..'9';
//DIGITS: DIGIT+;
// Parser-level wrapper around the DIGIT token.
digit: DIGIT;
// A single ASCII letter; fragment only, never emitted as its own token.
fragment
LETTER
	: ('A'..'Z'|'a'..'z')
	;
//HTTP: 'h' 't' 't' 'p';

// A run of one or more ASCII letters.
LETTERS
	: LETTER+
	;
// {if ($text.equals("http") || $text.equals("ftp")) $type=PROTOCOL;};
//HTML : '<html>' .* '</html>' /*-> ^(HTML $x)*/ ;
// Line break: LF with an optional preceding CR. The action replaces the
// token text with a backslash, the letter n, and a newline character.
N
	: '\r'? '\n' { setText("\\n\n"); }
	;
//UNKNOWN : .;