Markup spec/OCaml
Appearance
(Redirected from MediaWiki lexer)
This page is obsolete. It is being retained for archival purposes. It may document extensions or features that are obsolete and/or no longer supported. Do not rely on the information here being up-to-date. |
Draft lexer for the MediaWiki markup implemented in OCaml.
Other wikis have benefited from using a real lexer instead of a series of regular expressions. For example, the Lexer from Wookee engine for UseModWiki (which is unfortunately written in Perl, not PHP) could be even quite usable if html attributes parsing was added. (The Parser, however, is questionable).
Later some proof-of-concept parser could be made. It should probably generate some AST, not (X)HTML directly, so it can be used in many useful bots too. Or have 2 modes - AST for bots and HTML for maximum performance.
Idea
[edit]- New Line
- End of File
- whitespace +
- (\* | # )+ (only at linestart)
- ={1,6} (at linestart or lineend)
- [ [ articlenamespecification]]
- [ [ articlenamespecification|
- ] ]
- '''''
- '''
- ''
- ----+
- ;
- :
- ~~~
- ~~~~
- urlspecification
- [ urlspecification
- ]
- <pre> anything_but_close_pre </pre>
- <nowiki> anything_but_close_nowiki </nowiki>
- <math> anything_but_close_math </math>
- <!-- anything_but_close_html_comment -->
- ISBN whitespace [0-9X-]+
- RFC whitespace \d+
- (one token per valid HTML tag)
- & entityspecification ;
- anyothercharacter
- variables (+ magic to parse "articlename" for variables to emulate current multipass parser)
Regular expressions used:
- articlenamespecification =Â ???
- entityspecification =Â ???
- urlspecification =Â ???
- whitespace = [\s|\t]
Code
[edit]Mini-lexer has 4 files lexer.mll
, util.ml
, tokens.ml
and main.ml
(which is kind of trivial parser). It's incomplete:
lexer.mll
[edit]{
open Tokens
open Util
}
let anything = ['a'-'z''A'-'Z''0'-'9''\\128'-'\\255']+ | ['\\000'-'\\255']
let anything_but_close_math =
( [^'<']
| '<' [^'/']
| "</" [^'m']
| "</m" [^'a']
| "</ma" [^'t']
| "</mat" [^'h']
| "</math" [^'>']
) +
let anything_but_close_pre =
( [^'<']
| '<' [^'/']
| "</" [^'p']
| "</p" [^'r']
| "</pr" [^'e']
| "</pre" [^'>']
) +
let anything_but_close_nowiki =
( [^'<']
| '<' [^'/']
| "</" [^'n']
| "</n" [^'o']
| "</no" [^'w']
| "</now" [^'i']
| "</nowi" [^'k']
| "</nowik" [^'i']
| "</nowiki" [^'>']
) +
let anything_but_close_comment =
( [^'-']
| '-' [^'-']
| "--" [^'>']
) +
let whitespace = [' ''\\t']
let digit = ['0'-'9']
let hexdigit = ['0'-'9''a'-'f''A'-'F']
let alphanum = ['a'-'z''A'-'Z''0'-'9']
let alpha = ['a'-'z''A'-'Z']
let entity_named = "&" alphanum + ";"
let entity_dec = "&#" digit + ";"
let entity_hex = "&#x" hexdigit + ";"
let html_space = [' ''\\t''\\r''\\n']
let html_space_opt = html_space *
let html_attr_unquoted = ['a'-'z' 'A'-'Z' '0'-'9' '_' ',' ':' '-'] +
let html_attr_arg = ('\\'' [^'\\''] * '\\'' | '"' [^'"'] * '"' | html_attr_unquoted)
let html_attr_name = alpha +
let html_attr = html_attr_name html_space * "=" html_space * html_attr_arg
let html_attrs = (html_space + html_attr) *
let html_tag_name = alpha alphanum *
let html_opening_tag = "<" html_tag_name html_attrs html_space_opt ">"
let html_closing_tag = "</" html_tag_name html_space_opt ">"
let html_closed_tag = "<" html_tag_name html_space_opt "/>"
let articlename = [' ''-''a'-'z''A'-'Z''0'-'9'':''_''+'',''.'' ''{''}''\\128'-'\\255']+
rule token = parse
'\\n'
{ NL }
| '\\r'
{ token lexbuf }
| whitespace +
{ SP (Lexing.lexeme lexbuf) }
| ['#''*'] +
{ LIST (Lexing.lexeme lexbuf) }
| "=" +
{ EQ (String.length (Lexing.lexeme lexbuf)) }
| '\\'' '\\'' +
{ Q (String.length (Lexing.lexeme lexbuf)) }
| "----" '-' *
{ HR }
| "<pre>" anything_but_close_pre "</pre>"
{ PRE (string_brange (Lexing.lexeme lexbuf) 5 6) }
| "<nowiki>" anything_but_close_nowiki "</nowiki>"
{ NOWIKI (string_brange (Lexing.lexeme lexbuf) 8 9) }
| "<math>" anything_but_close_math "</math>"
{ MATH (string_brange (Lexing.lexeme lexbuf) 6 7) }
| "<!--" anything_but_close_comment "-->"
{ token lexbuf }
| "[[" articlename "|"
{ LINK (string_brange (Lexing.lexeme lexbuf) 2 1) }
| "[[de:" articlename "]]"
{ LINK_INTERWIKI ("de", string_brange (Lexing.lexeme lexbuf) 5 2) }
| "[[en:" articlename "]]"
{ LINK_INTERWIKI ("en", string_brange (Lexing.lexeme lexbuf) 5 2) }
| "[[eo:" articlename "]]"
{ LINK_INTERWIKI ("eo", string_brange (Lexing.lexeme lexbuf) 5 2) }
| "[[fr:" articlename "]]"
{ LINK_INTERWIKI ("fr", string_brange (Lexing.lexeme lexbuf) 5 2) }
| "[[pl:" articlename "]]"
{ LINK_INTERWIKI ("pl", string_brange (Lexing.lexeme lexbuf) 5 2) }
| "[[" articlename "]]"
{ LINK_DEFAULT (string_brange (Lexing.lexeme lexbuf) 2 2) }
| "]]"
{ LINK_CLOSE }
| "{{CURRENTMONTH}}"
{ LEAF VAR_CURRENTMONTH }
| "{{CURRENTDAY}}"
{ LEAF VAR_CURRENTDAY }
| "{{CURRENTYEAR}}"
{ LEAF VAR_CURRENTYEAR }
| "{{CURRENTDAYNAME}}"
{ LEAF VAR_CURRENTDAYNAME }
| "{{CURRENTTIME}}"
{ LEAF VAR_CURRENTTIME }
| "{{NUMBEROFARTICLES}}"
{ LEAF VAR_NUMBEROFARTICLES }
| ":"
{ COLON }
| ";"
{ SEMI }
| "<nowiki>~~~</nowiki>"
{ LEAF T3 }
| "<nowiki>~~~~</nowiki>"
{ LEAF T4 }
| "RFC" " " ? digit +
{ LEAF (RFC) }
| "ISBN" " " ? ['0'-'9''X''-'] +
{ LEAF (ISBN) }
| entity_dec
{ LEAF (ENT_DEC (Lexing.lexeme lexbuf)) }
| entity_hex
{ LEAF (ENT_HEX (Lexing.lexeme lexbuf)) }
| entity_named
{ LEAF (ENT_NAMED (Lexing.lexeme lexbuf)) }
| html_opening_tag
{ parse_html_opening_tag (Lexing.lexeme lexbuf) }
| html_closing_tag
{ parse_html_closing_tag (Lexing.lexeme lexbuf) }
| html_closed_tag
{ parse_html_closed_tag (Lexing.lexeme lexbuf) }
| anything
{ LEAF (LIT (Lexing.lexeme lexbuf)) }
| eof
{ EOF }
(*
urls and [urls]
HTML and entities - of course there should be parsing and validation here
articlename - needs to parse variables inside, needs to check what
is allowed and what is not
some unicode magic ?
lexeme_length
complete literal match accelerator
interwiki magic
*)
tokens.ml
[edit]type t_leaf = T3 | T4 | LIT of string
| ENT_DEC of string | RFC | ISBN | ENT_HEX of string | ENT_NAMED of string
| VAR_CURRENTMONTH | VAR_CURRENTDAY | VAR_CURRENTYEAR
| VAR_CURRENTDAYNAME | VAR_CURRENTTIME | VAR_NUMBEROFARTICLES
type t = NL | SP of string | LIST of string | EQ of int | Q of int | HR | PRE of string
| NOWIKI of string | MATH of string | LINK of string
| LINK_INTERWIKI of string * string | LINK_DEFAULT of string | LINK_CLOSE
| COLON | SEMI | LEAF of t_leaf
| O_P | O_H1 | O_H2 | O_H3 | O_H4 | O_H5 | O_H6
| C_P | C_H1 | C_H2 | C_H3 | C_H4 | C_H5 | C_H6
| O_UL | O_OL | O_LI | O_TABLE | O_TR | O_TH | O_TD
| C_UL | C_OL | C_LI | C_TABLE | C_TR | C_TH | C_TD
| O_B | O_I | O_EM | O_STRONG
| C_B | C_I | C_EM | C_STRONG
| O_U | O_BIG | O_SMALL | O_SUB | O_SUP
| C_U | C_BIG | C_SMALL | C_SUB | C_SUP
| O_CITE | O_CODE | O_S | O_STRIKE | O_TT | O_VAR
| C_CITE | C_CODE | C_S | C_STRIKE | C_TT | C_VAR
| O_DIV | O_CENTER | O_BLOCKQUOTE | O_CAPTION
| C_DIV | C_CENTER | C_BLOCKQUOTE | C_CAPTION
| O_RUBY | O_RT | O_RB | O_RP | O_DT | O_DD
| C_RUBY | C_RT | C_RB | C_RP | C_DT | C_DD
| Z_BR | Z_HR | Z_TR | Z_TH | Z_TD
| EOF
let find_eotn str n0 =
let rec find_eotn_aux n =
try
match str.[n] with
| '0'..'9'
| 'a'..'z'
| 'A'..'Z' -> find_eotn_aux (n+1)
| _ -> n
with _ -> n
in find_eotn_aux n0
let parse_html_opening_tag str =
let l = String.length str
in let eotn = find_eotn str 1
in let tn = String.lowercase (String.sub str 1 (eotn-1))
in match tn with
"p" -> O_P
| "h1" -> O_H1
| "h2" -> O_H2
| "h3" -> O_H3
| "h4" -> O_H4
| "h5" -> O_H5
| "h6" -> O_H6
| "ul" -> O_UL
| "ol" -> O_OL
| "li" -> O_LI
| "table" -> O_TABLE
| "tr" -> O_TR
| "th" -> O_TH
| "td" -> O_TD
| "b" -> O_B
| "i" -> O_I
| "em" -> O_EM
| "strong" -> O_STRONG
| "u" -> O_U
| "big" -> O_BIG
| "small" -> O_SMALL
| "sub" -> O_SUB
| "sup" -> O_SUP
| "cite" -> O_CITE
| "code" -> O_CODE
| "s" -> O_S
| "strike" -> O_STRIKE
| "tt" -> O_TT
| "var" -> O_VAR
| "div" -> O_DIV
| "center" -> O_CENTER
| "blockquote" -> O_BLOCKQUOTE
| "caption" -> O_CAPTION
| "ruby" -> O_RUBY
| "rt" -> O_RT
| "rb" -> O_RB
| "rp" -> O_RP
| "dt" -> O_DT
| "dd" -> O_DD
| _ -> LEAF (LIT str)
let parse_html_closing_tag str =
let l = String.length str
in let eotn = find_eotn str 2
in let tn = String.lowercase (String.sub str 2 (eotn-2))
in match tn with
"p" -> C_P
| "h1" -> C_H1
| "h2" -> C_H2
| "h3" -> C_H3
| "h4" -> C_H4
| "h5" -> C_H5
| "h6" -> C_H6
| "ul" -> C_UL
| "ol" -> C_OL
| "li" -> C_LI
| "table" -> C_TABLE
| "tr" -> C_TR
| "th" -> C_TH
| "td" -> C_TD
| "b" -> C_B
| "i" -> C_I
| "em" -> C_EM
| "strong" -> C_STRONG
| "u" -> C_U
| "big" -> C_BIG
| "small" -> C_SMALL
| "sub" -> C_SUB
| "sup" -> C_SUP
| "cite" -> C_CITE
| "code" -> C_CODE
| "s" -> C_S
| "strike" -> C_STRIKE
| "tt" -> C_TT
| "var" -> C_VAR
| "div" -> C_DIV
| "center" -> C_CENTER
| "blockquote" -> C_BLOCKQUOTE
| "caption" -> C_CAPTION
| "ruby" -> C_RUBY
| "rt" -> C_RT
| "rb" -> C_RB
| "rp" -> C_RP
| "dt" -> C_DT
| "dd" -> C_DD
| _ -> LEAF (LIT str)
let parse_html_closed_tag str =
let l = String.length str
in let eotn = find_eotn str 1
in let tn = String.lowercase (String.sub str 1 (eotn-1))
in match tn with
"br" -> Z_BR (* validate that attrs is empty *)
| "hr" -> Z_HR (* validate that attrs is empty *)
| "tr" -> Z_TR (* validate attrs *)
| "th" -> Z_TH (* validate attrs *)
| "td" -> Z_TD (* validate attrs *)
| _ -> LEAF (LIT str)
util.ml
[edit]let string_brange str s e =
let n = String.length str
in String.sub str s (n - s - e)
main.ml
[edit]open Tokens
open Printf
let string_of_token = function
LEAF (LIT c) -> "lit " ^ c
| LEAF VAR_CURRENTMONTH -> "{{CURRENTMONTH}}"
| LEAF VAR_CURRENTDAY -> "{{CURRENTDAY}}"
| LEAF VAR_CURRENTYEAR -> "{{CURRENTYEAR}}"
| LEAF VAR_CURRENTDAYNAME -> "{{CURRENTDAYTIME}}"
| LEAF VAR_CURRENTTIME -> "{{CURRENTTIME}}"
| LEAF VAR_NUMBEROFARTICLES -> "{{NUMBEROFARTICLES}}"
| LEAF (ENT_DEC s) -> "&dec; " ^ s
| LEAF (ENT_HEX s) -> "&hex;" ^ s
| LEAF (ENT_NAMED s) -> "&named;" ^ s
| NL -> "\\n"
| SP _ -> "sp"
| EOF -> "eof"
| LIST s -> "list " ^ s
| EQ i -> "eq " ^ (string_of_int i)
| Q i -> "q " ^ (string_of_int i)
| HR -> "----"
| PRE _ -> "<pre>?</pre>"
| NOWIKI _ -> "<nowiki>?</nowiki>"
| MATH _ -> "<math>?</math>"
| LINK_INTERWIKI (w,s) -> "[[" ^ w ^ ":" ^ s ^ "]]"
| LINK s -> "[[" ^ s ^ "|"
| LINK_DEFAULT s -> "[[" ^ s ^ "]]"
| LINK_CLOSE -> "]]"
| COLON -> ":"
| SEMI -> ";"
| LEAF T3 -> "<nowiki>~~~</nowiki>"
| LEAF T4 -> "<nowiki>~~~~</nowiki>"
| Z_BR -> "<br/>"
| Z_HR -> "<hr/>"
| Z_TR -> "<tr/>"
| Z_TD -> "<td/>"
| Z_TH -> "<th/>"
| O_B -> "<b>"
| O_I -> "<i>"
| O_P -> "<p>"
| O_U -> "<u>"
| O_S -> "<s>"
| O_H1 -> "<h1>"
| O_H2 -> "<h2>"
| O_H3 -> "<h3>"
| O_H4 -> "<h4>"
| O_H5 -> "<h5>"
| O_H6 -> "<h6>"
| C_B -> "</b>"
| C_I -> "</i>"
| C_P -> "</p>"
| C_U -> "</u>"
| C_S -> "</s>"
| C_H1 -> "</h1>"
| C_H2 -> "</h2>"
| C_H3 -> "</h3>"
| C_H4 -> "</h4>"
| C_H5 -> "</h5>"
| C_H6 -> "</h6>"
| LEAF (RFC) -> "rfc"
| LEAF (ISBN) -> "isbn"
| O_UL -> "<ul>"
| O_OL -> "<ol>"
| O_LI -> "<li>"
| O_TR -> "<tr>"
| O_TH -> "<th>"
| O_TD -> "<td>"
| O_EM -> "<em>"
| O_TT -> "<tt>"
| O_RT -> "<rt>"
| O_RB -> "<rb>"
| O_RP -> "<rp>"
| O_DD -> "<dd>"
| O_DT -> "<dl>"
| C_UL -> "</ul>"
| C_OL -> "</ol>"
| C_LI -> "</li>"
| C_TR -> "</tr>"
| C_TH -> "</th>"
| C_TD -> "</td>"
| C_EM -> "</em>"
| C_TT -> "</tt>"
| C_RT -> "</rt>"
| C_RB -> "</rb>"
| C_RP -> "</rp>"
| C_DD -> "</dd>"
| C_DT -> "</dl>"
| O_BIG -> "<big>"
| O_SUP -> "<sup>"
| O_SUB -> "<sub>"
| O_VAR -> "<var>"
| O_DIV -> "<div>"
| C_BIG -> "</big>"
| C_SUP -> "</sup>"
| C_SUB -> "</sub>"
| C_VAR -> "</var>"
| C_DIV -> "</div>"
| O_CODE -> "<code>"
| O_CITE -> "<cite>"
| O_RUBY -> "<ruby>"
| C_CODE -> "</code>"
| C_CITE -> "</cite>"
| C_RUBY -> "</ruby>"
| O_SMALL -> "<small>"
| O_STRIKE -> "<strike>"
| O_STRONG -> "<strong>"
| O_CENTER -> "<center>"
| O_CAPTION -> "<caption>"
| O_BLOCKQUOTE -> "<blockquote>"
| C_SMALL -> "</small>"
| C_STRIKE -> "</strike>"
| C_STRONG -> "</strong>"
| C_CENTER -> "</center>"
| C_CAPTION -> "</caption>"
| C_BLOCKQUOTE -> "</blockquote>"
| O_TABLE -> "<table>"
| C_TABLE -> "</table>"
let lexbuf = Lexing.from_channel stdin
(*
let rec print_lexemes () =
let t = Lexer.token lexbuf in
print_string (string_of_token t ^ "\\n");
if t <> EOF
then print_lexemes ()
let _ = print_lexemes ()
*)
let get_token () =
Lexer.token lexbuf
let output_header_interwiki (i,a) = printf "Header interwiki: %s:%s\\n" i a
let output_leaf = function
LIT l -> printf "Lit: %s\\n" l
| T3 -> printf "<nowiki>~~~</nowiki>\\n"
| T4 -> printf "<nowiki>~~~~</nowiki>\\n"
| ENT_DEC e -> printf "Ent dec: %s\\n" e
| RFC -> printf "RFC\\n"
| ISBN -> printf "ISBN\\n"
| ENT_HEX e -> printf "Ent hex: %s\\n" e
| ENT_NAMED e -> printf "Ent nam: %s\\n" e
| VAR_CURRENTMONTH -> printf "{{CURRENTMONTH}}"
| VAR_CURRENTDAY -> printf "{{CURRENTDAY}}"
| VAR_CURRENTYEAR -> printf "{{CURRENTYEAR}}"
| VAR_CURRENTDAYNAME -> printf "{{CURRENTDAYTIME}}"
| VAR_CURRENTTIME -> printf "{{CURRENTTIME}}"
| VAR_NUMBEROFARTICLES -> printf "{{NUMBEROFARTICLES}}"
let output_convert = function
SP s -> printf "sp %s\\n" s
| COLON -> printf ":\\n"
| SEMI -> printf ";\\n"
| EQ i -> printf "= * %d\\n" i
| _ -> failwith "wrong convert"
type line_type = LineNormal
| LinePre of string
| LineEQTry of int
| LineList of string
let rec parse_header sp t =
match sp,t with
Some s,SP s' -> parse_header (Some (s^s')) (get_token ())
| None,SP s -> parse_header (Some s) (get_token ())
| _,NL -> parse_header None (get_token ())
| _,LINK_INTERWIKI (i,a) -> output_header_interwiki (i,a); parse_header None (get_token ())
| None,_ -> parse_line_start t
| Some s,_ -> parse_line_cnt (LinePre s) t
and parse_line_start t =
match t with
SP s -> parse_line_cnt (LinePre s) (get_token ())
| LIST s -> parse_line_cnt (LineList s) (get_token ())
| EQ s -> parse_line_cnt (LineEQTry s) (get_token ())
| NL -> parse_line_start (get_token ())
| _ -> parse_line_cnt LineNormal t
and parse_line_cnt ltyp t =
match t with
LEAF l -> output_leaf l; parse_line_cnt ltyp (get_token ())
| LIST _ -> output_convert t; parse_line_cnt ltyp (get_token ())
| COLON -> output_convert t; parse_line_cnt ltyp (get_token ())
| SEMI -> output_convert t; parse_line_cnt ltyp (get_token ())
| SP _ -> ((match ltyp with
LinePre _ -> output_convert t
| _ -> output_leaf (LIT " ")
); parse_line_cnt ltyp (get_token ()))
| EQ i -> ((match ltyp with
LineEQTry j when i = j -> parse_line_cnt_tryeqf i (get_token ())
| _ -> output_convert t; parse_line_cnt ltyp (get_token ())
); parse_line_cnt ltyp (get_token ()))
and parse_line_cnt_tryeqf i t =
match t with
SP _ -> parse_line_cnt_tryeqf i (get_token ())
| NL -> (* a real header line !!! *) parse_line_start (get_token ())
| _ -> output_convert (EQ i); parse_line_cnt (LineEQTry i) t
let _ = parse_header None (get_token ())