/* $Id: sgml.l,v 1.9 1996/02/07 15:32:28 connolly Exp $ */
/* sgml.l -- a lexical analyzer for Basic+/- SGML Documents
 * See: "A Lexical Analyzer for HTML and Basic SGML"
 */

/*
 * NOTE: We assume the locale used by lex and the C compiler
 * agrees with ISO-646-IRV; for example: '1' == 0x31.
 */

/*
 * Layout note: lex requires one name definition per line in this
 * section, and a comment must start in column 0 on a line of its own.
 */

/* Figure 1 -- Character Classes: Abstract Syntax */

Digit           [0-9]
LCLetter        [a-z]
/* NOTE(review): ISO 8879 appears to list '+' (not '_') among the Special
 * characters; Special is not referenced by any definition in this
 * section, so it is left untouched -- confirm before changing. */
Special         ['()_,\-\./:=?]
UCLetter        [A-Z]

/* Figure 2 -- Character Classes: Concrete Syntax */

LCNMCHAR        [\.-]
/* LCNMSTRT     [] */
UCNMCHAR        [\.-]
/* UCNMSTRT     [] */
/* @# hmmm. sgml spec says \015 */
RE              \n
/* @# hmmm. sgml spec says \012 */
RS              \r
SEPCHAR         \011
SPACE           \040

/* Figure 3 -- Reference Delimiter Set: General */

/*
 * The delimiter strings are those of the SGML reference concrete syntax.
 * Every delimiter named by another definition or by a rule must be
 * defined here: LIT and LITA are used by `literal`, REFC by
 * `reference_end`, and MDC, MSC, PERO, STAGO and TAGC by the rules
 * section.  An empty string ("") is not a usable pattern, so ETAGO, MDO
 * and PIO carry their real delimiter text.
 */
COM             "--"
CRO             "&#"
DSC             "]"
DSO             "["
ERO             "&"
ETAGO           "</"
LIT             \"
LITA            "'"
MDC             ">"
MDO             "<!"
MSC             "]]"
NET             "/"
PERO            "%"
PIC             ">"
PIO             "<?"
REFC            ";"
STAGO           "<"
TAGC            ">"

/* 9.2.1 SGML Character */

/*name_start_character {LCLetter}|{UCLetter}|{LCNMSTRT}|{UCNMSTRT}*/
name_start_character {LCLetter}|{UCLetter}
name_character  {name_start_character}|{Digit}|{LCNMCHAR}|{UCNMCHAR}

/* 9.3 Name */

name            {name_start_character}{name_character}*
number          {Digit}+
number_token    {Digit}{name_character}*
name_token      {name_character}+

/* 6.2.1 Space */
s               {SPACE}|{RE}|{RS}|{SEPCHAR}
ps              ({SPACE}|{RE}|{RS}|{SEPCHAR})+

/* trailing white space */
ws              ({SPACE}|{RE}|{RS}|{SEPCHAR})*

/* 9.4.5 Reference End */
reference_end   ({REFC}|{RE})

/*
 * 10.1.2 Parameter Literal
 * 7.9.3  Attribute Value Literal
 * (we leave recognition of character references and entity references,
 * and whitespace compression to further processing)
 *
 * @# should split this into minimum literal, parameter literal,
 * @# and attribute value literal.
 */
literal         ({LIT}[^\"]*{LIT})|({LITA}[^\']*{LITA})

/* 9.6.1 Recognition modes */

/*
 * Recognition modes are represented here by start conditions.
 * The default start condition, INITIAL, represents the
 * CON recognition mode. This condition is used to detect markup
 * while parsing normal data characters (mixed content).
* * The CDATA start condition represents the CON recognition * mode with the restriction that only end-tags are recognized, * as in elements with CDATA declared content. * (@# no way to activate it yet: need hook to parser.) * * The TAG recognition mode is split into two start conditions: * ATTR, for recognizing attribute value list sub-tokens in * start-tags, and TAG for recognizing the TAGC (">") delimiter * in end-tags. * * The MD start condition is used in markup declarations. The COM * start condition is used for comment declarations. * * The DS condition is an approximation of the declaration subset * recognition mode in SGML. As we only use this condition after signalling * an error, it is merely a recovery device. * * The CXT, LIT, PI, and REF recognition modes are not separated out * as start conditions, but handled within the rules of other start * conditions. The GRP mode is not represented here. */ /* EXCERPT ACTIONS: START */ /* %x CON == INITIAL */ %x CDATA %x TAG %x ATTR %x ATTRVAL %x NETDATA %x ENDTAG /* this is only to be permissive with bad end-tags: */ %x JUNKTAG %x MD %x COM %x DS /* EXCERPT ACTIONS: STOP */ %% int *types = NULL; char **strings = NULL; size_t *lengths = NULL; int qty = 0; /* * See sgml_lex.c for description of * ADD, CALLBACK, ERROR, TOK macros. */ /* * 9.6 Delimiter Recognition and * Figure 3 -- Reference Delimiter Set: General * * This is organized by recognition mode: first CON, then TAG, * MD, and DS. Within a mode, the rules are ordered alphabetically * by delimiter name. */ /* NOTE(review): exclusive start conditions are declared with %x above, yet no rule visible in this section carries a start-condition prefix; the prefixes (and possibly whole rules) look to have been lost in extraction -- confirm against the upstream sgml.l before building. */ /* &#60; -- numeric character reference */ {CRO}{number}{reference_end}? { reference(yytext, yyleng, SGML_NUMCHARREF, 0, l, tokF, tokObj); } /* &#60xyz. -- syntax error */ {CRO}{number_token}{reference_end}? { ERROR(SGML_ERROR, "bad character in character reference", yytext, yyleng); } /* &#SPACE; -- named character reference. */ {CRO}{name}{reference_end}? { if (l->restrict) { if (l->compat) /* old-style user agents use it as data. 
*/ TOK(tokF, tokObj, SGML_DATA, yytext, yyleng); else{ ERROR(SGML_LIMITATION, "named character references are not supported", yytext, yyleng); } }else{ reference(yytext, yyleng, SGML_NAMECHARREF, l->normalize, l, tokF, tokObj); } } /* &amp; -- general entity reference */ {ERO}{name}{reference_end}? { reference(yytext, yyleng, SGML_GEREF, 0, l, tokF, tokObj); } /* NOTE(review): the comment opened immediately below is not closed until the words "end tag", so the unclosed-end-tag rule ({ETAGO}{name}?{ws} with trailing context {STAGO}) is currently commented out; the comment's original text seems to be missing -- confirm against upstream. */ /* {ETAGO}{name}?{ws}/{STAGO} { if (l->restrict){ ERROR(SGML_LIMITATION, "unclosed end tag not supported", yytext, yyleng); }else{ ADDCASE(SGML_END, yytext, yyleng); CALLBACK(tokF,tokObj); } } /* -- end tag */ {ETAGO}{name}{ws} { ADDCASE(SGML_END, yytext, yyleng); if (l->restrict && l->compat) { BEGIN(JUNKTAG); }else { BEGIN(ENDTAG); } } /* @# HACK for XMP, LISTING? Date: Fri, 19 Jan 1996 23:13:43 -0800 Message-Id: To: www-html@w3.org Subject: Re: Daniel Connolly's SGML Lex Specification */ /* @@ all these are recognized in NETDATA too. Need a stack? */ /* -- empty end tag */ {ETAGO}{TAGC} { if (l->restrict) { if (l->compat) TOK(tokF, tokObj, SGML_DATA, yytext, yyleng); else ERROR(SGML_LIMITATION, "empty end tag not supported", yytext, yyleng); }else{ TOK2(tokF, tokObj, SGML_START, yytext, yyleng-1, SGML_TAGC, yytext + yyleng - 1, 1); } } /* -- empty comment */ {MDO}{MDC} { TOK(auxF, auxObj, SGML_MARKUP_DECL, yytext, yyleng); } /* -- comment */ {COM}([^-]|-[^-])*{COM}{ws} { ADD(SGML_COMMENT, yytext, yyleng); } /* -- parameter entity reference */ {PERO}{name}{reference_end}?{ws} { if (l->restrict) { ERROR(SGML_LIMITATION, "parameter entity reference not supported", yytext, yyleng); } ADD(SGML_PERO, yytext, yyleng); } /* -- parameter entity definition */ {PERO}{ps} { if (l->restrict) { ERROR(SGML_LIMITATION, "parameter entity definition not supported", yytext, yyleng); } ADD(SGML_PERO, yytext, yyleng); } /* The limited set of markup declarations we're interested in * use only numbers, names, and literals. 
*/ {number}{ws} { ADD(SGML_NUMBER, yytext, yyleng); } {name}{ws} { ADDCASE(SGML_NAME, yytext, yyleng); } {number_token}{ws} { ADD(SGML_NUMTOKEN, yytext, yyleng); } {name_token}{ws} { ADDCASE(SGML_NMTOKEN, yytext, yyleng); } {literal}{ws} { ADD(SGML_LITERAL, yytext, yyleng); } /* MDC ends the declaration: the closing delimiter is added as SGML_TAGC, the aux callback is invoked (presumably delivering the tokens gathered by ADD -- see sgml_lex.c), and scanning returns to INITIAL. */ {MDC} { ADD(SGML_TAGC, yytext, yyleng); CALLBACK(auxF, auxObj); BEGIN(INITIAL); } /* other constructs are errors. NOTE(review): the comment opened right after this one is not closed until "10.4 Marked Section Declaration", so the {DSO} rule and the catch-all "." error rule are currently commented out; the comment's original text seems to be missing -- confirm against the upstream sgml.l. */ /* {DSO} { if(l->restrict){ ERROR(SGML_LIMITATION, "declaration subset not supported", yytext, yyleng); } ADD(SGML_DSO, yytext, yyleng); CALLBACK(auxF, auxObj); BEGIN(DS); } . { ERROR(SGML_ERROR, "illegal character in markup declaration", yytext, yyleng); } /* 10.4 Marked Section Declaration */ /* 11.1 Document Type Declaration Subset */ /* Our parsing of declaration subsets is just an error recovery technique: * we attempt to skip them, but we may be fooled by "]"s * inside comments, etc. */ /* ]]> -- marked section end */ {MSC}{MDC} { BEGIN(INITIAL); } /* ] -- declaration subset close. NOTE(review): this switches to COM, which the recognition-mode notes earlier in this file describe as the comment-declaration condition; MD may have been intended so that the closing MDC is recognized -- confirm. */ {DSC} { BEGIN(COM); } /* Everything up to the next "]" is reported as skipped. */ [^\]]+ { ERROR(SGML_LIMITATION, "declaration subset: skipping", yytext, yyleng); } /* EXCERPT ACTIONS: STOP */ %%