/* $Id: lexer_v3.l,v 1.174 2006/11/26 19:56:12 pi Exp $ */

%{
/*
 * NAME
 *   lexer_v3.l -- bogofilter's lexical analyzer for message headers
 *
 * $Id above reflects corresponding standard lexer.
 *
 */

/* 03/19/2004:
** Allowing short tokens, disallowing ! at the end of a word,
** removing $-rule and similar stuff. Numbers allowed.
** All characters in a TOKEN are now the same: MID_CHAR
** IP addresses are no longer collected, so block_on_subnets won't work.
*/

#include "common.h"

#include <ctype.h>      /* isspace(), isdigit(), isxdigit(), isprint(), tolower() */
#include <stdlib.h>     /* numeric conversion used by html_char() */

#include "buff.h"
#include "charset.h"
#include "lexer.h"
#include "mime.h"       /* for mime_*() */
#include "msgcounts.h"
#include "textblock.h"
#include "token.h"
#include "xmalloc.h"

/* The generated scanner entry point returns token_t and takes no arguments. */
#define YY_DECL token_t yylex(void)
YY_DECL;                /* declare function */

/* Route the scanner's input through yyinput() instead of reading the stream directly. */
#define YY_INPUT(buf,result,max_size) result = yyinput((byte *)buf, result, max_size)
#define YY_EXIT_FAILURE EX_ERROR

/* Anything the generated code writes to "stderr" goes to dbgout instead. */
#undef  stderr
#define stderr  dbgout  /* for debug & -D options */

/* Number of newlines consumed so far; reset by lexer_v3_init(). */
static int lineno;

/* Pack a flex version triple into one integer usable in #if. */
#define FLEX_VER(MAJ, MIN, SUB) ((MAJ) * 1000 + (MIN) * 100 + (SUB))
#ifndef YY_FLEX_SUBMINOR_VERSION
#define FLEX_VERSION_BF FLEX_VER(YY_FLEX_MAJOR_VERSION, YY_FLEX_MINOR_VERSION, 0)
#else
#define FLEX_VERSION_BF FLEX_VER(YY_FLEX_MAJOR_VERSION, YY_FLEX_MINOR_VERSION, YY_FLEX_SUBMINOR_VERSION)
#endif

/* Define yylineno here for every flex release except 2.5.31. */
#if     FLEX_VERSION_BF != 2531
int yylineno;
#endif

/* Function Prototypes */

static word_t *yy_text(void);
static void html_char(void);
static void html_reorder(void);
static void url_char(void);
static void skip_to(char chr);
static void yy_unput(const byte *txt, uint len);

char yy_get_state(void);
void yy_set_state_initial(void);

/* Function Definitions */

/*
 * Wrap the current match (yytext, yyleng) in a word_t.
 *
 * Returns a pointer to a single function-static word_t, so every call
 * overwrites the previous result, and the text member aliases yytext
 * rather than owning a copy.
 */
static word_t *yy_text(void)
{
    static word_t yyt;

    yyt.text = (byte *)yytext;
    yyt.leng = yyleng;
    return &yyt;
}

%}

%option warn
%option nodebug debug
%option align caseless 8bit
%option never-interactive
%option noreject noyywrap

UINT8           ([01]?[0-9]?[0-9]|2([0-4][0-9]|5[0-5]))
BCHARSNOSPC     [[:alnum:]()+_,-./:=?#\']
BCHARS          [[:alnum:]()+_,-./:=?#\' ]
MIME_BOUNDARY   {BCHARS}*{BCHARSNOSPC}

/* NOTE(review): ID is defined as a bare '?', a repetition operator with
   nothing to repeat, so the pattern looks truncated.  Presumably it once
   described a queue id optionally wrapped in angle brackets -- restore it
   from version control and confirm before building. */
ID              ?
CHARSET         [[:alnum:]-]+
VERPID          [[:alnum:]#-]+[[:digit:]]+[[:alnum:]#-]+
MTYPE           [[:blank:]]*[[:alnum:]/-]*

NUM             [[:digit:]]+
NUM_NUM         \ {NUM}\ {NUM}

MSG_COUNT       ^\".MSG_COUNT\"

MID_CHAR        [^[:blank:][:cntrl:]:$*<>;=()&%#@+|/\\{}^\"?,\[\]._~\'\`!\-]
BOGOLEX_CHAR    [^[:blank:][:cntrl:] $ <>;=()&%#@+|/\\{}^\"?,\[\]]

TOKEN           {MID_CHAR}+

/*  RFC2047.2
    encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
    charset = token    ; see section 3
    encoding = token   ; see section 4
    token = 1*<Any CHAR except SPACE, CTLs, and especials>
    especials = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "
                <"> / "/" / "[" / "]" / "?" / "." / "="
    encoded-text = 1*<Any printable ASCII character other than "?"
                      or SPACE>
                   ; (but see "Use of encoded-words in message
                   ; headers", section 5)
*/

/* 09/01/03
  Using "[^?]" in the pattern and validating the charset in 'C'
  reduces executable size by approx 290k.
  new: ENCODED_WORD =\?{CHARSET}\?[bq]\?[^?]*\?=
  old: ENCODED_WORD =\?{CHARSET}\?(b\?{BASE64}\|q\?{QP})\?=
       BASE64 [0-9a-zA-Z/+=]+
       QP     [!->@-~]+
*/

WHITESPACE      [[:blank:]\n]
NOTWHITESPACE   [^[:blank:]\n]

HTML_ENCODING   "&#"x?[[:xdigit:]]+";"
URL_ENCODING    "%"[[:xdigit:]][[:xdigit:]]

ENCODED_WORD    =\?{CHARSET}\?[bq]\?[^?\n]*\?=
ENCODED_TOKEN   ({MID_CHAR}+)?({ENCODED_WORD}{WHITESPACE}+)*{ENCODED_WORD}

HTMLTOKEN       "<"[^>]*">"

/*
 * Generally, there are some html tags that cause an "eyebreak" and some
 * that do not.  For example, the "P" tag or the "BR" tag cause a break,
 * and can be interpreted in place, while the B (bold) tag does not.
 * No close tags seem to cause a break.
 * Comments do not.  This is an attempt to make an exhaustive list of
 * tags that cause an "eyebreak".  When the exit tag also causes a break,
 * we include the /?.  I believe this to be a complete list of tags that
 * can cause a formatting break.
 */

HBREAK          p|br|li|h[1-6]|hr|title|table|center|dd|dt|iframe|img|input|select|td|textarea|th|\/?(div|blockquote|pre|dir|dl|fieldset|legend|form|menu|ol|ul)
BREAKHTML       "<"({HBREAK}({WHITESPACE}[^>]*|""))">"

VERP            {TOKEN}-{VERPID}-{TOKEN}={TOKEN}@{TOKEN}

%s TEXT HTML BOGO_LEX
%s HTOKEN HDISCARD SCOMMENT LCOMMENT

%%

        /* NOTE(review): none of the rules below carries a start-condition
           prefix, although TEXT, HTML, BOGO_LEX, HTOKEN, HDISCARD, SCOMMENT
           and LCOMMENT are declared above and entered with BEGIN.  As
           written, the first bare \n rule shadows the two later \n rules,
           so they can never fire.  The prefixes appear to have been lost;
           restore them from version control before building. */

{MSG_COUNT}{NUM_NUM}    { /* only the very first line may switch the scanner to BOGO_LEX */
                          if (lineno == 0) {
                              BEGIN BOGO_LEX;
                              set_msg_counts_from_str(strchr(yytext, ' ') + 1);
                          }
                          return MSG_COUNT_LINE;
                        }
^\"{BOGOLEX_CHAR}+\"{NUM_NUM}$  { return BOGO_LEX_LINE; }
\n                      { lineno += 1; }

{ENCODED_TOKEN}         { /* decode the encoded words and rescan the decoded text */
                          word_t *raw = yy_text();
                          word_t *txt = text_decode(raw);
                          yy_unput(txt->text, txt->leng);
                        }

^(To|CC|From|Return-Path|Subject|Received):     { set_tag(yytext); }
^Content-(Transfer-Encoding|Type|Disposition):{MTYPE}   { mime_content(yy_text()); skip_to(':'); return TOKEN; }
^(Delivery-)?Date:.*    { return HEADKEY; }
^Resent-Message-ID:.*   { return HEADKEY; }
^Message-ID:.*          { /* save token for logging */
                          int off = 11;         /* length of "Message-ID:" */
                          /* check the bound before looking at the byte */
                          while (off < yyleng && isspace((unsigned char)yytext[off]))
                              off++;
                          set_msg_id((unsigned char *)(yytext+off), yyleng-off);
                          return HEADKEY;
                        }
^(In-Reply-To|References):.*    { return HEADKEY; }
^Status:.*              /* ignore */
boundary=[ ]*\"?{MIME_BOUNDARY}\"?      { mime_boundary_set(yy_text()); }
charset=\"?{CHARSET}\"? { got_charset(yytext); skip_to('='); return TOKEN; }
(file)?name=\"?         /* ignore */

\n?[[:blank:]]id{WHITESPACE}+{ID}       { return QUEUE_ID; }

\n[[:blank:]]           { lineno += 1; }
\n\n                    { /* blank line: header is over, choose the body state */
                          enum mimetype type = get_content_type();
                          have_body = true;
                          msg_header = false;
                          clr_tag();
                          switch (type) {
                          case MIME_TEXT_HTML:  BEGIN HTML; break;
                          case MIME_MESSAGE:    yy_set_state_initial(); break;
                          default:              BEGIN TEXT;
                          }
                          return EOH;
                        }

\n                      { set_tag("Header"); lineno += 1; }
{VERP}                  { skip_to('='); return VERP; }

^--{MIME_BOUNDARY}(--)?$        { if (got_mime_boundary(yy_text())) {
                                      yy_set_state_initial();
                                      return BOUNDARY;
                                  } else {
                                      /* not one of ours: keep only the "--" and rescan the rest */
                                      yyless(2);
                                  }
                                }

        /* This has to match just as much or more than the below rules, so as
           to be the controlling rule. */
{TOKEN}({HTMLTOKEN}*{BREAKHTML}+{HTMLTOKEN}*.?|{HTMLTOKEN}+{WHITESPACE})        {
                          char *chr = memchr(yytext, '<', yyleng);      /* find start of html tag */
                          size_t len = chr - yytext;
                          yyless(len);
                          return TOKEN;
                        }

{TOKEN}{HTMLTOKEN}+/{NOTWHITESPACE}     { html_reorder(); }

        /* NOTE(review): the next pattern is an empty string, and no visible
           rule ever does BEGIN HTOKEN, HDISCARD, SCOMMENT or LCOMMENT even
           though yy_get_state() reports those states.  The rules that open
           and close HTML tags and comments appear to be missing here;
           restore them from version control. */
""                      { BEGIN HTML; } /* end of strict comment; return to normal html processing */

{TOKEN}                 { return TOKEN;}

{TOKEN}?{HTML_ENCODING} { html_char(); }        /* process numeric character references, e.g. code 97 is 'a' */
"/"[^/[:blank:]\n%]*{URL_ENCODING}+     { url_char(); }         /* process escaped chars, eg '%61' is 'a' */

.                       /* ignore character */
\n                      { lineno += 1; clr_tag(); }
<<EOF>>                 { return NONE; }
%%

/*
 * Reset the scanner for a new input stream.
 *
 * fp: stream handed to yyrestart().
 * Clears the line counter and have_body, then puts the scanner back into
 * its header (INITIAL) state.
 */
void lexer_v3_init(FILE *fp)
{
    lineno = 0;
    have_body = false;
    yy_set_state_initial();
    yyrestart(fp);
}

/*
 * Truncate the current match just before the first occurrence of chr,
 * pushing the remainder back for rescanning.  If chr does not occur in
 * yytext the match is left untouched.
 */
static void skip_to(char chr)
{
    const char *hit = strchr(yytext, chr);

    if (hit != NULL) {
        size_t len = hit - yytext;
        yyless(len);
    }
}

/*
 * The match is "text<tag>..."; push it back as "<tag>...text" so that the
 * tags are scanned first and the text can join whatever follows them.
 */
static void html_reorder(void)
{
    char *chr = memchr(yytext, '<', yyleng);    /* find start of html tag */
    size_t len = chr - yytext;                  /* length of the leading text */
    char *yycopy = xmalloc(yyleng + 1);         /* +1 for NUL byte below */

    memcpy(yycopy, yytext+len, yyleng-len);     /* copy tag to start of buffer */
    memcpy(yycopy+yyleng-len, yytext, len);     /* copy leading text to end of buffer */
    yycopy[yyleng] = '\0';                      /* for debugging */

    yy_unput((byte *)yycopy, yyleng);
    xfree(yycopy);
}

/*
 * Convert up to len leading hexadecimal digits of in to an integer.
 * Stops at the first non-hex character; returns 0 if there is none.
 */
static int xtoi(const char *in, size_t len)
{
    int val = 0;

    /* test the remaining budget first so no byte beyond len is examined */
    for (; len > 0 && isxdigit((byte) *in); len--) {
        char c = *in++;
        val <<= 4;
        val += isdigit((byte)c) ? (c - '0') : (tolower((byte)c) - 'a' + 10);
    }
    return val;
}

/*
 * Handle a match of the form [text]"&#"...";".
 *
 * If the reference decodes to a printable character in 1..255, that
 * character is pushed back followed (in scan order, preceded) by the
 * leading text.  Otherwise the whole match is pushed back with its final
 * ';' turned into a blank, or simply dropped when there was no leading
 * text.
 */
static void html_char(void)
{
    char *txt = strstr(yytext, "&#");   /* find decodable char */
    size_t len = txt - yytext;          /* length of text before the reference */
    int val;
    char *yycopy = NULL;

    if (len != 0) {
        /* keep a private copy: pushing characters back overwrites yytext */
        yycopy = xmalloc(yyleng + 1);   /* +1 for NUL byte below */
        memcpy(yycopy, yytext, yyleng);
        yycopy[yyleng] = '\0';          /* for debugging */
    }

    txt += 2;                           /* step over "&#" */
    if (isdigit((byte) *txt)) {
        /* strtol() saturates on overflow, whereas atoi() is undefined there;
           anything outside 1..255 is treated as undecodable */
        long num = strtol(txt, NULL, 10);
        val = (num > 0 && num < 256) ? (int) num : 0;
    } else {
        /* skip the radix letter; xtoi() limits conversion to 4 characters */
        val = xtoi(txt+1, 4);
    }

    if ((val > 0) && (val < 256) && isprint(val)) {     /* use it if printable */
        yyunput(val, yytext);
        yyleng = len;                   /* adjust len to pre-char count */
    }
    else {
        if (yycopy)
            yycopy[yyleng-1] = ' ';     /* prevents parsing loop */
    }

    if (yycopy != NULL) {
        yy_unput((byte *)yycopy, yyleng);
        xfree(yycopy);
    }
}

/*
 * Decode every %XX escape in the current match in place, then push the
 * decoded text back for rescanning.
 */
static void url_char(void)
{
    char *src, *dst;

    src = dst = yytext;

    while (src < yytext + yyleng) {
        char c = *src++;
        if (c == '%') {
            c = xtoi(src, 2);
            src += 2;
        }
        *dst++ = c;
    }
    /* push back last character first so the text is rescanned in order */
    while (dst > yytext) {
        yyunput(*--dst, yytext);
    }
}

/* Push len bytes of txt back onto the input, last byte first. */
static void yy_unput(const byte *txt, uint len)
{
    while (len-- > 0)
        yyunput(txt[len], yytext);
}

/*
 * Report the current start condition as a single letter:
 * 'i' INITIAL, 't' TEXT, 'h' HTML or HTOKEN, 's' SCOMMENT,
 * 'l' LCOMMENT, 'o' anything else.
 */
char yy_get_state(void)
{
    switch (YYSTATE) {
    case INITIAL:  return 'i';
    case TEXT:     return 't';
    case HTML:
    case HTOKEN:   return 'h';
    case SCOMMENT: return 's';
    case LCOMMENT: return 'l';
    default:       return 'o';
    }
}

/*
 * Return the scanner to header processing: enter INITIAL, set msg_header
 * and tag subsequent tokens as "Header".
 */
void yy_set_state_initial(void)
{
    BEGIN INITIAL;
    msg_header = true;
    set_tag("Header");

    if (DEBUG_LEXER(1))
        fprintf(dbgout, "BEGIN INITIAL\n");

#ifdef  FLEX_DEBUG
    yy_flex_debug = BOGOTEST('L');
#endif
}

/*
 * The following sets edit modes for GNU EMACS
 * Local Variables:
 * mode:c
 * indent-tabs-mode:t
 * End:
 */