<?php
/**
 * XHTML sanitizer for MediaWiki
 *
 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
 * http://www.mediawiki.org/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @package MediaWiki
 * @subpackage Parser
 */

/**
 * Regular expression to match various types of character references in
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
 *
 * Capture groups, one per alternative (exactly one is non-empty per match):
 *   1 - name of a named entity, without the '&' and ';'
 *   2 - digits of a decimal reference
 *   3 - digits of a hexadecimal reference written with a lowercase 'x'
 *   4 - digits of a hexadecimal reference written with an uppercase 'X'
 *   5 - a bare '&' that does not start any of the forms above
 *
 * The hexadecimal groups accept hex digits only. Anything else (for example
 * "&#xZZ;") falls through to group 5, so text that is not a valid number is
 * never captured as if it were one.
 */
define( 'MW_CHAR_REFS_REGEX',
	'/&([A-Za-z0-9]+);
	 |&\#([0-9]+);
	 |&\#x([0-9A-Fa-f]+);
	 |&\#X([0-9A-Fa-f]+);
	 |(&)/x' );

/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Allows some... latitude.
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 *
 * Capture groups:
 *   1 - attribute name
 *   2 - the optional "= value" part as a whole
 *   3 - value given in double quotes (quotes excluded)
 *   4 - value given in single quotes (quotes excluded)
 *   5 - unquoted value
 *   6 - unquoted '#' followed by hex digits (colour shorthand)
 *
 * The pattern uses the extended (x) flag, so the line breaks inside the
 * string are required: each inline '#' comment runs to the end of its line.
 */
$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
	"/(?:^|$space)($attrib+)
	  ($space*=$space*
		(?:
		 # The attribute value: quoted or alone
		  \"([^<\"]*)\"
		 | '([^<']*)'
		 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
		 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
		                     # colors are specified like this.
		                     # We'll be normalizing it.
		)
	   )?(?=$space|\$)/sx" );

/**
 * List of all named character entities defined in HTML 4.01
 * http://www.w3.org/TR/html4/sgml/entities.html
 *
 * Maps each entity name (case-sensitive, without '&' and ';') to its
 * Unicode code point.
 * @private
 */
# Presumably declared global so the table still lands in the global scope
# when this file is included from inside a function.
global $wgHtmlEntities;
$wgHtmlEntities = array(
	'Aacute'   => 193,
	'aacute'   => 225,
	'Acirc'    => 194,
	'acirc'    => 226,
	'acute'    => 180,
	'AElig'    => 198,
	'aelig'    => 230,
	'Agrave'   => 192,
	'agrave'   => 224,
	'alefsym'  => 8501,
	'Alpha'    => 913,
	'alpha'    => 945,
	'amp'      => 38,
	'and'      => 8743,
	'ang'      => 8736,
	'Aring'    => 197,
	'aring'    => 229,
	'asymp'    => 8776,
	'Atilde'   => 195,
	'atilde'   => 227,
	'Auml'     => 196,
	'auml'     => 228,
	'bdquo'    => 8222,
	'Beta'     => 914,
	'beta'     => 946,
	'brvbar'   => 166,
	'bull'     => 8226,
	'cap'      => 8745,
	'Ccedil'   => 199,
	'ccedil'   => 231,
	'cedil'    => 184,
	'cent'     => 162,
	'Chi'      => 935,
	'chi'      => 967,
	'circ'     => 710,
	'clubs'    => 9827,
	'cong'     => 8773,
	'copy'     => 169,
	'crarr'    => 8629,
	'cup'      => 8746,
	'curren'   => 164,
	'dagger'   => 8224,
	'Dagger'   => 8225,
	'darr'     => 8595,
	'dArr'     => 8659,
	'deg'      => 176,
	'Delta'    => 916,
	'delta'    => 948,
	'diams'    => 9830,
	'divide'   => 247,
	'Eacute'   => 201,
	'eacute'   => 233,
	'Ecirc'    => 202,
	'ecirc'    => 234,
	'Egrave'   => 200,
	'egrave'   => 232,
	'empty'    => 8709,
	'emsp'     => 8195,
	'ensp'     => 8194,
	'Epsilon'  => 917,
	'epsilon'  => 949,
	'equiv'    => 8801,
	'Eta'      => 919,
	'eta'      => 951,
	'ETH'      => 208,
	'eth'      => 240,
	'Euml'     => 203,
	'euml'     => 235,
	'euro'     => 8364,
	'exist'    => 8707,
	'fnof'     => 402,
	'forall'   => 8704,
	'frac12'   => 189,
	'frac14'   => 188,
	'frac34'   => 190,
	'frasl'    => 8260,
	'Gamma'    => 915,
	'gamma'    => 947,
	'ge'       => 8805,
	'gt'       => 62,
	'harr'     => 8596,
	'hArr'     => 8660,
	'hearts'   => 9829,
	'hellip'   => 8230,
	'Iacute'   => 205,
	'iacute'   => 237,
	'Icirc'    => 206,
	'icirc'    => 238,
	'iexcl'    => 161,
	'Igrave'   => 204,
	'igrave'   => 236,
	'image'    => 8465,
	'infin'    => 8734,
	'int'      => 8747,
	'Iota'     => 921,
	'iota'     => 953,
	'iquest'   => 191,
	'isin'     => 8712,
	'Iuml'     => 207,
	'iuml'     => 239,
	'Kappa'    => 922,
	'kappa'    => 954,
	'Lambda'   => 923,
	'lambda'   => 955,
	'lang'     => 9001,
	'laquo'    => 171,
	'larr'     => 8592,
	'lArr'     => 8656,
	'lceil'    => 8968,
	'ldquo'    => 8220,
	'le'       => 8804,
	'lfloor'   => 8970,
	'lowast'   => 8727,
	'loz'      => 9674,
	'lrm'      => 8206,
	'lsaquo'   => 8249,
	'lsquo'    => 8216,
	'lt'       => 60,
	'macr'     => 175,
	'mdash'    => 8212,
	'micro'    => 181,
	'middot'   => 183,
	'minus'    => 8722,
	'Mu'       => 924,
	'mu'       => 956,
	'nabla'    => 8711,
	'nbsp'     => 160,
	'ndash'    => 8211,
	'ne'       => 8800,
	'ni'       => 8715,
	'not'      => 172,
	'notin'    => 8713,
	'nsub'     => 8836,
	'Ntilde'   => 209,
	'ntilde'   => 241,
	'Nu'       => 925,
	'nu'       => 957,
	'Oacute'   => 211,
	'oacute'   => 243,
	'Ocirc'    => 212,
	'ocirc'    => 244,
	'OElig'    => 338,
	'oelig'    => 339,
	'Ograve'   => 210,
	'ograve'   => 242,
	'oline'    => 8254,
	'Omega'    => 937,
	'omega'    => 969,
	'Omicron'  => 927,
	'omicron'  => 959,
	'oplus'    => 8853,
	'or'       => 8744,
	'ordf'     => 170,
	'ordm'     => 186,
	'Oslash'   => 216,
	'oslash'   => 248,
	'Otilde'   => 213,
	'otilde'   => 245,
	'otimes'   => 8855,
	'Ouml'     => 214,
	'ouml'     => 246,
	'para'     => 182,
	'part'     => 8706,
	'permil'   => 8240,
	'perp'     => 8869,
	'Phi'      => 934,
	'phi'      => 966,
	'Pi'       => 928,
	'pi'       => 960,
	'piv'      => 982,
	'plusmn'   => 177,
	'pound'    => 163,
	'prime'    => 8242,
	'Prime'    => 8243,
	'prod'     => 8719,
	'prop'     => 8733,
	'Psi'      => 936,
	'psi'      => 968,
	'quot'     => 34,
	'radic'    => 8730,
	'rang'     => 9002,
	'raquo'    => 187,
	'rarr'     => 8594,
	'rArr'     => 8658,
	'rceil'    => 8969,
	'rdquo'    => 8221,
	'real'     => 8476,
	'reg'      => 174,
	'rfloor'   => 8971,
	'Rho'      => 929,
	'rho'      => 961,
	'rlm'      => 8207,
	'rsaquo'   => 8250,
	'rsquo'    => 8217,
	'sbquo'    => 8218,
	'Scaron'   => 352,
	'scaron'   => 353,
	'sdot'     => 8901,
	'sect'     => 167,
	'shy'      => 173,
	'Sigma'    => 931,
	'sigma'    => 963,
	'sigmaf'   => 962,
	'sim'      => 8764,
	'spades'   => 9824,
	'sub'      => 8834,
	'sube'     => 8838,
	'sum'      => 8721,
	'sup'      => 8835,
	'sup1'     => 185,
	'sup2'     => 178,
	'sup3'     => 179,
	'supe'     => 8839,
	'szlig'    => 223,
	'Tau'      => 932,
	'tau'      => 964,
	'there4'   => 8756,
	'Theta'    => 920,
	'theta'    => 952,
	'thetasym' => 977,
	'thinsp'   => 8201,
	'THORN'    => 222,
	'thorn'    => 254,
	'tilde'    => 732,
	'times'    => 215,
	'trade'    => 8482,
	'Uacute'   => 218,
	'uacute'   => 250,
	'uarr'     => 8593,
	'uArr'     => 8657,
	'Ucirc'    => 219,
	'ucirc'    => 251,
	'Ugrave'   => 217,
	'ugrave'   => 249,
	'uml'      => 168,
	'upsih'    => 978,
	'Upsilon'  => 933,
	'upsilon'  => 965,
	'Uuml'     => 220,
	'uuml'     => 252,
	'weierp'   => 8472,
	'Xi'       => 926,
	'xi'       => 958,
	'Yacute'   => 221,
	'yacute'   => 253,
	'yen'      => 165,
	'Yuml'     => 376,
	'yuml'     => 255,
	'Zeta'     => 918,
	'zeta'     => 950,
	'zwj'      => 8205,
	'zwnj'     => 8204 );

/** @package MediaWiki */
320 | | class Sanitizer { | | 321 | | /** | | 322 | | * Cleans up HTML, removes dangerous tags and attributes, and | | 323 | | * removes HTML comments | | 324 | | * @private | | 325 | | * @param string $text | | 326 | | * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values | | 327 | | * @param array $args for the processing callback | | 328 | | * @return string | | 329 | | */ | | 330 | | function removeHTMLtags( $text, $processCallback = null, $args = array() ) { | | 331 | 1 | global $wgUseTidy, $wgUserHtml; | | 332 | 1 | $fname = 'Parser::removeHTMLtags'; | | 333 | 1 | wfProfileIn( $fname ); | | 334 | | | | 335 | 1 | if( $wgUserHtml ) { | | 336 | | $htmlpairs = array( # Tags that must be closed | | 337 | 1 | 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', | | 338 | | 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', | | 339 | | 'strike', 'strong', 'tt', 'var', 'div', 'center', | | 340 | | 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', | | 341 | | 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u' | | 342 | | ); | | 343 | | $htmlsingle = array( | | 344 | 1 | 'br', 'hr', 'li', 'dt', 'dd' | | 345 | | ); | | 346 | | $htmlsingleonly = array( # Elements that cannot have close tags | | 347 | 1 | 'br', 'hr' | | 348 | | ); | | 349 | | $htmlnest = array( # Tags that can be nested--?? 
| | 350 | 1 | 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', | | 351 | | 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span' | | 352 | | ); | | 353 | | $tabletags = array( # Can only appear inside table | | 354 | 1 | 'td', 'th', 'tr', | | 355 | | ); | | 356 | | $htmllist = array( # Tags used by list | | 357 | 1 | 'ul','ol', | | 358 | | ); | | 359 | | $listtags = array( # Tags that can appear in a list | | 360 | 1 | 'li', | | 361 | | ); | | 362 | | | | 363 | | } else { | | 364 | | $htmlpairs = array(); | | 365 | | $htmlsingle = array(); | | 366 | | $htmlnest = array(); | | 367 | | $tabletags = array(); | | 368 | | } | | 369 | | | | 370 | 1 | $htmlsingleallowed = array_merge( $htmlsingle, $tabletags ); | | 371 | 1 | $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest ); | | 372 | | | | 373 | | # Remove HTML comments | | 374 | 1 | $text = Sanitizer::removeHTMLcomments( $text ); | | 375 | 1 | $bits = explode( '<', $text ); | | 376 | 1 | $text = array_shift( $bits ); | | 377 | 1 | if(!$wgUseTidy) { | | 378 | | $tagstack = array(); $tablestack = array(); | | 379 | 1 | foreach ( $bits as $x ) { | | 380 | 1 | $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) ); | | 381 | 1 | preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', | | 382 | 1 | $x, $regs ); | | 383 | 1 | list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs; | | 384 | 1 | error_reporting( $prev ); | | 385 | | | | 386 | 1 | $badtag = 0 ; | | 387 | 1 | if ( in_array( $t = strtolower( $t ), $htmlelements ) ) { | | 388 | | # Check our stack | | 389 | 1 | if ( $slash ) { | | 390 | | # Closing a tag... 
| | 391 | 1 | if( in_array( $t, $htmlsingleonly ) ) { | | 392 | | $badtag = 1; | | 393 | 1 | } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) { | | 394 | 1 | if ( in_array($ot, $htmlsingleallowed) ) { | | 395 | | # Pop all elements with an optional close tag | | 396 | | # and see if we find a match below them | | 397 | | $optstack = array(); | | 398 | 1 | array_push ($optstack, $ot); | | 399 | 1 | while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) && | | 400 | 1 | in_array($ot, $htmlsingleallowed) ) { | | 401 | | array_push ($optstack, $ot); | | 402 | | } | | 403 | 1 | if ( $t != $ot ) { | | 404 | | # No match. Push the optinal elements back again | | 405 | | $badtag = 1; | | 406 | | while ( $ot = @array_pop( $optstack ) ) { | | 407 | | array_push( $tagstack, $ot ); | | 408 | | } | | 409 | | } | | 410 | | } else { | | 411 | 1 | @array_push( $tagstack, $ot ); | | 412 | | # <li> can be nested in <ul> or <ol>, skip those cases: | | 413 | 1 | if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) { | | 414 | 1 | $badtag = 1; | | 415 | | } | | 416 | | } | | 417 | | } else { | | 418 | 1 | if ( $t == 'table' ) { | | 419 | 1 | $tagstack = array_pop( $tablestack ); | | 420 | | } | |
|