Line # | Frequency | Source Line | 1 | | <?php | 2 | | /** | 3 | | * XHTML sanitizer for MediaWiki | 4 | | * | 5 | | * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al | 6 | | * http://www.mediawiki.org/ | 7 | | * | 8 | | * This program is free software; you can redistribute it and/or modify | 9 | | * it under the terms of the GNU General Public License as published by | 10 | | * the Free Software Foundation; either version 2 of the License, or | 11 | | * (at your option) any later version. | 12 | | * | 13 | | * This program is distributed in the hope that it will be useful, | 14 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 15 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 16 | | * GNU General Public License for more details. | 17 | | * | 18 | | * You should have received a copy of the GNU General Public License along | 19 | | * with this program; if not, write to the Free Software Foundation, Inc., | 20 | | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | 21 | | * http://www.gnu.org/copyleft/gpl.html | 22 | | * | 23 | | * @package MediaWiki | 24 | | * @subpackage Parser | 25 | | */ | 26 | |
/**
 * Regular expression to match various types of character references in
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
 *
 * Capture groups (the callbacks depend on this numbering):
 *   1 - name of a named entity, e.g. "amp" in "&amp;"
 *   2 - digits of a decimal reference
 *   3 - hex digits following a lowercase "&#x"
 *   4 - hex digits following an uppercase "&#X"
 *   5 - an ampersand that does not start any of the above
 *
 * The hexadecimal groups accept hex digits only. hexdec() silently skips
 * every other character, so a looser class would let "&#x4G1;" be read
 * as U+0041 instead of being left alone as plain text.
 */
define( 'MW_CHAR_REFS_REGEX',
	'/&([A-Za-z0-9]+);
	 |&\#([0-9]+);
	 |&\#x([0-9A-Fa-f]+);
	 |&\#X([0-9A-Fa-f]+);
	 |(&)/x' );
| 38 | | /** | 39 | | * Regular expression to match HTML/XML attribute pairs within a tag. | 40 | | * Allows some... latitude. | 41 | | * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes. * * Capture groups: 1 = attribute name, 2 = the whole "= value" part (unmatched for a bare attribute), 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value, 6 = unquoted #hex colour. Neither quoted form accepts a "<" inside the value. An attribute must be preceded by whitespace or the start of the string and followed by whitespace or the end of the string. | 42 | | */ | 43 | 1 | $attrib = '[A-Za-z0-9]'; | 44 | 1 | $space = '[\x09\x0a\x0d\x20]'; | 45 | 1 | define( 'MW_ATTRIBS_REGEX', | 46 | | "/(?:^|$space)($attrib+) | 47 | | ($space*=$space* | 48 | | (?: | 49 | | # The attribute value: quoted or alone | 50 | | \"([^<\"]*)\" | 51 | | | '([^<']*)' | 52 | | | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) | 53 | | | (\#[0-9a-fA-F]+) # Technically wrong, but lots of | 54 | | # colors are specified like this. | 55 | | # We'll be normalizing it. | 56 | | ) | 57 | 1 | )?(?=$space|\$)/sx" ); | 58 | |
| 59 | | /** | 60 | | * List of all named character entities defined in HTML 4.01 | 61 | | * http://www.w3.org/TR/html4/sgml/entities.html | 62 | | * @private | 63 | | */ | 64 | 1 | global $wgHtmlEntities; | 65 | | $wgHtmlEntities = array( | 66 | 1 | 'Aacute' => 193, | 67 | | 'aacute' => 225, | 68 | | 'Acirc' => 194, | 69 | | 'acirc' => 226, | 70 | | 'acute' => 180, | 71 | | 'AElig' => 198, | 72 | | 'aelig' => 230, | 73 | | 'Agrave' => 192, | 74 | | 'agrave' => 224, | 75 | | 'alefsym' => 8501, | 76 | | 'Alpha' => 913, | 77 | | 'alpha' => 945, | 78 | | 'amp' => 38, | 79 | | 'and' => 8743, | 80 | | 'ang' => 8736, | 81 | | 'Aring' => 197, | 82 | | 'aring' => 229, | 83 | | 'asymp' => 8776, | 84 | | 'Atilde' => 195, | 85 | | 'atilde' => 227, | 86 | | 'Auml' => 196, | 87 | | 'auml' => 228, | 88 | | 'bdquo' => 8222, | 89 | | 'Beta' => 914, | 90 | | 'beta' => 946, | 91 | | 'brvbar' => 166, | 92 | | 'bull' => 8226, | 93 | | 'cap' => 8745, | 94 | | 'Ccedil' => 199, | 95 | | 'ccedil' => 231, | 96 | | 'cedil' => 184, | 97 | | 'cent' => 162, | 98 | | 'Chi' => 935, | 99 | | 'chi' => 967, | 100 | | 'circ' => 710, | 101 | | 'clubs' => 9827, | 102 | | 'cong' => 8773, | 103 | | 'copy' => 169, | 104 | | 'crarr' => 8629, | 105 | | 'cup' => 8746, | 106 | | 'curren' => 164, | 107 | | 'dagger' => 8224, | 108 | | 'Dagger' => 8225, | 109 | | 'darr' => 8595, | 110 | | 'dArr' => 8659, | 111 | | 'deg' => 176, | 112 | | 'Delta' => 916, | 113 | | 'delta' => 948, | 114 | | 'diams' => 9830, | 115 | | 'divide' => 247, | 116 | | 'Eacute' => 201, | 117 | | 'eacute' => 233, | 118 | | 'Ecirc' => 202, | 119 | | 'ecirc' => 234, | 120 | | 'Egrave' => 200, | 121 | | 'egrave' => 232, | 122 | | 'empty' => 8709, | 123 | | 'emsp' => 8195, | 124 | | 'ensp' => 8194, | 125 | | 'Epsilon' => 917, | 126 | | 'epsilon' => 949, | 127 | | 'equiv' => 8801, | 128 | | 'Eta' => 919, | 129 | | 'eta' => 951, | 130 | | 'ETH' => 208, | 131 | | 'eth' => 240, | 132 | | 'Euml' => 203, | 133 | | 'euml' => 235, | 134 | | 'euro' => 
8364, | 135 | | 'exist' => 8707, | 136 | | 'fnof' => 402, | 137 | | 'forall' => 8704, | 138 | | 'frac12' => 189, | 139 | | 'frac14' => 188, | 140 | | 'frac34' => 190, | 141 | | 'frasl' => 8260, | 142 | | 'Gamma' => 915, | 143 | | 'gamma' => 947, | 144 | | 'ge' => 8805, | 145 | | 'gt' => 62, | 146 | | 'harr' => 8596, | 147 | | 'hArr' => 8660, | 148 | | 'hearts' => 9829, | 149 | | 'hellip' => 8230, | 150 | | 'Iacute' => 205, | 151 | | 'iacute' => 237, | 152 | | 'Icirc' => 206, | 153 | | 'icirc' => 238, | 154 | | 'iexcl' => 161, | 155 | | 'Igrave' => 204, | 156 | | 'igrave' => 236, | 157 | | 'image' => 8465, | 158 | | 'infin' => 8734, | 159 | | 'int' => 8747, | 160 | | 'Iota' => 921, | 161 | | 'iota' => 953, | 162 | | 'iquest' => 191, | 163 | | 'isin' => 8712, | 164 | | 'Iuml' => 207, | 165 | | 'iuml' => 239, | 166 | | 'Kappa' => 922, | 167 | | 'kappa' => 954, | 168 | | 'Lambda' => 923, | 169 | | 'lambda' => 955, | 170 | | 'lang' => 9001, | 171 | | 'laquo' => 171, | 172 | | 'larr' => 8592, | 173 | | 'lArr' => 8656, | 174 | | 'lceil' => 8968, | 175 | | 'ldquo' => 8220, | 176 | | 'le' => 8804, | 177 | | 'lfloor' => 8970, | 178 | | 'lowast' => 8727, | 179 | | 'loz' => 9674, | 180 | | 'lrm' => 8206, | 181 | | 'lsaquo' => 8249, | 182 | | 'lsquo' => 8216, | 183 | | 'lt' => 60, | 184 | | 'macr' => 175, | 185 | | 'mdash' => 8212, | 186 | | 'micro' => 181, | 187 | | 'middot' => 183, | 188 | | 'minus' => 8722, | 189 | | 'Mu' => 924, | 190 | | 'mu' => 956, | 191 | | 'nabla' => 8711, | 192 | | 'nbsp' => 160, | 193 | | 'ndash' => 8211, | 194 | | 'ne' => 8800, | 195 | | 'ni' => 8715, | 196 | | 'not' => 172, | 197 | | 'notin' => 8713, | 198 | | 'nsub' => 8836, | 199 | | 'Ntilde' => 209, | 200 | | 'ntilde' => 241, | 201 | | 'Nu' => 925, | 202 | | 'nu' => 957, | 203 | | 'Oacute' => 211, | 204 | | 'oacute' => 243, | 205 | | 'Ocirc' => 212, | 206 | | 'ocirc' => 244, | 207 | | 'OElig' => 338, | 208 | | 'oelig' => 339, | 209 | | 'Ograve' => 210, | 210 | | 'ograve' => 242, | 211 | | 
'oline' => 8254, | 212 | | 'Omega' => 937, | 213 | | 'omega' => 969, | 214 | | 'Omicron' => 927, | 215 | | 'omicron' => 959, | 216 | | 'oplus' => 8853, | 217 | | 'or' => 8744, | 218 | | 'ordf' => 170, | 219 | | 'ordm' => 186, | 220 | | 'Oslash' => 216, | 221 | | 'oslash' => 248, | 222 | | 'Otilde' => 213, | 223 | | 'otilde' => 245, | 224 | | 'otimes' => 8855, | 225 | | 'Ouml' => 214, | 226 | | 'ouml' => 246, | 227 | | 'para' => 182, | 228 | | 'part' => 8706, | 229 | | 'permil' => 8240, | 230 | | 'perp' => 8869, | 231 | | 'Phi' => 934, | 232 | | 'phi' => 966, | 233 | | 'Pi' => 928, | 234 | | 'pi' => 960, | 235 | | 'piv' => 982, | 236 | | 'plusmn' => 177, | 237 | | 'pound' => 163, | 238 | | 'prime' => 8242, | 239 | | 'Prime' => 8243, | 240 | | 'prod' => 8719, | 241 | | 'prop' => 8733, | 242 | | 'Psi' => 936, | 243 | | 'psi' => 968, | 244 | | 'quot' => 34, | 245 | | 'radic' => 8730, | 246 | | 'rang' => 9002, | 247 | | 'raquo' => 187, | 248 | | 'rarr' => 8594, | 249 | | 'rArr' => 8658, | 250 | | 'rceil' => 8969, | 251 | | 'rdquo' => 8221, | 252 | | 'real' => 8476, | 253 | | 'reg' => 174, | 254 | | 'rfloor' => 8971, | 255 | | 'Rho' => 929, | 256 | | 'rho' => 961, | 257 | | 'rlm' => 8207, | 258 | | 'rsaquo' => 8250, | 259 | | 'rsquo' => 8217, | 260 | | 'sbquo' => 8218, | 261 | | 'Scaron' => 352, | 262 | | 'scaron' => 353, | 263 | | 'sdot' => 8901, | 264 | | 'sect' => 167, | 265 | | 'shy' => 173, | 266 | | 'Sigma' => 931, | 267 | | 'sigma' => 963, | 268 | | 'sigmaf' => 962, | 269 | | 'sim' => 8764, | 270 | | 'spades' => 9824, | 271 | | 'sub' => 8834, | 272 | | 'sube' => 8838, | 273 | | 'sum' => 8721, | 274 | | 'sup' => 8835, | 275 | | 'sup1' => 185, | 276 | | 'sup2' => 178, | 277 | | 'sup3' => 179, | 278 | | 'supe' => 8839, | 279 | | 'szlig' => 223, | 280 | | 'Tau' => 932, | 281 | | 'tau' => 964, | 282 | | 'there4' => 8756, | 283 | | 'Theta' => 920, | 284 | | 'theta' => 952, | 285 | | 'thetasym' => 977, | 286 | | 'thinsp' => 8201, | 287 | | 'THORN' => 222, | 288 | | 
'thorn' => 254, | 289 | | 'tilde' => 732, | 290 | | 'times' => 215, | 291 | | 'trade' => 8482, | 292 | | 'Uacute' => 218, | 293 | | 'uacute' => 250, | 294 | | 'uarr' => 8593, | 295 | | 'uArr' => 8657, | 296 | | 'Ucirc' => 219, | 297 | | 'ucirc' => 251, | 298 | | 'Ugrave' => 217, | 299 | | 'ugrave' => 249, | 300 | | 'uml' => 168, | 301 | | 'upsih' => 978, | 302 | | 'Upsilon' => 933, | 303 | | 'upsilon' => 965, | 304 | | 'Uuml' => 220, | 305 | | 'uuml' => 252, | 306 | | 'weierp' => 8472, | 307 | | 'Xi' => 926, | 308 | | 'xi' => 958, | 309 | | 'Yacute' => 221, | 310 | | 'yacute' => 253, | 311 | | 'yen' => 165, | 312 | | 'Yuml' => 376, | 313 | | 'yuml' => 255, | 314 | | 'Zeta' => 918, | 315 | | 'zeta' => 950, | 316 | | 'zwj' => 8205, | 317 | | 'zwnj' => 8204 ); | 318 | |
/** @package MediaWiki */
class Sanitizer {
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on "<". Each piece that parses as a whitelisted tag
 * is re-emitted with its attributes filtered through fixTagAttributes();
 * every other piece is emitted as plain text with "<" and ">" escaped,
 * so markup that is not on the whitelist can never reach the output as
 * a live tag.
 *
 * When $wgUseTidy is off, a stack of open elements is kept so that
 * mismatched close tags are rejected and elements still open at the end
 * are closed. When it is on, no balancing is attempted here.
 *
 * @private
 * @param string $text
 * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values;
 *        it is called with the raw attribute text (by reference) and $args
 * @param array $args for the processing callback
 * @return string
 */
function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
	global $wgUseTidy, $wgUserHtml;
	$fname = 'Parser::removeHTMLtags';
	wfProfileIn( $fname );

	if( $wgUserHtml ) {
		$htmlpairs = array( # Tags that must be closed
			'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
		);
		$htmlsingle = array(
			'br', 'hr', 'li', 'dt', 'dd'
		);
		$htmlsingleonly = array( # Elements that cannot have close tags
			'br', 'hr'
		);
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		$tabletags = array( # Can only appear inside table
			'td', 'th', 'tr',
		);
		$htmllist = array( # Tags used by list
			'ul','ol',
		);
		$listtags = array( # Tags that can appear in a list
			'li',
		);
	} else {
		# User HTML is disabled: no element is recognised, so every tag
		# ends up escaped. All lists are defined here as well so both
		# paths leave the same variables behind.
		$htmlpairs = array();
		$htmlsingle = array();
		$htmlsingleonly = array();
		$htmlnest = array();
		$tabletags = array();
		$htmllist = array();
		$listtags = array();
	}

	$htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
	$htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );

	# Pieces: optional "/", tag name, attribute text, "/>" or ">", trailing text
	$tagRegex = '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/';

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );
	$bits = explode( '<', $text );
	$text = array_shift( $bits );
	if(!$wgUseTidy) {
		$tagstack = array(); $tablestack = array();
		foreach ( $bits as $x ) {
			if ( !preg_match( $tagRegex, $x, $regs ) ) {
				# Does not even look like a tag: emit it as escaped text
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
				continue;
			}
			list( , $slash, $t, $params, $brace, $rest ) = $regs;

			$badtag = 0 ;
			if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
				# Check our stack
				if ( $slash ) {
					# Closing a tag...
					if( in_array( $t, $htmlsingleonly ) ) {
						$badtag = 1;
					} elseif ( ( $ot = array_pop( $tagstack ) ) != $t ) {
						if ( in_array($ot, $htmlsingleallowed) ) {
							# Pop all elements with an optional close tag
							# and see if we find a match below them
							$optstack = array();
							array_push ($optstack, $ot);
							while ( ( ( $ot = array_pop( $tagstack ) ) != $t ) &&
									in_array($ot, $htmlsingleallowed) ) {
								array_push ($optstack, $ot);
							}
							if ( $t != $ot ) {
								# No match. Push the optional elements back again
								# NOTE(review): the element that ended the scan above was
								# popped but never added to $optstack, so it is not
								# restored here - confirm whether that is intended.
								$badtag = 1;
								while ( $ot = array_pop( $optstack ) ) {
									array_push( $tagstack, $ot );
								}
							}
						} else {
							array_push( $tagstack, $ot );
							# <li> can be nested in <ul> or <ol>, skip those cases:
							if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) {
								$badtag = 1;
							}
						}
					} else {
						if ( $t == 'table' ) {
							# Leaving a table: resume the stack saved when it was opened
							$tagstack = array_pop( $tablestack );
						}
					}
					$newparams = '';
				} else {
					# Keep track for later
					if ( in_array( $t, $tabletags ) &&
					! in_array( 'table', $tagstack ) ) {
						$badtag = 1;
					} else if ( in_array( $t, $tagstack ) &&
					! in_array ( $t , $htmlnest ) ) {
						$badtag = 1 ;
					# Is it a self closed htmlpair ? (bug 5487)
					} else if( $brace == '/>' &&
					in_array($t, $htmlpairs) ) {
						$badtag = 1;
					} elseif( in_array( $t, $htmlsingleonly ) ) {
						# Hack to force empty tag for uncloseable elements
						$brace = '/>';
					} else if( in_array( $t, $htmlsingle ) ) {
						# Hack to not close $htmlsingle tags
						$brace = NULL;
					} else {
						if ( $t == 'table' ) {
							# Each table gets a fresh stack; the outer one is saved
							array_push( $tablestack, $tagstack );
							$tagstack = array();
						}
						array_push( $tagstack, $t );
					}

					# Replace any variables or template parameters with
					# plaintext results.
					if( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( ! $badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace == '/>' ) ? ' /' : '';
					$text .= "<$slash$t$newparams$close>$rest";
					continue;
				}
			}
			# Unknown or rejected tag: neutralise it instead of passing it on
			$text .= '&lt;' . str_replace( '>', '&gt;', $x);
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			if ( !preg_match( $tagRegex, $x, $regs ) ) {
				# Does not even look like a tag: emit it as escaped text
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
				continue;
			}
			list( , $slash, $t, $params, $brace, $rest ) = $regs;
			if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
				if( is_callable( $processCallback ) ) {
					call_user_func_array( $processCallback, array( &$params, $args ) );
				}
				$newparams = Sanitizer::fixTagAttributes( $params, $t );
				$rest = str_replace( '>', '&gt;', $rest );
				$text .= "<$slash$t$newparams$brace$rest";
			} else {
				$text .= '&lt;' . str_replace( '>', '&gt;', $x);
			}
		}
	}
	wfProfileOut( $fname );
	return $text;
}

/**
 * Strip every '<!--' ... '-->' span from the text.
 *
 * When a comment sits alone on its line (a newline before and after it,
 * ignoring spaces), the surrounding spaces and one of the two newlines
 * are removed as well so that no blank line is left behind. Processing
 * stops at the first comment that has no closing '-->'; that comment
 * and everything after it is returned unchanged.
 *
 * @private
 * @param string $text
 * @return string
 */
function removeHTMLcomments( $text ) {
	$fname='Parser::removeHTMLcomments';
	wfProfileIn( $fname );
	for ( ;; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		# Step past the '-->' itself
		$close += 3;

		# Widen the span over the spaces on either side of the comment,
		# starting from the character just before it
		$from = max( $open - 1, 0 );
		$length = $close - $from;
		while ( substr( $text, $from, 1 ) === ' ' && $from > 0 ) {
			$from--;
			$length++;
		}
		while ( substr( $text, $from + $length, 1 ) === ' ' ) {
			$length++;
		}

		$newlineBefore = substr( $text, $from, 1 ) === "\n";
		$newlineAfter = substr( $text, $from + $length, 1 ) === "\n";
		if ( $newlineBefore && $newlineAfter ) {
			# Comment, padding and both newlines collapse into one newline
			$text = substr_replace( $text, "\n", $from, $length + 1 );
		} else {
			# Only the comment itself goes away
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}
	wfProfileOut( $fname );
	return $text;
}

/**
 * Take an array of attribute names and values and normalize or discard
 * illegal values for the given element type.
 *
 * - Discards attributes not on a whitelist for the given element
 * - Unsafe style attributes are discarded
 * - id values are passed through Sanitizer::escapeId()
 *
 * @param array $attribs attribute name => value
 * @param string $element
 * @return array the surviving attributes, name => value
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function validateTagAttributes( $attribs, $element ) {
	$whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
	$out = array();
	foreach( $attribs as $attribute => $value ) {
		if( !isset( $whitelist[$attribute] ) ) {
			continue;
		}
		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if( $attribute === 'style' ) {
			$value = Sanitizer::checkCss( $value );
			if( $value === false ) {
				# haxx0r
				continue;
			}
		}

		if ( $attribute === 'id' ) {
			$value = Sanitizer::escapeId( $value );
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}
	return $out;
}

/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns a sanitized string, or false if it was just too evil.
 *
 * Currently URL references, 'expression', 'tps' are forbidden.
 *
 * The returned string has character references decoded and comments
 * replaced by a space. The forbidden-pattern test runs on a further
 * decoded copy in which CSS hex escapes are resolved and remaining
 * backslashes dropped.
 *
 * @param string $value
 * @return mixed the cleaned CSS string, or false when it must be rejected
 */
static function checkCss( $value ) {
	$stripped = Sanitizer::decodeCharReferences( $value );

	// Remove any comments; IE gets token splitting wrong.
	// The "s" modifier lets "." cross line breaks, so a comment that
	// contains a newline cannot be used to split a forbidden keyword.
	$stripped = preg_replace( '!/\\*.*?\\*/!sS', ' ', $stripped );
	$value = $stripped;

	// ... and continue checks
	// Resolve CSS hex escapes with a callback; the "e" modifier used for
	// this previously is no longer available in current PHP versions.
	$stripped = preg_replace_callback(
		'!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
		array( 'Sanitizer', 'cssDecodeCallback' ),
		$stripped );
	$stripped = str_replace( '\\', '', $stripped );
	if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
			$stripped ) ) {
		# haxx0r
		return false;
	}

	return $value;
}

/**
 * Regex replace callback for checkCss(): turn the hex digits of one CSS
 * escape sequence into the UTF-8 character they denote.
 *
 * @param array $matches $matches[1] holds the hex digits
 * @return string
 * @private
 */
static function cssDecodeCallback( $matches ) {
	return codepointToUtf8( hexdec( $matches[1] ) );
}

/**
 * Turn the raw attribute text of one tag into well-formed XML attribute
 * source, keeping only what is allowed for the element. The result is
 * safe for further wikitext processing.
 *
 * - Attribute names are lowercased
 * - Attributes not on the element's whitelist are dropped
 * - Broken or invalid entities become plaintext
 * - Every value is double-quoted
 * - A valueless attribute gets its own name as value
 * - Of repeated attributes only one survives
 * - Unsafe style attributes are dropped
 * - The result starts with a space when it is not empty
 *
 * @param string $text
 * @param string $element
 * @return string
 */
function fixTagAttributes( $text, $element ) {
	if( trim( $text ) == '' ) {
		return '';
	}

	$decoded = Sanitizer::decodeTagAttributes( $text );
	$allowed = Sanitizer::validateTagAttributes( $decoded, $element );

	# Each pair brings its own leading space, so an empty set yields ''
	$html = '';
	foreach( $allowed as $name => $val ) {
		$encName = htmlspecialchars( $name );
		$encVal = Sanitizer::safeEncodeAttribute( $val );
		$html .= " $encName=\"$encVal\"";
	}
	return $html;
}

/**
 * Encode an attribute value for HTML output.
 *
 * Besides the usual htmlspecialchars() escaping, newline, carriage
 * return and tab are written as numeric character references.
 *
 * @param string $text
 * @return string HTML-encoded text fragment
 */
function encodeAttribute( $text ) {
	$encValue = htmlspecialchars( $text );

	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	$encValue = strtr( $encValue, array(
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	) );

	return $encValue;
}

/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 *
 * On top of encodeAttribute(), characters and keywords that later
 * parsing passes treat as markup are rewritten with a character
 * reference so they no longer trigger, and the ":" in every URL
 * protocol matched by wfUrlProtocols() is armored as well.
 *
 * @param string $text
 * @return string HTML-encoded text fragment
 */
function safeEncodeAttribute( $text ) {
	$encValue = Sanitizer::encodeAttribute( $text );

	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$encValue = strtr( $encValue, array(
		'<'    => '&lt;',   // This should never happen,
		'>'    => '&gt;',   // we've received invalid input
		'"'    => '&quot;', // which should have been escaped.
		'{'    => '&#123;',
		'['    => '&#91;',
		"''"   => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC'  => '&#82;FC',
		'PMID' => '&#80;MID',
		'|'    => '&#124;',
		'__'   => '&#95;_',
	) );

	# Stupid hack
	$encValue = preg_replace_callback(
		'/(' . wfUrlProtocols() . ')/',
		array( 'Sanitizer', 'armorLinksCallback' ),
		$encValue );
	return $encValue;
}

/**
 * Escape a value so that it can be used in an id attribute. This only
 * escapes; it does not validate the value (see first link).
 *
 * Spaces become underscores, character references are decoded, the
 * result is URL-encoded, and finally "%3A" is turned back into ":" and
 * every remaining "%" into ".".
 *
 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 *                                                          in the id and
 *                                                          name attributes
 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 *
 * @bug 4461
 *
 * @static
 *
 * @param string $id
 * @return string
 */
function escapeId( $id ) {
	$underscored = strtr( $id, ' ', '_' );
	$decoded = Sanitizer::decodeCharReferences( $underscored );
	$encoded = urlencode( $decoded );

	# Order matters: the encoded colon has to be restored before the
	# remaining percent signs are rewritten
	return str_replace( array( '%3A', '%' ), array( ':', '.' ), $encoded );
}

/**
 * Regex replace callback for armoring links against further processing.
 *
 * Rewrites every ":" in the matched URL protocol as the character
 * reference "&#58;" so that later parsing no longer recognises the
 * text as the start of a link.
 *
 * @param array $matches $matches[1] holds the matched protocol
 * @return string
 * @private
 */
function armorLinksCallback( $matches ) {
	return str_replace( ':', '&#58;', $matches[1] );
}

/**
 * Parse the attribute part of a tag into an associative array of
 * attribute name => value.
 *
 * Names are forced to lowercase. In each value, runs of whitespace are
 * collapsed to one space, the ends are trimmed, and character
 * references are decoded to UTF-8 text. When a name occurs more than
 * once, the last occurrence wins.
 *
 * @param string $text
 * @return array
 */
function decodeTagAttributes( $text ) {
	$found = array();
	if( trim( $text ) == ''
		|| !preg_match_all( MW_ATTRIBS_REGEX, $text, $found, PREG_SET_ORDER ) ) {
		# Nothing there, or nothing that looks like an attribute
		return array();
	}

	$result = array();
	foreach( $found as $match ) {
		$name = strtolower( $match[1] );
		$raw = Sanitizer::getTagAttributeCallback( $match );

		// Normalize whitespace
		$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

		// Decode character references
		$result[$name] = Sanitizer::decodeCharReferences( $collapsed );
	}
	return $result;
}

/**
 * Pick the attribute value out of one match set produced by
 * MW_ATTRIBS_REGEX.
 *
 * The value alternatives are probed from the last capture group to the
 * first: unquoted "#" colour (6), unquoted (5), single-quoted (4),
 * double-quoted (3). If no value part is present at all, the attribute
 * name itself is returned.
 *
 * @param array $set
 * @return string
 * @private
 */
function getTagAttributeCallback( $set ) {
	for( $group = 6; $group >= 3; $group-- ) {
		if( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if( isset( $set[2] ) ) {
		# A value part matched but none of its alternatives is set
		throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
	}

	# In XHTML, attributes must have a value.
	# For 'reduced' form, return explicitly the attribute name here.
	return $set[1];
}

/**
 * Normalize whitespace and character references in an XML source-
 * encoded text for an attribute value.
 *
 * Character references are normalized first, then each CR LF pair and
 * each single space, CR, LF or tab becomes one space, and finally every
 * double quote is written as "&quot;" so the fragment can be placed
 * inside a double-quoted attribute.
 *
 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 * but note that we're not returning the value, but are returning
 * XML source fragments that will be slapped into output.
 *
 * @param string $text
 * @return string
 * @private
 */
function normalizeAttributeValue( $text ) {
	return str_replace( '"', '&quot;',
		preg_replace(
			'/\r\n|[\x20\x0d\x0a\x09]/',
			' ',
			Sanitizer::normalizeCharReferences( $text ) ) );
}

/**
 * Make every entity and character reference in the text legal for XML,
 * and for XHTML in particular. Anything that is not a legal reference
 * is &-escaped, so the result is a valid text fragment.
 *
 * a. any named char refs must be known in XHTML
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use &#x, not &#X
 * d. fix or reject non-valid attributes
 *
 * @param string $text
 * @return string
 * @private
 */
function normalizeCharReferences( $text ) {
	$callback = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
}

/**
 * Regex replace callback for normalizeCharReferences().
 *
 * Dispatches on the capture group that matched; when the reference is
 * rejected, or the match is a bare ampersand, the matched text is
 * returned HTML-escaped.
 *
 * @param array $matches match set from MW_CHAR_REFS_REGEX
 * @return string
 */
function normalizeCharReferencesCallback( $matches ) {
	if( $matches[1] != '' ) {
		# Named entity
		$normalized = Sanitizer::normalizeEntity( $matches[1] );
	} elseif( $matches[2] != '' ) {
		# Decimal reference
		$normalized = Sanitizer::decCharReference( $matches[2] );
	} elseif( $matches[3] != '' ) {
		# Hex reference written with a lowercase x
		$normalized = Sanitizer::hexCharReference( $matches[3] );
	} elseif( $matches[4] != '' ) {
		# Hex reference written with an uppercase X
		$normalized = Sanitizer::hexCharReference( $matches[4] );
	} else {
		$normalized = null;
	}

	return is_null( $normalized )
		? htmlspecialchars( $matches[0] )
		: $normalized;
}

/**
 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 * return the named entity reference as is. Otherwise, returns
 * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 *
 * Known names are looked up in the global $wgHtmlEntities table.
 *
 * @param string $name entity name without the leading "&" and trailing ";"
 * @return string
 */
function normalizeEntity( $name ) {
	global $wgHtmlEntities;
	if( isset( $wgHtmlEntities[$name] ) ) {
		return "&$name;";
	} else {
		# Unknown name: escape the ampersand so it renders as literal text
		return "&amp;$name;";
	}
}

/**
 * Rebuild a decimal character reference from its digits.
 *
 * @param string $codepoint the decimal digits of the reference
 * @return mixed "&#N;" for a codepoint accepted by validateCodepoint(),
 *               otherwise null
 */
function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}

/**
 * Rebuild a hexadecimal character reference from its digits, always
 * with a lowercase "x" and lowercase hex digits.
 *
 * @param string $codepoint the hexadecimal digits of the reference
 * @return mixed "&#xN;" for a codepoint accepted by validateCodepoint(),
 *               otherwise null
 */
function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}

/**
 * Tell whether a Unicode codepoint is a valid character in XML.
 *
 * Accepted are tab, line feed, carriage return and the ranges
 * U+0020..U+D7FF, U+E000..U+FFFD and U+10000..U+10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
	# The three permitted control characters
	if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	# Everything from space up to the surrogate block
	if( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	# After the surrogates, stopping short of U+FFFE and U+FFFF
	if( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	# The supplementary planes
	return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}

/**
 * Replace every character reference in the text, numeric or named,
 * with the character it stands for and return the UTF-8 result.
 *
 * @param string $text
 * @return string
 * @public
 */
function decodeCharReferences( $text ) {
	$callback = array( 'Sanitizer', 'decodeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
}

/**
 * Regex replace callback for decodeCharReferences().
 *
 * Dispatches on the capture group that matched and returns the decoded
 * text for it; a bare ampersand is returned unchanged.
 *
 * @param array $matches match set from MW_CHAR_REFS_REGEX
 * @return string
 */
function decodeCharReferencesCallback( $matches ) {
	if( $matches[1] != '' ) {
		# Named entity
		return Sanitizer::decodeEntity( $matches[1] );
	}
	if( $matches[2] != '' ) {
		# Decimal reference
		return Sanitizer::decodeChar( intval( $matches[2] ) );
	}
	# Groups 3 and 4 both carry hex digits (after "x" and "X")
	foreach( array( 3, 4 ) as $group ) {
		if( $matches[$group] != '' ) {
			return Sanitizer::decodeChar( hexdec( $matches[$group] ) );
		}
	}
	# Last case should be an ampersand by itself
	return $matches[0];
}

/**
 * Convert a codepoint to its UTF-8 string when it passes
 * Sanitizer::validateCodepoint(); otherwise return UTF8_REPLACEMENT
 * (U+FFFD REPLACEMENT CHARACTER).
 *
 * @param int $codepoint
 * @return string
 * @private
 */
function decodeChar( $codepoint ) {
	return Sanitizer::validateCodepoint( $codepoint )
		? codepointToUtf8( $codepoint )
		: UTF8_REPLACEMENT;
}
/**
 * Decode a named entity using the global $wgHtmlEntities table
 * (the HTML 4.0/XHTML 1.0 DTD entities).
 *
 * Known names yield the UTF-8 encoding of the mapped codepoint; unknown
 * names are returned as their pseudo-entity source (eg &foo;).
 *
 * @param string $name Entity name without the leading "&" and trailing ";"
 * @return string
 */
function decodeEntity( $name ) {
	global $wgHtmlEntities;
	if( !isset( $wgHtmlEntities[$name] ) ) {
		# Not a recognised entity: hand back the original source text
		return "&$name;";
	}
	return codepointToUtf8( $wgHtmlEntities[$name] );
}
/**
 * Look up the list of attributes accepted on a given element.
 *
 * The full table is built once by Sanitizer::setupAttributeWhitelist()
 * and kept in a static variable for later calls.
 *
 * @param string $element Element name
 * @return array Attribute names; empty when the element is not listed
 */
function attributeWhitelist( $element ) {
	static $table;
	if( !isset( $table ) ) {
		$table = Sanitizer::setupAttributeWhitelist();
	}
	if( isset( $table[$element] ) ) {
		return $table[$element];
	}
	return array();
}
/**
 * Build the table of permitted attributes for each supported element.
 *
 * Keys are element names; each value is a list of attribute names
 * accepted on that element. Every attribute appears at most once in
 * a given list.
 *
 * @return array Map of element name => array of attribute names
 */
function setupAttributeWhitelist() {
	# Attributes accepted on nearly every element
	$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
	# Block-level elements additionally take 'align'
	$block = array_merge( $common, array( 'align' ) );
	# Alignment attributes shared by the table row/column/cell elements
	$tablealign = array( 'align', 'char', 'charoff', 'valign' );
	# Attributes specific to table cells (td, th)
	$tablecell = array( 'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width',  # deprecated
		'height', # deprecated
		'bgcolor' # deprecated
		);

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	$whitelist = array (
		# 7.5.4
		'div'        => $block,
		'center'     => $common, # deprecated
		'span'       => $block, # ??

		# 7.5.5
		'h1'         => $block,
		'h2'         => $block,
		'h3'         => $block,
		'h4'         => $block,
		'h5'         => $block,
		'h6'         => $block,

		# 7.5.6
		# address

		# 8.2.4
		# bdo

		# 9.2.1
		'em'         => $common,
		'strong'     => $common,
		'cite'       => $common,
		# dfn
		'code'       => $common,
		# samp
		# kbd
		'var'        => $common,
		# abbr
		# acronym

		# 9.2.2
		'blockquote' => array_merge( $common, array( 'cite' ) ),
		# q

		# 9.2.3
		'sub'        => $common,
		'sup'        => $common,

		# 9.3.1
		'p'          => $block,

		# 9.3.2
		'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

		# 9.3.4
		'pre'        => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
		'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),

		# 10.2
		'ul'         => array_merge( $common, array( 'type' ) ),
		'ol'         => array_merge( $common, array( 'type', 'start' ) ),
		'li'         => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl'         => $common,
		'dd'         => $common,
		'dt'         => $common,

		# 11.2.1
		# (each attribute listed once; 'frame', 'rules' and 'border'
		# were previously repeated)
		'table'      => array_merge( $common,
							array( 'summary', 'width', 'border', 'frame',
									'rules', 'cellspacing', 'cellpadding',
									'align', 'bgcolor' ) ),

		# 11.2.2
		'caption'    => array_merge( $common, array( 'align' ) ),

		# 11.2.3
		'thead'      => array_merge( $common, $tablealign ),
		'tfoot'      => array_merge( $common, $tablealign ),
		'tbody'      => array_merge( $common, $tablealign ),

		# 11.2.4
		'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
		'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),

		# 11.2.5
		'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

		# 11.2.6
		'td'         => array_merge( $common, $tablecell, $tablealign ),
		'th'         => array_merge( $common, $tablecell, $tablealign ),

		# 15.2.1
		'tt'         => $common,
		'b'          => $common,
		'i'          => $common,
		'big'        => $common,
		'small'      => $common,
		'strike'     => $common,
		's'          => $common,
		'u'          => $common,

		# 15.2.2
		'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
		# basefont

		# 15.3
		'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

		# XHTML Ruby annotation text module, simple ruby only.
		# http://www.w3c.org/TR/ruby/
		'ruby'       => $common,
		# rbc
		# rtc
		'rb'         => $common,
		'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
		'rp'         => $common,
		);
	return $whitelist;
}
/**
 * Take a fragment of (potentially invalid) HTML and return
 * a version with any tags removed, encoded suitably for literal
 * inclusion in a double-quoted attribute value.
 *
 * Steps: remove anything that looks like a <tag>, pass the result
 * through Sanitizer::normalizeAttributeValue(), then escape any
 * remaining '<', '>' and '"' characters as entities.
 *
 * @param string $text HTML fragment
 * @return string
 */
function stripAllTags( $text ) {
	# Actual <tags>
	$text = preg_replace( '/ < .*? > /x', '', $text );

	# Normalize &entities and whitespace
	$text = Sanitizer::normalizeAttributeValue( $text );

	# Will be placed into "double-quoted" attributes,
	# make sure remaining bits are safe.
	# The replacements must be entities; mapping each character to
	# itself would leave stray brackets and quotes unescaped.
	$text = str_replace(
		array( '<', '>', '"' ),
		array( '&lt;', '&gt;', '&quot;' ),
		$text );

	return $text;
}
/**
 * Build a private DOCTYPE declaring every entity in the global
 * $wgHtmlEntities table as a numeric character reference.
 * PHP 4 seemed to know HTML's standard entities if you gave it an
 * HTML doctype, but PHP 5.1 doesn't.
 *
 * Use for passing XHTML fragments to PHP's XML parsing functions
 *
 * @return string
 * @static
 */
function hackDocType() {
	global $wgHtmlEntities;
	$declarations = array();
	foreach( $wgHtmlEntities as $name => $number ) {
		$declarations[] = "<!ENTITY $name \"&#$number;\">";
	}
	# Declarations are emitted back to back with no separator
	return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
| 1182 | | } | 1183 | |
| 1184 | | ?> |
|