<?php
/**
 * XHTML sanitizer for MediaWiki
 *
 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
 * http://www.mediawiki.org/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @package MediaWiki
 * @subpackage Parser
 */

/**
 * Regular expression to match various types of character references in
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
 *
 * Capture groups, as consumed by the two callbacks in Sanitizer:
 *  1 - name of a named entity reference (&name;)
 *  2 - digits of a decimal reference (&#NNN;)
 *  3 - digits of a hexadecimal reference written with a lowercase x (&#xHHH;)
 *  4 - digits of a hexadecimal reference written with an uppercase X (&#XHHH;)
 *  5 - a bare ampersand that does not start any of the above
 *
 * The hexadecimal groups only accept real hex digits. Accepting any letter
 * would hand strings such as "4g1" to hexdec(), which skips the characters
 * it does not understand and so turns a malformed reference into a valid one.
 */
define( 'MW_CHAR_REFS_REGEX',
	'/&([A-Za-z0-9]+);
	 |&\#([0-9]+);
	 |&\#x([0-9A-Fa-f]+);
	 |&\#X([0-9A-Fa-f]+);
	 |(&)/x' );

/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Allows some... latitude.
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 *
 * Capture groups, as consumed by Sanitizer::getTagAttributeCallback:
 *  1 - attribute name
 *  2 - the "= value" part, absent for a bare attribute
 *  3 - double-quoted value
 *  4 - single-quoted value
 *  5 - unquoted value
 *  6 - unquoted #XXXXXX colour value
 *
 * The pattern uses the x modifier, so each "#" comment inside it must be
 * terminated by a newline.
 */
$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
	"/(?:^|$space)($attrib+)
	  ($space*=$space*
		(?:
		 # The attribute value: quoted or alone
		  \"([^<\"]*)\"
		 | '([^<']*)'
		 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
		 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
		                     # colors are specified like this.
		                     # We'll be normalizing it.
		)
	   )?(?=$space|\$)/sx" );

/**
 * List of all named character entities defined in HTML 4.01
 * http://www.w3.org/TR/html4/sgml/entities.html
 *
 * Maps the entity name (case-sensitive) to its Unicode code point.
 * @private
 */
global $wgHtmlEntities;
$wgHtmlEntities = array(
	'Aacute'   => 193,
	'aacute'   => 225,
	'Acirc'    => 194,
	'acirc'    => 226,
	'acute'    => 180,
	'AElig'    => 198,
	'aelig'    => 230,
	'Agrave'   => 192,
	'agrave'   => 224,
	'alefsym'  => 8501,
	'Alpha'    => 913,
	'alpha'    => 945,
	'amp'      => 38,
	'and'      => 8743,
	'ang'      => 8736,
	'Aring'    => 197,
	'aring'    => 229,
	'asymp'    => 8776,
	'Atilde'   => 195,
	'atilde'   => 227,
	'Auml'     => 196,
	'auml'     => 228,
	'bdquo'    => 8222,
	'Beta'     => 914,
	'beta'     => 946,
	'brvbar'   => 166,
	'bull'     => 8226,
	'cap'      => 8745,
	'Ccedil'   => 199,
	'ccedil'   => 231,
	'cedil'    => 184,
	'cent'     => 162,
	'Chi'      => 935,
	'chi'      => 967,
	'circ'     => 710,
	'clubs'    => 9827,
	'cong'     => 8773,
	'copy'     => 169,
	'crarr'    => 8629,
	'cup'      => 8746,
	'curren'   => 164,
	'dagger'   => 8224,
	'Dagger'   => 8225,
	'darr'     => 8595,
	'dArr'     => 8659,
	'deg'      => 176,
	'Delta'    => 916,
	'delta'    => 948,
	'diams'    => 9830,
	'divide'   => 247,
	'Eacute'   => 201,
	'eacute'   => 233,
	'Ecirc'    => 202,
	'ecirc'    => 234,
	'Egrave'   => 200,
	'egrave'   => 232,
	'empty'    => 8709,
	'emsp'     => 8195,
	'ensp'     => 8194,
	'Epsilon'  => 917,
	'epsilon'  => 949,
	'equiv'    => 8801,
	'Eta'      => 919,
	'eta'      => 951,
	'ETH'      => 208,
	'eth'      => 240,
	'Euml'     => 203,
	'euml'     => 235,
	'euro'     => 8364,
	'exist'    => 8707,
	'fnof'     => 402,
	'forall'   => 8704,
	'frac12'   => 189,
	'frac14'   => 188,
	'frac34'   => 190,
	'frasl'    => 8260,
	'Gamma'    => 915,
	'gamma'    => 947,
	'ge'       => 8805,
	'gt'       => 62,
	'harr'     => 8596,
	'hArr'     => 8660,
	'hearts'   => 9829,
	'hellip'   => 8230,
	'Iacute'   => 205,
	'iacute'   => 237,
	'Icirc'    => 206,
	'icirc'    => 238,
	'iexcl'    => 161,
	'Igrave'   => 204,
	'igrave'   => 236,
	'image'    => 8465,
	'infin'    => 8734,
	'int'      => 8747,
	'Iota'     => 921,
	'iota'     => 953,
	'iquest'   => 191,
	'isin'     => 8712,
	'Iuml'     => 207,
	'iuml'     => 239,
	'Kappa'    => 922,
	'kappa'    => 954,
	'Lambda'   => 923,
	'lambda'   => 955,
	'lang'     => 9001,
	'laquo'    => 171,
	'larr'     => 8592,
	'lArr'     => 8656,
	'lceil'    => 8968,
	'ldquo'    => 8220,
	'le'       => 8804,
	'lfloor'   => 8970,
	'lowast'   => 8727,
	'loz'      => 9674,
	'lrm'      => 8206,
	'lsaquo'   => 8249,
	'lsquo'    => 8216,
	'lt'       => 60,
	'macr'     => 175,
	'mdash'    => 8212,
	'micro'    => 181,
	'middot'   => 183,
	'minus'    => 8722,
	'Mu'       => 924,
	'mu'       => 956,
	'nabla'    => 8711,
	'nbsp'     => 160,
	'ndash'    => 8211,
	'ne'       => 8800,
	'ni'       => 8715,
	'not'      => 172,
	'notin'    => 8713,
	'nsub'     => 8836,
	'Ntilde'   => 209,
	'ntilde'   => 241,
	'Nu'       => 925,
	'nu'       => 957,
	'Oacute'   => 211,
	'oacute'   => 243,
	'Ocirc'    => 212,
	'ocirc'    => 244,
	'OElig'    => 338,
	'oelig'    => 339,
	'Ograve'   => 210,
	'ograve'   => 242,
	'oline'    => 8254,
	'Omega'    => 937,
	'omega'    => 969,
	'Omicron'  => 927,
	'omicron'  => 959,
	'oplus'    => 8853,
	'or'       => 8744,
	'ordf'     => 170,
	'ordm'     => 186,
	'Oslash'   => 216,
	'oslash'   => 248,
	'Otilde'   => 213,
	'otilde'   => 245,
	'otimes'   => 8855,
	'Ouml'     => 214,
	'ouml'     => 246,
	'para'     => 182,
	'part'     => 8706,
	'permil'   => 8240,
	'perp'     => 8869,
	'Phi'      => 934,
	'phi'      => 966,
	'Pi'       => 928,
	'pi'       => 960,
	'piv'      => 982,
	'plusmn'   => 177,
	'pound'    => 163,
	'prime'    => 8242,
	'Prime'    => 8243,
	'prod'     => 8719,
	'prop'     => 8733,
	'Psi'      => 936,
	'psi'      => 968,
	'quot'     => 34,
	'radic'    => 8730,
	'rang'     => 9002,
	'raquo'    => 187,
	'rarr'     => 8594,
	'rArr'     => 8658,
	'rceil'    => 8969,
	'rdquo'    => 8221,
	'real'     => 8476,
	'reg'      => 174,
	'rfloor'   => 8971,
	'Rho'      => 929,
	'rho'      => 961,
	'rlm'      => 8207,
	'rsaquo'   => 8250,
	'rsquo'    => 8217,
	'sbquo'    => 8218,
	'Scaron'   => 352,
	'scaron'   => 353,
	'sdot'     => 8901,
	'sect'     => 167,
	'shy'      => 173,
	'Sigma'    => 931,
	'sigma'    => 963,
	'sigmaf'   => 962,
	'sim'      => 8764,
	'spades'   => 9824,
	'sub'      => 8834,
	'sube'     => 8838,
	'sum'      => 8721,
	'sup'      => 8835,
	'sup1'     => 185,
	'sup2'     => 178,
	'sup3'     => 179,
	'supe'     => 8839,
	'szlig'    => 223,
	'Tau'      => 932,
	'tau'      => 964,
	'there4'   => 8756,
	'Theta'    => 920,
	'theta'    => 952,
	'thetasym' => 977,
	'thinsp'   => 8201,
	'THORN'    => 222,
	'thorn'    => 254,
	'tilde'    => 732,
	'times'    => 215,
	'trade'    => 8482,
	'Uacute'   => 218,
	'uacute'   => 250,
	'uarr'     => 8593,
	'uArr'     => 8657,
	'Ucirc'    => 219,
	'ucirc'    => 251,
	'Ugrave'   => 217,
	'ugrave'   => 249,
	'uml'      => 168,
	'upsih'    => 978,
	'Upsilon'  => 933,
	'upsilon'  => 965,
	'Uuml'     => 220,
	'uuml'     => 252,
	'weierp'   => 8472,
	'Xi'       => 926,
	'xi'       => 958,
	'Yacute'   => 221,
	'yacute'   => 253,
	'yen'      => 165,
	'Yuml'     => 376,
	'yuml'     => 255,
	'Zeta'     => 918,
	'zeta'     => 950,
	'zwj'      => 8205,
	'zwnj'     => 8204 );

/** @package MediaWiki */
class Sanitizer {
	/**
	 * Cleans up HTML, removes dangerous tags and attributes, and
	 * removes HTML comments
	 *
	 * The text is split on '<'. Every piece that parses as a tag whose name is
	 * in the allowed-element lists is re-emitted with its attributes passed
	 * through Sanitizer::fixTagAttributes; everything else is emitted as text
	 * with '<' and '>' escaped. When $wgUseTidy is off, a tag stack is kept so
	 * that mis-nested closing tags are escaped and tags still open at the end
	 * are closed. When $wgUserHtml is off, the allowed-element lists are empty
	 * and every tag is escaped.
	 *
	 * @private
	 * @param string $text
	 * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
	 * @param array $args for the processing callback
	 * @return string
	 */
	static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
		global $wgUseTidy, $wgUserHtml;
		# The profiling label still says "Parser"; it is left alone so that
		# existing profiling data keeps its key.
		$fname = 'Parser::removeHTMLtags';
		wfProfileIn( $fname );

		if( $wgUserHtml ) {
			$htmlpairs = array( # Tags that must be closed
				'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
				'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
				'strike', 'strong', 'tt', 'var', 'div', 'center',
				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
				'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
			);
			$htmlsingle = array(
				'br', 'hr', 'li', 'dt', 'dd'
			);
			$htmlsingleonly = array( # Elements that cannot have close tags
				'br', 'hr'
			);
			$htmlnest = array( # Tags that can be nested--??
				'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
				'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
			);
			$tabletags = array( # Can only appear inside table
				'td', 'th', 'tr',
			);
			$htmllist = array( # Tags used by list
				'ul','ol',
			);
			$listtags = array( # Tags that can appear in a list
				'li',
			);

		} else {
			# Every list is defined here too, so that nothing below reads an
			# undefined variable when user HTML is disabled.
			$htmlpairs = array();
			$htmlsingle = array();
			$htmlsingleonly = array();
			$htmlnest = array();
			$tabletags = array();
			$htmllist = array();
			$listtags = array();
		}

		$htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
		$htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );

		# Optional slash, tag name, attributes, closing brace, trailing text
		$tagRegex = '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/';

		# Remove HTML comments
		$text = Sanitizer::removeHTMLcomments( $text );
		$bits = explode( '<', $text );
		$text = array_shift( $bits );
		if(!$wgUseTidy) {
			$tagstack = array(); $tablestack = array();
			foreach ( $bits as $x ) {
				$regs = array();
				if ( !preg_match( $tagRegex, $x, $regs ) ) {
					# Does not look like a tag at all: emit it as escaped text
					$text .= '&lt;' . str_replace( '>', '&gt;', $x );
					continue;
				}
				list( , $slash, $t, $params, $brace, $rest ) = $regs;

				$badtag = 0 ;
				if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
					# Check our stack
					if ( $slash ) {
						# Closing a tag...
						if( in_array( $t, $htmlsingleonly ) ) {
							$badtag = 1;
						} elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
							if ( in_array($ot, $htmlsingleallowed) ) {
								# Pop all elements with an optional close tag
								# and see if we find a match below them
								$optstack = array();
								array_push ($optstack, $ot);
								while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
										in_array($ot, $htmlsingleallowed) ) {
									array_push ($optstack, $ot);
								}
								if ( $t != $ot ) {
									# No match. Push the optional elements back again
									$badtag = 1;
									while ( $ot = @array_pop( $optstack ) ) {
										array_push( $tagstack, $ot );
									}
								}
							} else {
								@array_push( $tagstack, $ot );
								# <li> can be nested in <ul> or <ol>, skip those cases:
								if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) {
									$badtag = 1;
								}
							}
						} else {
							if ( $t == 'table' ) {
								# Leaving a table: restore the stack saved when it was opened
								$tagstack = array_pop( $tablestack );
							}
						}
						$newparams = '';
					} else {
						# Keep track for later
						if ( in_array( $t, $tabletags ) &&
						! in_array( 'table', $tagstack ) ) {
							$badtag = 1;
						} else if ( in_array( $t, $tagstack ) &&
						! in_array ( $t , $htmlnest ) ) {
							$badtag = 1 ;
						# Is it a self closed htmlpair ? (bug 5487)
						} else if( $brace == '/>' &&
						in_array($t, $htmlpairs) ) {
							$badtag = 1;
						} elseif( in_array( $t, $htmlsingleonly ) ) {
							# Hack to force empty tag for uncloseable elements
							$brace = '/>';
						} else if( in_array( $t, $htmlsingle ) ) {
							# Hack to not close $htmlsingle tags
							$brace = NULL;
						} else {
							if ( $t == 'table' ) {
								# Entering a table: save the current stack and start a fresh one
								array_push( $tablestack, $tagstack );
								$tagstack = array();
							}
							array_push( $tagstack, $t );
						}

						# Replace any variables or template parameters with
						# plaintext results.
						if( is_callable( $processCallback ) ) {
							call_user_func_array( $processCallback, array( &$params, $args ) );
						}

						# Strip non-approved attributes from the tag
						$newparams = Sanitizer::fixTagAttributes( $params, $t );
					}
					if ( ! $badtag ) {
						$rest = str_replace( '>', '&gt;', $rest );
						$close = ( $brace == '/>' ) ? ' /' : '';
						$text .= "<$slash$t$newparams$close>$rest";
						continue;
					}
				}
				# Disallowed or badly nested tag: emit it as escaped text
				$text .= '&lt;' . str_replace( '>', '&gt;', $x);
			}
			# Close off any remaining tags
			while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
				$text .= "</$t>\n";
				if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
			}
		} else {
			# this might be possible using tidy itself
			foreach ( $bits as $x ) {
				$regs = array();
				$isTag = preg_match( $tagRegex, $x, $regs );
				if ( $isTag ) {
					list( , $slash, $t, $params, $brace, $rest ) = $regs;
					$t = strtolower( $t );
				}
				if ( $isTag && in_array( $t, $htmlelements ) ) {
					if( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t$newparams$brace$rest";
				} else {
					$text .= '&lt;' . str_replace( '>', '&gt;', $x);
				}
			}
		}
		wfProfileOut( $fname );
		return $text;
	}

	/**
	 * Remove '<!--', '-->', and everything between.
	 * To avoid leaving blank lines, when a comment is both preceded
	 * and followed by a newline (ignoring spaces), trim leading and
	 * trailing spaces and one of the newlines.
| | 499 | | * | | 500 | | * @private | | 501 | | * @param string $text | | 502 | | * @return string | | 503 | | */ | | 504 | | function removeHTMLcomments( $text ) { | | 505 | 1 | $fname='Parser::removeHTMLcomments'; | | 506 | 1 | wfProfileIn( $fname ); | | 507 | 1 | while (($start = strpos($text, '<!--')) !== false) { | | 508 | 1 | $end = strpos($text, '-->', $start + 4); | | 509 | 1 | if ($end === false) { | | 510 | | # Unterminated comment; bail out | | 511 | | break; | | 512 | | } | | 513 | | | | 514 | 1 | $end += 3; | | 515 | | | | 516 | | # Trim space and newline if the comment is both | | 517 | | # preceded and followed by a newline | | 518 | 1 | $spaceStart = max($start - 1, 0); | | 519 | 1 | $spaceLen = $end - $spaceStart; | | 520 | 1 | while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) { | | 521 | 1 | $spaceStart--; | | 522 | 1 | $spaceLen++; | | 523 | | } | | 524 | 1 | while (substr($text, $spaceStart + $spaceLen, 1) === ' ') | | 525 | 1 | $spaceLen++; | | 526 | 1 | if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") { | | 527 | | # Remove the comment, leading and trailing | | 528 | | # spaces, and leave only one newline. | | 529 | 1 | $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1); | | 530 | | } | | 531 | | else { | | 532 | | # Remove just the comment. | | 533 | 1 | $text = substr_replace($text, '', $start, $end - $start); | | 534 | | } | | 535 | | } | | 536 | 1 | wfProfileOut( $fname ); | | 537 | 1 | return $text; | | 538 | | } | | 539 | | | | 540 | | /** | | 541 | | * Take an array of attribute names and values and normalize or discard | | 542 | | * illegal values for the given element type. 
| | 543 | | * | | 544 | | * - Discards attributes not on a whitelist for the given element | | 545 | | * - Unsafe style attributes are discarded | | 546 | | * | | 547 | | * @param array $attribs | | 548 | | * @param string $element | | 549 | | * @return array | | 550 | | * | | 551 | | * @todo Check for legal values where the DTD limits things. | | 552 | | * @todo Check for unique id attribute :P | | 553 | | */ | | 554 | | function validateTagAttributes( $attribs, $element ) { | | 555 | 1 | $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) ); | | 556 | | $out = array(); | | 557 | 1 | foreach( $attribs as $attribute => $value ) { | | 558 | 1 | if( !isset( $whitelist[$attribute] ) ) { | | 559 | 1 | continue; | | 560 | | } | | 561 | | # Strip javascript "expression" from stylesheets. | | 562 | | # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp | | 563 | 1 | if( $attribute == 'style' ) { | | 564 | 1 | $value = Sanitizer::checkCss( $value ); | | 565 | 1 | if( $value === false ) { | | 566 | | # haxx0r | | 567 | 1 | continue; | | 568 | | } | | 569 | | } | | 570 | | | | 571 | 1 | if ( $attribute === 'id' ) | | 572 | 1 | $value = Sanitizer::escapeId( $value ); | | 573 | | | | 574 | | // If this attribute was previously set, override it. | | 575 | | // Output should only have one attribute of each name. | | 576 | 1 | $out[$attribute] = $value; | | 577 | | } | | 578 | 1 | return $out; | | 579 | | } | | 580 | | | | 581 | | /** | | 582 | | * Pick apart some CSS and check it for forbidden or unsafe structures. | | 583 | | * Returns a sanitized string, or false if it was just too evil. | | 584 | | * | | 585 | | * Currently URL references, 'expression', 'tps' are forbidden. 
| | 586 | | * | | 587 | | * @param string $value | | 588 | | * @return mixed | | 589 | | */ | | 590 | | static function checkCss( $value ) { | | 591 | 1 | $stripped = Sanitizer::decodeCharReferences( $value ); | | 592 | | | | 593 | | // Remove any comments; IE gets token splitting wrong | | 594 | 1 | $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped ); | | 595 | 1 | $value = $stripped; | | 596 | | | | 597 | | // ... and continue checks | | 598 | 1 | $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e', | | 599 | 1 | 'codepointToUtf8(hexdec("$1"))', $stripped ); | | 600 | 1 | $stripped = str_replace( '\\', '', $stripped ); | | 601 | 1 | if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is', | | 602 | 1 | $stripped ) ) { | | 603 | | # haxx0r | | 604 | 1 | return false; | | 605 | | } | | 606 | | | | 607 | 1 | return $value; | | 608 | | } | | 609 | | | | 610 | | /** | | 611 | | * Take a tag soup fragment listing an HTML element's attributes | | 612 | | * and normalize it to well-formed XML, discarding unwanted attributes. | | 613 | | * Output is safe for further wikitext processing, with escaping of | | 614 | | * values that could trigger problems. | | 615 | | * | | 616 | | * - Normalizes attribute names to lowercase | | 617 | | * - Discards attributes not on a whitelist for the given element | | 618 | | * - Turns broken or invalid entities into plaintext | | 619 | | * - Double-quotes all attribute values | | 620 | | * - Attributes without values are given the name as attribute | | 621 | | * - Double attributes are discarded | | 622 | | * - Unsafe style attributes are discarded | | 623 | | * - Prepends space if there are attributes. 
| | 624 | | * | | 625 | | * @param string $text | | 626 | | * @param string $element | | 627 | | * @return string | | 628 | | */ | | 629 | | function fixTagAttributes( $text, $element ) { | | 630 | 1 | if( trim( $text ) == '' ) { | | 631 | 1 | return ''; | | 632 | | } | | 633 | | | | 634 | 1 | $stripped = Sanitizer::validateTagAttributes( | | 635 | 1 | Sanitizer::decodeTagAttributes( $text ), $element ); | | 636 | | | | 637 | | $attribs = array(); | | 638 | 1 | foreach( $stripped as $attribute => $value ) { | | 639 | 1 | $encAttribute = htmlspecialchars( $attribute ); | | 640 | 1 | $encValue = Sanitizer::safeEncodeAttribute( $value ); | | 641 | | | | 642 | 1 | $attribs[] = "$encAttribute=\"$encValue\""; | | 643 | | } | | 644 | 1 | return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : ''; | | 645 | | } | | 646 | | | | 647 | | /** | | 648 | | * Encode an attribute value for HTML output. | | 649 | | * @param $text | | 650 | | * @return HTML-encoded text fragment | | 651 | | */ | | 652 | | function encodeAttribute( $text ) { | | 653 | 1 | $encValue = htmlspecialchars( $text ); | | 654 | | | | 655 | | // Whitespace is normalized during attribute decoding, | | 656 | | // so if we've been passed non-spaces we must encode them | | 657 | | // ahead of time or they won't be preserved. | | 658 | 1 | $encValue = strtr( $encValue, array( | | 659 | | "\n" => ' ', | | 660 | | "\r" => ' ', | | 661 | | "\t" => '	', | | 662 | 1 | ) ); | | 663 | | | | 664 | 1 | return $encValue; | | 665 | | } | | 666 | | | | 667 | | /** | | 668 | | * Encode an attribute value for HTML tags, with extra armoring | | 669 | | * against further wiki processing. | | 670 | | * @param $text | | 671 | | * @return HTML-encoded text fragment | | 672 | | */ | | 673 | | function safeEncodeAttribute( $text ) { | | 674 | 1 | $encValue = Sanitizer::encodeAttribute( $text ); | | 675 | | | | 676 | | # Templates and links may be expanded in later parsing, | | 677 | | # creating invalid or dangerous output. 
Suppress this. | | 678 | 1 | $encValue = strtr( $encValue, array( | | 679 | | '<' => '<', // This should never happen, | | 680 | | '>' => '>', // we've received invalid input | | 681 | | '"' => '"', // which should have been escaped. | | 682 | | '{' => '{', | | 683 | | '[' => '[', | | 684 | | "''" => '''', | | 685 | | 'ISBN' => 'ISBN', | | 686 | | 'RFC' => 'RFC', | | 687 | | 'PMID' => 'PMID', | | 688 | | '|' => '|', | | 689 | | '__' => '__', | | 690 | 1 | ) ); | | 691 | | | | 692 | | # Stupid hack | | 693 | 1 | $encValue = preg_replace_callback( | | 694 | 1 | '/(' . wfUrlProtocols() . ')/', | | 695 | | array( 'Sanitizer', 'armorLinksCallback' ), | | 696 | 1 | $encValue ); | | 697 | 1 | return $encValue; | | 698 | | } | | 699 | | | | 700 | | /** | | 701 | | * Given a value escape it so that it can be used in an id attribute and | | 702 | | * return it, this does not validate the value however (see first link) | | 703 | | * | | 704 | | * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters | | 705 | | * in the id and | | 706 | | * name attributes | | 707 | | * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute | | 708 | | * | | 709 | | * @bug 4461 | | 710 | | * | | 711 | | * @static | | 712 | | * | | 713 | | * @param string $id | | 714 | | * @return string | | 715 | | */ | | 716 | | function escapeId( $id ) { | | 717 | | static $replace = array( | | 718 | | '%3A' => ':', | | 719 | | '%' => '.' | | 720 | 1 | ); | | 721 | | | | 722 | 1 | $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) ); | | 723 | | | | 724 | 1 | return str_replace( array_keys( $replace ), array_values( $replace ), $id ); | | 725 | | } | | 726 | | | | 727 | | /** | | 728 | | * Regex replace callback for armoring links against further processing. 
| | 729 | | * @param array $matches | | 730 | | * @return string | | 731 | | * @private | | 732 | | */ | | 733 | | function armorLinksCallback( $matches ) { | | 734 | 1 | return str_replace( ':', ':', $matches[1] ); | | 735 | | } | | 736 | | | | 737 | | /** | | 738 | | * Return an associative array of attribute names and values from | | 739 | | * a partial tag string. Attribute names are forces to lowercase, | | 740 | | * character references are decoded to UTF-8 text. | | 741 | | * | | 742 | | * @param string | | 743 | | * @return array | | 744 | | */ | | 745 | | function decodeTagAttributes( $text ) { | | 746 | | $attribs = array(); | | 747 | | | | 748 | 1 | if( trim( $text ) == '' ) { | | 749 | 1 | return $attribs; | | 750 | | } | | 751 | | | | 752 | | $pairs = array(); | | 753 | 1 | if( !preg_match_all( | | 754 | | MW_ATTRIBS_REGEX, | | 755 | | $text, | | 756 | | $pairs, | | 757 | 1 | PREG_SET_ORDER ) ) { | | 758 | 1 | return $attribs; | | 759 | | } | | 760 | | | | 761 | 1 | foreach( $pairs as $set ) { | | 762 | 1 | $attribute = strtolower( $set[1] ); | | 763 | 1 | $value = Sanitizer::getTagAttributeCallback( $set ); | | 764 | | | | 765 | | // Normalize whitespace | | 766 | 1 | $value = preg_replace( '/[\t\r\n ]+/', ' ', $value ); | | 767 | 1 | $value = trim( $value ); | | 768 | | | | 769 | | // Decode character references | | 770 | 1 | $attribs[$attribute] = Sanitizer::decodeCharReferences( $value ); | | 771 | | } | | 772 | 1 | return $attribs; | | 773 | | } | | 774 | | | | 775 | | /** | | 776 | | * Pick the appropriate attribute value from a match set from the | | 777 | | * MW_ATTRIBS_REGEX matches. | | 778 | | * | | 779 | | * @param array $set | | 780 | | * @return string | | 781 | | * @private | | 782 | | */ | | 783 | | function getTagAttributeCallback( $set ) { | | 784 | 1 | if( isset( $set[6] ) ) { | | 785 | | # Illegal #XXXXXX color with no quotes. | | 786 | | return $set[6]; | | 787 | 1 | } elseif( isset( $set[5] ) ) { | | 788 | | # No quotes. 
| | 789 | 1 | return $set[5]; | | 790 | 1 | } elseif( isset( $set[4] ) ) { | | 791 | | # Single-quoted | | 792 | 1 | return $set[4]; | | 793 | 1 | } elseif( isset( $set[3] ) ) { | | 794 | | # Double-quoted | | 795 | 1 | return $set[3]; | | 796 | 1 | } elseif( !isset( $set[2] ) ) { | | 797 | | # In XHTML, attributes must have a value. | | 798 | | # For 'reduced' form, return explicitly the attribute name here. | | 799 | 1 | return $set[1]; | | 800 | | } else { | | 801 | | throw new MWException( "Tag conditions not met. This should never happen and is a bug." ); | | 802 | | } | | 803 | | } | | 804 | | | | 805 | | /** | | 806 | | * Normalize whitespace and character references in an XML source- | | 807 | | * encoded text for an attribute value. | | 808 | | * | | 809 | | * See http://www.w3.org/TR/REC-xml/#AVNormalize for background, | | 810 | | * but note that we're not returning the value, but are returning | | 811 | | * XML source fragments that will be slapped into output. | | 812 | | * | | 813 | | * @param string $text | | 814 | | * @return string | | 815 | | * @private | | 816 | | */ | | 817 | | function normalizeAttributeValue( $text ) { | | 818 | 1 | return str_replace( '"', '"', | | 819 | | preg_replace( | | 820 | | '/\r\n|[\x20\x0d\x0a\x09]/', | | 821 | | ' ', | | 822 | 1 | Sanitizer::normalizeCharReferences( $text ) ) ); | | 823 | | } | | 824 | | | | 825 | | /** | | 826 | | * Ensure that any entities and character references are legal | | 827 | | * for XML and XHTML specifically. Any stray bits will be | | 828 | | * &-escaped to result in a valid text fragment. | | 829 | | * | | 830 | | * a. any named char refs must be known in XHTML | | 831 | | * b. any numeric char refs must be legal chars, not invalid or forbidden | | 832 | | * c. use &#x, not &#X | | 833 | | * d. 
fix or reject non-valid attributes | | 834 | | * | | 835 | | * @param string $text | | 836 | | * @return string | | 837 | | * @private | | 838 | | */ | | 839 | | function normalizeCharReferences( $text ) { | | 840 | 1 | return preg_replace_callback( | | 841 | | MW_CHAR_REFS_REGEX, | | 842 | | array( 'Sanitizer', 'normalizeCharReferencesCallback' ), | | 843 | 1 | $text ); | | 844 | | } | | 845 | | /** | | 846 | | * @param string $matches | | 847 | | * @return string | | 848 | | */ | | 849 | | function normalizeCharReferencesCallback( $matches ) { | | 850 | 1 | $ret = null; | | 851 | 1 | if( $matches[1] != '' ) { | | 852 | 1 | $ret = Sanitizer::normalizeEntity( $matches[1] ); | | 853 | 1 | } elseif( $matches[2] != '' ) { | | 854 | 1 | $ret = Sanitizer::decCharReference( $matches[2] ); | | 855 | 1 | } elseif( $matches[3] != '' ) { | | 856 | | $ret = Sanitizer::hexCharReference( $matches[3] ); | | 857 | 1 | } elseif( $matches[4] != '' ) { | | 858 | | $ret = Sanitizer::hexCharReference( $matches[4] ); | | 859 | | } | | 860 | 1 | if( is_null( $ret ) ) { | | 861 | 1 | return htmlspecialchars( $matches[0] ); | | 862 | | } else { | | 863 | 1 | return $ret; | | 864 | | } | | 865 | | } | | 866 | | | | 867 | | /** | | 868 | | * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, | | 869 | | * return the named entity reference as is. 
Otherwise, returns | | 870 | | * HTML-escaped text of pseudo-entity source (eg &foo;) | | 871 | | * | | 872 | | * @param string $name | | 873 | | * @return string | | 874 | | */ | | 875 | | function normalizeEntity( $name ) { | | 876 | 1 | global $wgHtmlEntities; | | 877 | 1 | if( isset( $wgHtmlEntities[$name] ) ) { | | 878 | 1 | return "&$name;"; | | 879 | | } else { | | 880 | 1 | return "&$name;"; | | 881 | | } | | 882 | | } | | 883 | | | | 884 | | function decCharReference( $codepoint ) { | | 885 | 1 | $point = intval( $codepoint ); | | 886 | 1 | if( Sanitizer::validateCodepoint( $point ) ) { | | 887 | 1 | return sprintf( '&#%d;', $point ); | | 888 | | } else { | | 889 | | return null; | | 890 | | } | | 891 | | } | | 892 | | | | 893 | | function hexCharReference( $codepoint ) { | | 894 | | $point = hexdec( $codepoint ); | | 895 | | if( Sanitizer::validateCodepoint( $point ) ) { | | 896 | | return sprintf( '&#x%x;', $point ); | | 897 | | } else { | | 898 | | return null; | | 899 | | } | | 900 | | } | | 901 | | | | 902 | | /** | | 903 | | * Returns true if a given Unicode codepoint is a valid character in XML. | | 904 | | * @param int $codepoint | | 905 | | * @return bool | | 906 | | */ | | 907 | | function validateCodepoint( $codepoint ) { | | 908 | 1 | return ($codepoint == 0x09) | | 909 | 1 | || ($codepoint == 0x0a) | | 910 | 1 | || ($codepoint == 0x0d) | | 911 | 1 | || ($codepoint >= 0x20 && $codepoint <= 0xd7ff) | | 912 | | || ($codepoint >= 0xe000 && $codepoint <= 0xfffd) | | 913 | | || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff); | | 914 | | } | | 915 | | | | 916 | | /** | | 917 | | * Decode any character references, numeric or named entities, | | 918 | | * in the text and return a UTF-8 string. 
*
     * @param string $text
     * @return string
     * @public
     */
    function decodeCharReferences( $text ) {
        return preg_replace_callback(
            MW_CHAR_REFS_REGEX,
            array( 'Sanitizer', 'decodeCharReferencesCallback' ),
            $text );
    }

    /**
     * preg_replace_callback() handler used by decodeCharReferences().
     *
     * Capture groups of MW_CHAR_REFS_REGEX: 1 is a named entity, 2 a
     * decimal reference, 3 a "&#x" hex reference, 4 a "&#X" hex reference.
     * A bare ampersand sets none of them and is returned unchanged.
     *
     * @param array $matches
     * @return string
     */
    function decodeCharReferencesCallback( $matches ) {
        if( $matches[1] != '' ) {
            return Sanitizer::decodeEntity( $matches[1] );
        } elseif( $matches[2] != '' ) {
            return Sanitizer::decodeChar( intval( $matches[2] ) );
        } elseif( $matches[3] != '' ) {
            $hex = $matches[3];
        } elseif( $matches[4] != '' ) {
            $hex = $matches[4];
        } else {
            # Last case should be an ampersand by itself
            return $matches[0];
        }

        # The regex lets any ASCII letter through and hexdec() silently
        # skips non-hex characters, so "2g0" would otherwise decode as
        # U+0020. Treat malformed hex like any other invalid reference.
        if( !preg_match( '/^[0-9A-Fa-f]+$/D', $hex ) ) {
            return UTF8_REPLACEMENT;
        }
        return Sanitizer::decodeChar( hexdec( $hex ) );
    }

    /**
     * Return UTF-8 string for a codepoint if that is a valid
     * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
     * @param int $codepoint
     * @return string
     * @private
     */
    function decodeChar( $codepoint ) {
        if( Sanitizer::validateCodepoint( $codepoint ) ) {
            return codepointToUtf8( $codepoint );
        } else {
            return UTF8_REPLACEMENT;
        }
    }

    /**
     * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
     * return the UTF-8 encoding of that character.
Otherwise, returns
     * pseudo-entity source (eg &foo;)
     *
     * @param string $name
     * @return string
     */
    function decodeEntity( $name ) {
        global $wgHtmlEntities;
        if( isset( $wgHtmlEntities[$name] ) ) {
            return codepointToUtf8( $wgHtmlEntities[$name] );
        } else {
            return "&$name;";
        }
    }

    /**
     * Fetch the whitelist of acceptable attributes for a given
     * element name.
     *
     * The full table is built once by setupAttributeWhitelist() and kept
     * in a static variable; elements missing from it get an empty list.
     *
     * @param string $element
     * @return array
     */
    function attributeWhitelist( $element ) {
        static $list;
        if( !isset( $list ) ) {
            $list = Sanitizer::setupAttributeWhitelist();
        }
        return isset( $list[$element] )
            ? $list[$element]
            : array();
    }

    /**
     * Build the map of element name => list of attribute names
     * accepted on that element.
     *
     * @return array
     */
    function setupAttributeWhitelist() {
        $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
        $block = array_merge( $common, array( 'align' ) );
        $tablealign = array( 'align', 'char', 'charoff', 'valign' );
        $tablecell = array( 'abbr',
                            'axis',
                            'headers',
                            'scope',
                            'rowspan',
                            'colspan',
                            'nowrap', # deprecated
                            'width',  # deprecated
                            'height', # deprecated
                            'bgcolor' # deprecated
                            );
        # ins and del accept exactly the same attributes
        $edit = array_merge( $common, array( 'cite', 'datetime' ) );

        # Numbers refer to sections in HTML 4.01 standard describing the element.
        # See: http://www.w3.org/TR/html4/
        $whitelist = array (
            # 7.5.4
            'div'        => $block,
            'center'     => $common, # deprecated
            'span'       => $block, # ??

            # 7.5.5
            'h1'         => $block,
            'h2'         => $block,
            'h3'         => $block,
            'h4'         => $block,
            'h5'         => $block,
            'h6'         => $block,

            # 7.5.6
            # address

            # 8.2.4
            # bdo

            # 9.2.1
            'em'         => $common,
            'strong'     => $common,
            'cite'       => $common,
            # dfn
            'code'       => $common,
            # samp
            # kbd
            'var'        => $common,
            # abbr
            # acronym

            # 9.2.2
            'blockquote' => array_merge( $common, array( 'cite' ) ),
            # q

            # 9.2.3
            'sub'        => $common,
            'sup'        => $common,

            # 9.3.1
            'p'          => $block,

            # 9.3.2
            'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

            # 9.3.4
            'pre'        => array_merge( $common, array( 'width' ) ),

            # 9.4
            'ins'        => $edit,
            'del'        => $edit,

            # 10.2
            'ul'         => array_merge( $common, array( 'type' ) ),
            'ol'         => array_merge( $common, array( 'type', 'start' ) ),
            'li'         => array_merge( $common, array( 'type', 'value' ) ),

            # 10.3
            'dl'         => $common,
            'dd'         => $common,
            'dt'         => $common,

            # 11.2.1
            # Each attribute listed once; 'frame', 'rules' and 'border'
            # used to be repeated at the end of this list.
            'table'      => array_merge( $common,
                                array( 'summary', 'width', 'border', 'frame',
                                       'rules', 'cellspacing', 'cellpadding',
                                       'align', 'bgcolor' ) ),

            # 11.2.2
            'caption'    => array_merge( $common, array( 'align' ) ),

            # 11.2.3
            'thead'      => array_merge( $common, $tablealign ),
            'tfoot'      => array_merge( $common, $tablealign ),
            'tbody'      => array_merge( $common, $tablealign ),

            # 11.2.4
            'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
            'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),

            # 11.2.5
            'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

            # 11.2.6
            'td'         => array_merge( $common, $tablecell, $tablealign ),
            'th'         => array_merge( $common, $tablecell, $tablealign ),

            # 15.2.1
            'tt'         => $common,
            'b'          => $common,
            'i'          => $common,
            'big'        => $common,
            'small'      => $common,
            'strike'     => $common,
            's'          => $common,
            'u'          => $common,

            # 15.2.2
            'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
            # basefont

            # 15.3
            'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

            # XHTML Ruby annotation text module, simple ruby only.
            # http://www.w3c.org/TR/ruby/
            'ruby'       => $common,
            # rbc
            # rtc
            'rb'         => $common,
            'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
            'rp'         => $common,
            );
        return $whitelist;
    }

    /**
     * Take a fragment of (potentially invalid) HTML and return
     * a version with any tags removed, encoded suitably for literal
     * inclusion in an attribute value.
*
     * @param string $text HTML fragment
     * @return string
     */
    function stripAllTags( $text ) {
        # Actual <tags>
        $text = preg_replace( '/ < .*? > /x', '', $text );

        # Normalize &entities and whitespace
        $text = Sanitizer::normalizeAttributeValue( $text );

        # Will be placed into "double-quoted" attributes,
        # make sure remaining bits are safe: a leftover angle bracket or
        # double quote must be written as an entity, otherwise it could
        # close the attribute or open a tag.
        $text = str_replace(
            array( '<', '>', '"' ),
            array( '&lt;', '&gt;', '&quot;' ),
            $text );

        return $text;
    }

    /**
     * Hack up a private DOCTYPE with HTML's standard entity declarations.
     * PHP 4 seemed to know these if you gave it an HTML doctype, but
     * PHP 5.1 doesn't.
     *
     * Use for passing XHTML fragments to PHP's XML parsing functions
     *
     * Emits one <!ENTITY> declaration per entry of $wgHtmlEntities,
     * mapping the entity name to its numeric character reference.
     *
     * @return string
     * @static
     */
    function hackDocType() {
        global $wgHtmlEntities;
        $out = "<!DOCTYPE html [\n";
        foreach( $wgHtmlEntities as $entity => $codepoint ) {
            $out .= "<!ENTITY $entity \"&#$codepoint;\">";
        }
        $out .= "]>\n";
        return $out;
    }

}

?>
|