Spike PHPCoverage Details: Sanitizer.php

Line #	Frequency	Source Line
1		`<?php`
2		`/**`
3		`* XHTML sanitizer for MediaWiki`
4		`*`
5		`* Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al`
6		`* http://www.mediawiki.org/`
7		`*`
8		`* This program is free software; you can redistribute it and/or modify`
9		`* it under the terms of the GNU General Public License as published by`
10		`* the Free Software Foundation; either version 2 of the License, or`
11		`* (at your option) any later version.`
12		`*`
13		`* This program is distributed in the hope that it will be useful,`
14		`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
15		`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
16		`* GNU General Public License for more details.`
17		`*`
18		`* You should have received a copy of the GNU General Public License along`
19		`* with this program; if not, write to the Free Software Foundation, Inc.,`
20		`* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.`
21		`* http://www.gnu.org/copyleft/gpl.html`
22		`*`
23		`* @package MediaWiki`
24		`* @subpackage Parser`
25		`*/`
26
27		`/**`
28		`* Regular expression to match various types of character references in`
29		`* Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences`
30		`*/`
31	1	`define( 'MW_CHAR_REFS_REGEX',`
32		`'/&([A-Za-z0-9]+);`
33		`\|&\#([0-9]+);`
34		`\|&\#x([0-9A-Za-z]+);`
35		`\|&\#X([0-9A-Za-z]+);`
36	1	`\|(&)/x' );`
37
38		`/**`
39		`* Regular expression to match HTML/XML attribute pairs within a tag.`
40		`* Allows some... latitude.`
41		`* Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes`
42		`*/`
43	1	`$attrib = '[A-Za-z0-9]';`
44	1	`$space = '[\x09\x0a\x0d\x20]';`
45	1	`define( 'MW_ATTRIBS_REGEX',`
46		`"/(?:^\|$space)($attrib+)`
47		`($space*=$space*`
48		`(?:`
49		`# The attribute value: quoted or alone`
50		`\"([^<\"]*)\"`
51		`\| '([^<']*)'`
52		\| ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{\|}~]+)
53		`\| (\#[0-9a-fA-F]+) # Technically wrong, but lots of`
54		`# colors are specified like this.`
55		`# We'll be normalizing it.`
56		`)`
57	1	`)?(?=$space\|\$)/sx" );`
58
59		`/**`
60		`* List of all named character entities defined in HTML 4.01`
61		`* http://www.w3.org/TR/html4/sgml/entities.html`
62		`* @private`
63		`*/`
64	1	`global $wgHtmlEntities;`
65		`$wgHtmlEntities = array(`
66	1	`'Aacute' => 193,`
67		`'aacute' => 225,`
68		`'Acirc' => 194,`
69		`'acirc' => 226,`
70		`'acute' => 180,`
71		`'AElig' => 198,`
72		`'aelig' => 230,`
73		`'Agrave' => 192,`
74		`'agrave' => 224,`
75		`'alefsym' => 8501,`
76		`'Alpha' => 913,`
77		`'alpha' => 945,`
78		`'amp' => 38,`
79		`'and' => 8743,`
80		`'ang' => 8736,`
81		`'Aring' => 197,`
82		`'aring' => 229,`
83		`'asymp' => 8776,`
84		`'Atilde' => 195,`
85		`'atilde' => 227,`
86		`'Auml' => 196,`
87		`'auml' => 228,`
88		`'bdquo' => 8222,`
89		`'Beta' => 914,`
90		`'beta' => 946,`
91		`'brvbar' => 166,`
92		`'bull' => 8226,`
93		`'cap' => 8745,`
94		`'Ccedil' => 199,`
95		`'ccedil' => 231,`
96		`'cedil' => 184,`
97		`'cent' => 162,`
98		`'Chi' => 935,`
99		`'chi' => 967,`
100		`'circ' => 710,`
101		`'clubs' => 9827,`
102		`'cong' => 8773,`
103		`'copy' => 169,`
104		`'crarr' => 8629,`
105		`'cup' => 8746,`
106		`'curren' => 164,`
107		`'dagger' => 8224,`
108		`'Dagger' => 8225,`
109		`'darr' => 8595,`
110		`'dArr' => 8659,`
111		`'deg' => 176,`
112		`'Delta' => 916,`
113		`'delta' => 948,`
114		`'diams' => 9830,`
115		`'divide' => 247,`
116		`'Eacute' => 201,`
117		`'eacute' => 233,`
118		`'Ecirc' => 202,`
119		`'ecirc' => 234,`
120		`'Egrave' => 200,`
121		`'egrave' => 232,`
122		`'empty' => 8709,`
123		`'emsp' => 8195,`
124		`'ensp' => 8194,`
125		`'Epsilon' => 917,`
126		`'epsilon' => 949,`
127		`'equiv' => 8801,`
128		`'Eta' => 919,`
129		`'eta' => 951,`
130		`'ETH' => 208,`
131		`'eth' => 240,`
132		`'Euml' => 203,`
133		`'euml' => 235,`
134		`'euro' => 8364,`
135		`'exist' => 8707,`
136		`'fnof' => 402,`
137		`'forall' => 8704,`
138		`'frac12' => 189,`
139		`'frac14' => 188,`
140		`'frac34' => 190,`
141		`'frasl' => 8260,`
142		`'Gamma' => 915,`
143		`'gamma' => 947,`
144		`'ge' => 8805,`
145		`'gt' => 62,`
146		`'harr' => 8596,`
147		`'hArr' => 8660,`
148		`'hearts' => 9829,`
149		`'hellip' => 8230,`
150		`'Iacute' => 205,`
151		`'iacute' => 237,`
152		`'Icirc' => 206,`
153		`'icirc' => 238,`
154		`'iexcl' => 161,`
155		`'Igrave' => 204,`
156		`'igrave' => 236,`
157		`'image' => 8465,`
158		`'infin' => 8734,`
159		`'int' => 8747,`
160		`'Iota' => 921,`
161		`'iota' => 953,`
162		`'iquest' => 191,`
163		`'isin' => 8712,`
164		`'Iuml' => 207,`
165		`'iuml' => 239,`
166		`'Kappa' => 922,`
167		`'kappa' => 954,`
168		`'Lambda' => 923,`
169		`'lambda' => 955,`
170		`'lang' => 9001,`
171		`'laquo' => 171,`
172		`'larr' => 8592,`
173		`'lArr' => 8656,`
174		`'lceil' => 8968,`
175		`'ldquo' => 8220,`
176		`'le' => 8804,`
177		`'lfloor' => 8970,`
178		`'lowast' => 8727,`
179		`'loz' => 9674,`
180		`'lrm' => 8206,`
181		`'lsaquo' => 8249,`
182		`'lsquo' => 8216,`
183		`'lt' => 60,`
184		`'macr' => 175,`
185		`'mdash' => 8212,`
186		`'micro' => 181,`
187		`'middot' => 183,`
188		`'minus' => 8722,`
189		`'Mu' => 924,`
190		`'mu' => 956,`
191		`'nabla' => 8711,`
192		`'nbsp' => 160,`
193		`'ndash' => 8211,`
194		`'ne' => 8800,`
195		`'ni' => 8715,`
196		`'not' => 172,`
197		`'notin' => 8713,`
198		`'nsub' => 8836,`
199		`'Ntilde' => 209,`
200		`'ntilde' => 241,`
201		`'Nu' => 925,`
202		`'nu' => 957,`
203		`'Oacute' => 211,`
204		`'oacute' => 243,`
205		`'Ocirc' => 212,`
206		`'ocirc' => 244,`
207		`'OElig' => 338,`
208		`'oelig' => 339,`
209		`'Ograve' => 210,`
210		`'ograve' => 242,`
211		`'oline' => 8254,`
212		`'Omega' => 937,`
213		`'omega' => 969,`
214		`'Omicron' => 927,`
215		`'omicron' => 959,`
216		`'oplus' => 8853,`
217		`'or' => 8744,`
218		`'ordf' => 170,`
219		`'ordm' => 186,`
220		`'Oslash' => 216,`
221		`'oslash' => 248,`
222		`'Otilde' => 213,`
223		`'otilde' => 245,`
224		`'otimes' => 8855,`
225		`'Ouml' => 214,`
226		`'ouml' => 246,`
227		`'para' => 182,`
228		`'part' => 8706,`
229		`'permil' => 8240,`
230		`'perp' => 8869,`
231		`'Phi' => 934,`
232		`'phi' => 966,`
233		`'Pi' => 928,`
234		`'pi' => 960,`
235		`'piv' => 982,`
236		`'plusmn' => 177,`
237		`'pound' => 163,`
238		`'prime' => 8242,`
239		`'Prime' => 8243,`
240		`'prod' => 8719,`
241		`'prop' => 8733,`
242		`'Psi' => 936,`
243		`'psi' => 968,`
244		`'quot' => 34,`
245		`'radic' => 8730,`
246		`'rang' => 9002,`
247		`'raquo' => 187,`
248		`'rarr' => 8594,`
249		`'rArr' => 8658,`
250		`'rceil' => 8969,`
251		`'rdquo' => 8221,`
252		`'real' => 8476,`
253		`'reg' => 174,`
254		`'rfloor' => 8971,`
255		`'Rho' => 929,`
256		`'rho' => 961,`
257		`'rlm' => 8207,`
258		`'rsaquo' => 8250,`
259		`'rsquo' => 8217,`
260		`'sbquo' => 8218,`
261		`'Scaron' => 352,`
262		`'scaron' => 353,`
263		`'sdot' => 8901,`
264		`'sect' => 167,`
265		`'shy' => 173,`
266		`'Sigma' => 931,`
267		`'sigma' => 963,`
268		`'sigmaf' => 962,`
269		`'sim' => 8764,`
270		`'spades' => 9824,`
271		`'sub' => 8834,`
272		`'sube' => 8838,`
273		`'sum' => 8721,`
274		`'sup' => 8835,`
275		`'sup1' => 185,`
276		`'sup2' => 178,`
277		`'sup3' => 179,`
278		`'supe' => 8839,`
279		`'szlig' => 223,`
280		`'Tau' => 932,`
281		`'tau' => 964,`
282		`'there4' => 8756,`
283		`'Theta' => 920,`
284		`'theta' => 952,`
285		`'thetasym' => 977,`
286		`'thinsp' => 8201,`
287		`'THORN' => 222,`
288		`'thorn' => 254,`
289		`'tilde' => 732,`
290		`'times' => 215,`
291		`'trade' => 8482,`
292		`'Uacute' => 218,`
293		`'uacute' => 250,`
294		`'uarr' => 8593,`
295		`'uArr' => 8657,`
296		`'Ucirc' => 219,`
297		`'ucirc' => 251,`
298		`'Ugrave' => 217,`
299		`'ugrave' => 249,`
300		`'uml' => 168,`
301		`'upsih' => 978,`
302		`'Upsilon' => 933,`
303		`'upsilon' => 965,`
304		`'Uuml' => 220,`
305		`'uuml' => 252,`
306		`'weierp' => 8472,`
307		`'Xi' => 926,`
308		`'xi' => 958,`
309		`'Yacute' => 221,`
310		`'yacute' => 253,`
311		`'yen' => 165,`
312		`'Yuml' => 376,`
313		`'yuml' => 255,`
314		`'Zeta' => 918,`
315		`'zeta' => 950,`
316		`'zwj' => 8205,`
317		`'zwnj' => 8204 );`
318
319		`/** @package MediaWiki */`
320		`class Sanitizer {`
321		`/**`
322		`* Cleans up HTML, removes dangerous tags and attributes, and`
323		`* removes HTML comments`
324		`* @private`
325		`* @param string $text`
326		`* @param callback $processCallback to do any variable or parameter replacements in HTML attribute values`
327		`* @param array $args for the processing callback`
328		`* @return string`
329		`*/`
330		`function removeHTMLtags( $text, $processCallback = null, $args = array() ) {`
331	1	`global $wgUseTidy, $wgUserHtml;`
332	1	`$fname = 'Parser::removeHTMLtags';`
333	1	`wfProfileIn( $fname );`
334
335	1	`if( $wgUserHtml ) {`
336		`$htmlpairs = array( # Tags that must be closed`
337	1	`'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',`
338		`'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',`
339		`'strike', 'strong', 'tt', 'var', 'div', 'center',`
340		`'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',`
341		`'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'`
342		`);`
343		`$htmlsingle = array(`
344	1	`'br', 'hr', 'li', 'dt', 'dd'`
345		`);`
346		`$htmlsingleonly = array( # Elements that cannot have close tags`
347	1	`'br', 'hr'`
348		`);`
349		`$htmlnest = array( # Tags that can be nested--??`
350	1	`'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',`
351		`'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'`
352		`);`
353		`$tabletags = array( # Can only appear inside table`
354	1	`'td', 'th', 'tr',`
355		`);`
356		`$htmllist = array( # Tags used by list`
357	1	`'ul','ol',`
358		`);`
359		`$listtags = array( # Tags that can appear in a list`
360	1	`'li',`
361		`);`
362
363		`} else {`
364		`$htmlpairs = array();`
365		`$htmlsingle = array();`
366		`$htmlnest = array();`
367		`$tabletags = array();`
368		`}`
369
370	1	`$htmlsingleallowed = array_merge( $htmlsingle, $tabletags );`
371	1	`$htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );`
372
373		`# Remove HTML comments`
374	1	`$text = Sanitizer::removeHTMLcomments( $text );`
375	1	`$bits = explode( '<', $text );`
376	1	`$text = array_shift( $bits );`
377	1	`if(!$wgUseTidy) {`
378		`$tagstack = array(); $tablestack = array();`
379	1	`foreach ( $bits as $x ) {`
380	1	`$prev = error_reporting( E_ALL & ~( E_NOTICE \| E_WARNING ) );`
381	1	`preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',`
382	1	`$x, $regs );`
383	1	`list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;`
384	1	`error_reporting( $prev );`
385
386	1	`$badtag = 0 ;`
387	1	`if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {`
388		`# Check our stack`
389	1	`if ( $slash ) {`
390		`# Closing a tag...`
391	1	`if( in_array( $t, $htmlsingleonly ) ) {`
392		`$badtag = 1;`
393	1	`} elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {`
394	1	`if ( in_array($ot, $htmlsingleallowed) ) {`
395		`# Pop all elements with an optional close tag`
396		`# and see if we find a match below them`
397		`$optstack = array();`
398	1	`array_push ($optstack, $ot);`
399	1	`while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&`
400	1	`in_array($ot, $htmlsingleallowed) ) {`
401		`array_push ($optstack, $ot);`
402		`}`
403	1	`if ( $t != $ot ) {`
404		`# No match. Push the optinal elements back again`
405		`$badtag = 1;`
406		`while ( $ot = @array_pop( $optstack ) ) {`
407		`array_push( $tagstack, $ot );`
408		`}`
409		`}`
410		`} else {`
411	1	`@array_push( $tagstack, $ot );`
412		`# <li> can be nested in <ul> or <ol>, skip those cases:`
413	1	`if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) {`
414	1	`$badtag = 1;`
415		`}`
416		`}`
417		`} else {`
418	1	`if ( $t == 'table' ) {`
419	1	`$tagstack = array_pop( $tablestack );`
420		`}`
421		`}`
422	1	`$newparams = '';`
423		`} else {`
424		`# Keep track for later`
425	1	`if ( in_array( $t, $tabletags ) &&`
426	1	`! in_array( 'table', $tagstack ) ) {`
427		`$badtag = 1;`
428	1	`} else if ( in_array( $t, $tagstack ) &&`
429	1	`! in_array ( $t , $htmlnest ) ) {`
430		`$badtag = 1 ;`
431		`# Is it a self closed htmlpair ? (bug 5487)`
432	1	`} else if( $brace == '/>' &&`
433	1	`in_array($t, $htmlpairs) ) {`
434	1	`$badtag = 1;`
435	1	`} elseif( in_array( $t, $htmlsingleonly ) ) {`
436		`# Hack to force empty tag for uncloseable elements`
437	1	`$brace = '/>';`
438	1	`} else if( in_array( $t, $htmlsingle ) ) {`
439		`# Hack to not close $htmlsingle tags`
440	1	`$brace = NULL;`
441		`} else {`
442	1	`if ( $t == 'table' ) {`
443	1	`array_push( $tablestack, $tagstack );`
444		`$tagstack = array();`
445		`}`
446	1	`array_push( $tagstack, $t );`
447		`}`
448
449		`# Replace any variables or template parameters with`
450		`# plaintext results.`
451	1	`if( is_callable( $processCallback ) ) {`
452	1	`call_user_func_array( $processCallback, array( &$params, $args ) );`
453		`}`
454
455		`# Strip non-approved attributes from the tag`
456	1	`$newparams = Sanitizer::fixTagAttributes( $params, $t );`
457		`}`
458	1	`if ( ! $badtag ) {`
459	1	`$rest = str_replace( '>', '&gt;', $rest );`
460	1	`$close = ( $brace == '/>' ) ? ' /' : '';`
461	1	`$text .= "<$slash$t$newparams$close>$rest";`
462	1	`continue;`
463		`}`
464		`}`
465	1	`$text .= '&lt;' . str_replace( '>', '&gt;', $x);`
466		`}`
467		`# Close off any remaining tags`
468	1	`while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {`
469	1	`$text .= "</$t>\n";`
470	1	`if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }`
471		`}`
472		`} else {`
473		`# this might be possible using tidy itself`
474		`foreach ( $bits as $x ) {`
475		`preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',`
476		`$x, $regs );`
477		`@list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;`
478		`if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {`
479		`if( is_callable( $processCallback ) ) {`
480		`call_user_func_array( $processCallback, array( &$params, $args ) );`
481		`}`
482		`$newparams = Sanitizer::fixTagAttributes( $params, $t );`
483		`$rest = str_replace( '>', '&gt;', $rest );`
484		`$text .= "<$slash$t$newparams$brace$rest";`
485		`} else {`
486		`$text .= '&lt;' . str_replace( '>', '&gt;', $x);`
487		`}`
488		`}`
489		`}`
490	1	`wfProfileOut( $fname );`
491	1	`return $text;`
492		`}`
493
494		`/**`
495		`* Remove '<!--', '-->', and everything between.`
496		`* To avoid leaving blank lines, when a comment is both preceded`
497		`* and followed by a newline (ignoring spaces), trim leading and`
498		`* trailing spaces and one of the newlines.`
499		`*`
500		`* @private`
501		`* @param string $text`
502		`* @return string`
503		`*/`
504		`function removeHTMLcomments( $text ) {`
505	1	`$fname='Parser::removeHTMLcomments';`
506	1	`wfProfileIn( $fname );`
507	1	`while (($start = strpos($text, '<!--')) !== false) {`
508	1	`$end = strpos($text, '-->', $start + 4);`
509	1	`if ($end === false) {`
510		`# Unterminated comment; bail out`
511		`break;`
512		`}`
513
514	1	`$end += 3;`
515
516		`# Trim space and newline if the comment is both`
517		`# preceded and followed by a newline`
518	1	`$spaceStart = max($start - 1, 0);`
519	1	`$spaceLen = $end - $spaceStart;`
520	1	`while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {`
521	1	`$spaceStart--;`
522	1	`$spaceLen++;`
523		`}`
524	1	`while (substr($text, $spaceStart + $spaceLen, 1) === ' ')`
525	1	`$spaceLen++;`
526	1	`if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {`
527		`# Remove the comment, leading and trailing`
528		`# spaces, and leave only one newline.`
529	1	`$text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);`
530		`}`
531		`else {`
532		`# Remove just the comment.`
533	1	`$text = substr_replace($text, '', $start, $end - $start);`
534		`}`
535		`}`
536	1	`wfProfileOut( $fname );`
537	1	`return $text;`
538		`}`
539
540		`/**`
541		`* Take an array of attribute names and values and normalize or discard`
542		`* illegal values for the given element type.`
543		`*`
544		`* - Discards attributes not on a whitelist for the given element`
545		`* - Unsafe style attributes are discarded`
546		`*`
547		`* @param array $attribs`
548		`* @param string $element`
549		`* @return array`
550		`*`
551		`* @todo Check for legal values where the DTD limits things.`
552		`* @todo Check for unique id attribute :P`
553		`*/`
554		`function validateTagAttributes( $attribs, $element ) {`
555	1	`$whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );`
556		`$out = array();`
557	1	`foreach( $attribs as $attribute => $value ) {`
558	1	`if( !isset( $whitelist[$attribute] ) ) {`
559	1	`continue;`
560		`}`
561		`# Strip javascript "expression" from stylesheets.`
562		`# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp`
563	1	`if( $attribute == 'style' ) {`
564	1	`$value = Sanitizer::checkCss( $value );`
565	1	`if( $value === false ) {`
566		`# haxx0r`
567	1	`continue;`
568		`}`
569		`}`
570
571	1	`if ( $attribute === 'id' )`
572	1	`$value = Sanitizer::escapeId( $value );`
573
574		`// If this attribute was previously set, override it.`
575		`// Output should only have one attribute of each name.`
576	1	`$out[$attribute] = $value;`
577		`}`
578	1	`return $out;`
579		`}`
580
581		`/**`
582		`* Pick apart some CSS and check it for forbidden or unsafe structures.`
583		`* Returns a sanitized string, or false if it was just too evil.`
584		`*`
585		`* Currently URL references, 'expression', 'tps' are forbidden.`
586		`*`
587		`* @param string $value`
588		`* @return mixed`
589		`*/`
590		`static function checkCss( $value ) {`
591	1	`$stripped = Sanitizer::decodeCharReferences( $value );`
592
593		`// Remove any comments; IE gets token splitting wrong`
594	1	`$stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );`
595	1	`$value = $stripped;`
596
597		`// ... and continue checks`
598	1	`$stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',`
599	1	`'codepointToUtf8(hexdec("$1"))', $stripped );`
600	1	`$stripped = str_replace( '\\', '', $stripped );`
601	1	`if( preg_match( '/(expression\|tps*:\/\/\|url\\s*\().*/is',`
602	1	`$stripped ) ) {`
603		`# haxx0r`
604	1	`return false;`
605		`}`
606
607	1	`return $value;`
608		`}`
609
610		`/**`
611		`* Take a tag soup fragment listing an HTML element's attributes`
612		`* and normalize it to well-formed XML, discarding unwanted attributes.`
613		`* Output is safe for further wikitext processing, with escaping of`
614		`* values that could trigger problems.`
615		`*`
616		`* - Normalizes attribute names to lowercase`
617		`* - Discards attributes not on a whitelist for the given element`
618		`* - Turns broken or invalid entities into plaintext`
619		`* - Double-quotes all attribute values`
620		`* - Attributes without values are given the name as attribute`
621		`* - Double attributes are discarded`
622		`* - Unsafe style attributes are discarded`
623		`* - Prepends space if there are attributes.`
624		`*`
625		`* @param string $text`
626		`* @param string $element`
627		`* @return string`
628		`*/`
629		`function fixTagAttributes( $text, $element ) {`
630	1	`if( trim( $text ) == '' ) {`
631	1	`return '';`
632		`}`
633
634	1	`$stripped = Sanitizer::validateTagAttributes(`
635	1	`Sanitizer::decodeTagAttributes( $text ), $element );`
636
637		`$attribs = array();`
638	1	`foreach( $stripped as $attribute => $value ) {`
639	1	`$encAttribute = htmlspecialchars( $attribute );`
640	1	`$encValue = Sanitizer::safeEncodeAttribute( $value );`
641
642	1	`$attribs[] = "$encAttribute=\"$encValue\"";`
643		`}`
644	1	`return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';`
645		`}`
646
647		`/**`
648		`* Encode an attribute value for HTML output.`
649		`* @param $text`
650		`* @return HTML-encoded text fragment`
651		`*/`
652		`function encodeAttribute( $text ) {`
653	1	`$encValue = htmlspecialchars( $text );`
654
655		`// Whitespace is normalized during attribute decoding,`
656		`// so if we've been passed non-spaces we must encode them`
657		`// ahead of time or they won't be preserved.`
658	1	`$encValue = strtr( $encValue, array(`
659		`"\n" => '&#10;',`
660		`"\r" => '&#13;',`
661		`"\t" => '&#9;',`
662	1	`) );`
663
664	1	`return $encValue;`
665		`}`
666
667		`/**`
668		`* Encode an attribute value for HTML tags, with extra armoring`
669		`* against further wiki processing.`
670		`* @param $text`
671		`* @return HTML-encoded text fragment`
672		`*/`
673		`function safeEncodeAttribute( $text ) {`
674	1	`$encValue = Sanitizer::encodeAttribute( $text );`
675
676		`# Templates and links may be expanded in later parsing,`
677		`# creating invalid or dangerous output. Suppress this.`
678	1	`$encValue = strtr( $encValue, array(`
679		`'<' => '&lt;', // This should never happen,`
680		`'>' => '&gt;', // we've received invalid input`
681		`'"' => '&quot;', // which should have been escaped.`
682		`'{' => '&#123;',`
683		`'[' => '&#91;',`
684		`"''" => '&#39;&#39;',`
685		`'ISBN' => '&#73;SBN',`
686		`'RFC' => '&#82;FC',`
687		`'PMID' => '&#80;MID',`
688		`'\|' => '&#124;',`
689		`'__' => '&#95;_',`
690	1	`) );`
691
692		`# Stupid hack`
693	1	`$encValue = preg_replace_callback(`
694	1	`'/(' . wfUrlProtocols() . ')/',`
695		`array( 'Sanitizer', 'armorLinksCallback' ),`
696	1	`$encValue );`
697	1	`return $encValue;`
698		`}`
699
700		`/**`
701		`* Given a value escape it so that it can be used in an id attribute and`
702		`* return it, this does not validate the value however (see first link)`
703		`*`
704		`* @link http://www.w3.org/TR/html401/types.html#type-name Valid characters`
705		`* in the id and`
706		`* name attributes`
707		`* @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute`
708		`*`
709		`* @bug 4461`
710		`*`
711		`* @static`
712		`*`
713		`* @param string $id`
714		`* @return string`
715		`*/`
716		`function escapeId( $id ) {`
717		`static $replace = array(`
718		`'%3A' => ':',`
719		`'%' => '.'`
720	1	`);`
721
722	1	`$id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );`
723
724	1	`return str_replace( array_keys( $replace ), array_values( $replace ), $id );`
725		`}`
726
727		`/**`
728		`* Regex replace callback for armoring links against further processing.`
729		`* @param array $matches`
730		`* @return string`
731		`* @private`
732		`*/`
733		`function armorLinksCallback( $matches ) {`
734	1	`return str_replace( ':', '&#58;', $matches[1] );`
735		`}`
736
737		`/**`
738		`* Return an associative array of attribute names and values from`
739		`* a partial tag string. Attribute names are forced to lowercase,`
740		`* character references are decoded to UTF-8 text.`
741		`*`
742		`* @param string`
743		`* @return array`
744		`*/`
745		`function decodeTagAttributes( $text ) {`
746		`$attribs = array();`
747
748	1	`if( trim( $text ) == '' ) {`
749	1	`return $attribs;`
750		`}`
751
752		`$pairs = array();`
753	1	`if( !preg_match_all(`
754		`MW_ATTRIBS_REGEX,`
755		`$text,`
756		`$pairs,`
757	1	`PREG_SET_ORDER ) ) {`
758	1	`return $attribs;`
759		`}`
760
761	1	`foreach( $pairs as $set ) {`
762	1	`$attribute = strtolower( $set[1] );`
763	1	`$value = Sanitizer::getTagAttributeCallback( $set );`
764
765		`// Normalize whitespace`
766	1	`$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );`
767	1	`$value = trim( $value );`
768
769		`// Decode character references`
770	1	`$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );`
771		`}`
772	1	`return $attribs;`
773		`}`
774
775		`/**`
776		`* Pick the appropriate attribute value from a match set from the`
777		`* MW_ATTRIBS_REGEX matches.`
778		`*`
779		`* @param array $set`
780		`* @return string`
781		`* @private`
782		`*/`
783		`function getTagAttributeCallback( $set ) {`
784	1	`if( isset( $set[6] ) ) {`
785		`# Illegal #XXXXXX color with no quotes.`
786		`return $set[6];`
787	1	`} elseif( isset( $set[5] ) ) {`
788		`# No quotes.`
789	1	`return $set[5];`
790	1	`} elseif( isset( $set[4] ) ) {`
791		`# Single-quoted`
792	1	`return $set[4];`
793	1	`} elseif( isset( $set[3] ) ) {`
794		`# Double-quoted`
795	1	`return $set[3];`
796	1	`} elseif( !isset( $set[2] ) ) {`
797		`# In XHTML, attributes must have a value.`
798		`# For 'reduced' form, return explicitly the attribute name here.`
799	1	`return $set[1];`
800		`} else {`
801		`throw new MWException( "Tag conditions not met. This should never happen and is a bug." );`
802		`}`
803		`}`
804
805		`/**`
806		`* Normalize whitespace and character references in an XML source-`
807		`* encoded text for an attribute value.`
808		`*`
809		`* See http://www.w3.org/TR/REC-xml/#AVNormalize for background,`
810		`* but note that we're not returning the value, but are returning`
811		`* XML source fragments that will be slapped into output.`
812		`*`
813		`* @param string $text`
814		`* @return string`
815		`* @private`
816		`*/`
817		`function normalizeAttributeValue( $text ) {`
818	1	`return str_replace( '"', '&quot;',`
819		`preg_replace(`
820		`'/\r\n\|[\x20\x0d\x0a\x09]/',`
821		`' ',`
822	1	`Sanitizer::normalizeCharReferences( $text ) ) );`
823		`}`
824
825		`/**`
826		`* Ensure that any entities and character references are legal`
827		`* for XML and XHTML specifically. Any stray bits will be`
828		`* &-escaped to result in a valid text fragment.`
829		`*`
830		`* a. any named char refs must be known in XHTML`
831		`* b. any numeric char refs must be legal chars, not invalid or forbidden`
832		`* c. use &#x, not &#X`
833		`* d. fix or reject non-valid attributes`
834		`*`
835		`* @param string $text`
836		`* @return string`
837		`* @private`
838		`*/`
839		`function normalizeCharReferences( $text ) {`
840	1	`return preg_replace_callback(`
841		`MW_CHAR_REFS_REGEX,`
842		`array( 'Sanitizer', 'normalizeCharReferencesCallback' ),`
843	1	`$text );`
844		`}`
845		`/**`
846		`* @param string $matches`
847		`* @return string`
848		`*/`
849		`function normalizeCharReferencesCallback( $matches ) {`
850	1	`$ret = null;`
851	1	`if( $matches[1] != '' ) {`
852	1	`$ret = Sanitizer::normalizeEntity( $matches[1] );`
853	1	`} elseif( $matches[2] != '' ) {`
854	1	`$ret = Sanitizer::decCharReference( $matches[2] );`
855	1	`} elseif( $matches[3] != '' ) {`
856		`$ret = Sanitizer::hexCharReference( $matches[3] );`
857	1	`} elseif( $matches[4] != '' ) {`
858		`$ret = Sanitizer::hexCharReference( $matches[4] );`
859		`}`
860	1	`if( is_null( $ret ) ) {`
861	1	`return htmlspecialchars( $matches[0] );`
862		`} else {`
863	1	`return $ret;`
864		`}`
865		`}`
866
867		`/**`
868		`* If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,`
869		`* return the named entity reference as is. Otherwise, returns`
870		`* HTML-escaped text of pseudo-entity source (eg &amp;foo;)`
871		`*`
872		`* @param string $name`
873		`* @return string`
874		`*/`
875		`function normalizeEntity( $name ) {`
876	1	`global $wgHtmlEntities;`
877	1	`if( isset( $wgHtmlEntities[$name] ) ) {`
878	1	`return "&$name;";`
879		`} else {`
880	1	`return "&amp;$name;";`
881		`}`
882		`}`
883
884		`function decCharReference( $codepoint ) {`
885	1	`$point = intval( $codepoint );`
886	1	`if( Sanitizer::validateCodepoint( $point ) ) {`
887	1	`return sprintf( '&#%d;', $point );`
888		`} else {`
889		`return null;`
890		`}`
891		`}`
892
893		`function hexCharReference( $codepoint ) {`
894		`$point = hexdec( $codepoint );`
895		`if( Sanitizer::validateCodepoint( $point ) ) {`
896		`return sprintf( '&#x%x;', $point );`
897		`} else {`
898		`return null;`
899		`}`
900		`}`
901
902		`/**`
903		`* Returns true if a given Unicode codepoint is a valid character in XML.`
904		`* @param int $codepoint`
905		`* @return bool`
906		`*/`
907		`function validateCodepoint( $codepoint ) {`
908	1	`return ($codepoint == 0x09)`
909	1	`\|\| ($codepoint == 0x0a)`
910	1	`\|\| ($codepoint == 0x0d)`
911	1	`\|\| ($codepoint >= 0x20 && $codepoint <= 0xd7ff)`
912		`\|\| ($codepoint >= 0xe000 && $codepoint <= 0xfffd)`
913		`\|\| ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);`
914		`}`
915
916		`/**`
917		`* Decode any character references, numeric or named entities,`
918		`* in the text and return a UTF-8 string.`
919		`*`
920		`* @param string $text`
921		`* @return string`
922		`* @public`
923		`*/`
924		`function decodeCharReferences( $text ) {`
925	1	`return preg_replace_callback(`
926		`MW_CHAR_REFS_REGEX,`
927		`array( 'Sanitizer', 'decodeCharReferencesCallback' ),`
928	1	`$text );`
929		`}`
930
931		`/**`
932		`* @param string $matches`
933		`* @return string`
934		`*/`
935		`function decodeCharReferencesCallback( $matches ) {`
936	1	`if( $matches[1] != '' ) {`
937	1	`return Sanitizer::decodeEntity( $matches[1] );`
938	1	`} elseif( $matches[2] != '' ) {`
939	1	`return Sanitizer::decodeChar( intval( $matches[2] ) );`
940	1	`} elseif( $matches[3] != '' ) {`
941	1	`return Sanitizer::decodeChar( hexdec( $matches[3] ) );`
942	1	`} elseif( $matches[4] != '' ) {`
943		`return Sanitizer::decodeChar( hexdec( $matches[4] ) );`
944		`}`
945		`# Last case should be an ampersand by itself`
946	1	`return $matches[0];`
947		`}`
948
949		`/**`
950		`* Return UTF-8 string for a codepoint if that is a valid`
951		`* character reference, otherwise U+FFFD REPLACEMENT CHARACTER.`
952		`* @param int $codepoint`
953		`* @return string`
954		`* @private`
955		`*/`
956		`function decodeChar( $codepoint ) {`
957	1	`if( Sanitizer::validateCodepoint( $codepoint ) ) {`
958	1	`return codepointToUtf8( $codepoint );`
959		`} else {`
960		`return UTF8_REPLACEMENT;`
961		`}`
962		`}`
963
964		`/**`
965		`* If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,`
966		`* return the UTF-8 encoding of that character. Otherwise, returns`
967		`* pseudo-entity source (eg &foo;)`
968		`*`
969		`* @param string $name`
970		`* @return string`
971		`*/`
972		`function decodeEntity( $name ) {`
973	1	`global $wgHtmlEntities;`
974	1	`if( isset( $wgHtmlEntities[$name] ) ) {`
975	1	`return codepointToUtf8( $wgHtmlEntities[$name] );`
976		`} else {`
977		`return "&$name;";`
978		`}`
979		`}`
980
981		`/**`
982		`* Fetch the whitelist of acceptable attributes for a given`
983		`* element name.`
984		`*`
985		`* @param string $element`
986		`* @return array`
987		`*/`
988		`function attributeWhitelist( $element ) {`
989	1	`static $list;`
990	1	`if( !isset( $list ) ) {`
991	1	`$list = Sanitizer::setupAttributeWhitelist();`
992		`}`
993	1	`return isset( $list[$element] )`
994	1	`? $list[$element]`
995	1	`: array();`
996		`}`
997
998		`/**`
999		`* @return array`
1000		`*/`
1001		`function setupAttributeWhitelist() {`
1002		`$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );`
1003	1	`$block = array_merge( $common, array( 'align' ) );`
1004		`$tablealign = array( 'align', 'char', 'charoff', 'valign' );`
1005		`$tablecell = array( 'abbr',`
1006		`'axis',`
1007		`'headers',`
1008		`'scope',`
1009		`'rowspan',`
1010		`'colspan',`
1011		`'nowrap', # deprecated`
1012		`'width', # deprecated`
1013		`'height', # deprecated`
1014		`'bgcolor' # deprecated`
1015		`);`
1016
1017		`# Numbers refer to sections in HTML 4.01 standard describing the element.`
1018		`# See: http://www.w3.org/TR/html4/`
1019		`$whitelist = array (`
1020		`# 7.5.4`
1021	1	`'div' => $block,`
1022		`'center' => $common, # deprecated`
1023		`'span' => $block, # ??`
1024
1025		`# 7.5.5`
1026		`'h1' => $block,`
1027		`'h2' => $block,`
1028		`'h3' => $block,`
1029		`'h4' => $block,`
1030		`'h5' => $block,`
1031		`'h6' => $block,`
1032
1033		`# 7.5.6`
1034		`# address`
1035
1036		`# 8.2.4`
1037		`# bdo`
1038
1039		`# 9.2.1`
1040		`'em' => $common,`
1041		`'strong' => $common,`
1042		`'cite' => $common,`
1043		`# dfn`
1044		`'code' => $common,`
1045		`# samp`
1046		`# kbd`
1047		`'var' => $common,`
1048		`# abbr`
1049		`# acronym`
1050
1051		`# 9.2.2`
1052	1	`'blockquote' => array_merge( $common, array( 'cite' ) ),`
1053		`# q`
1054
1055		`# 9.2.3`
1056		`'sub' => $common,`
1057		`'sup' => $common,`
1058
1059		`# 9.3.1`
1060		`'p' => $block,`
1061
1062		`# 9.3.2`
1063		`'br' => array( 'id', 'class', 'title', 'style', 'clear' ),`
1064
1065		`# 9.3.4`
1066	1	`'pre' => array_merge( $common, array( 'width' ) ),`
1067
1068		`# 9.4`
1069	1	`'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),`
1070	1	`'del' => array_merge( $common, array( 'cite', 'datetime' ) ),`
1071
1072		`# 10.2`
1073	1	`'ul' => array_merge( $common, array( 'type' ) ),`
1074	1	`'ol' => array_merge( $common, array( 'type', 'start' ) ),`
1075	1	`'li' => array_merge( $common, array( 'type', 'value' ) ),`
1076
1077		`# 10.3`
1078		`'dl' => $common,`
1079		`'dd' => $common,`
1080		`'dt' => $common,`
1081
1082		`# 11.2.1`
1083		`'table' => array_merge( $common,`
1084		`array( 'summary', 'width', 'border', 'frame',`
1085		`'rules', 'cellspacing', 'cellpadding',`
1086		`'align', 'bgcolor', 'frame', 'rules',`
1087	1	`'border' ) ),`
1088
1089		`# 11.2.2`
1090	1	`'caption' => array_merge( $common, array( 'align' ) ),`
1091
1092		`# 11.2.3`
1093	1	`'thead' => array_merge( $common, $tablealign ),`
1094	1	`'tfoot' => array_merge( $common, $tablealign ),`
1095	1	`'tbody' => array_merge( $common, $tablealign ),`
1096
1097		`# 11.2.4`
1098	1	`'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ),`
1099	1	`'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ),`
1100
1101		`# 11.2.5`
1102	1	`'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),`
1103
1104		`# 11.2.6`
1105	1	`'td' => array_merge( $common, $tablecell, $tablealign ),`
1106	1	`'th' => array_merge( $common, $tablecell, $tablealign ),`
1107
1108		`# 15.2.1`
1109		`'tt' => $common,`
1110		`'b' => $common,`
1111		`'i' => $common,`
1112		`'big' => $common,`
1113		`'small' => $common,`
1114		`'strike' => $common,`
1115		`'s' => $common,`
1116		`'u' => $common,`
1117
1118		`# 15.2.2`
1119	1	`'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),`
1120		`# basefont`
1121
1122		`# 15.3`
1123	1	`'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),`
1124
1125		`# XHTML Ruby annotation text module, simple ruby only.`
1126		`# http://www.w3c.org/TR/ruby/`
1127		`'ruby' => $common,`
1128		`# rbc`
1129		`# rtc`
1130		`'rb' => $common,`
1131		`'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),`
1132		`'rp' => $common,`
1133		`);`
1134	1	`return $whitelist;`
1135		`}`
1136
1137		`/**`
1138		`* Take a fragment of (potentially invalid) HTML and return`
1139		`* a version with any tags removed, encoded suitably for literal`
1140		`* inclusion in an attribute value.`
1141		`*`
1142		`* @param string $text HTML fragment`
1143		`* @return string`
1144		`*/`
1145		`function stripAllTags( $text ) {`
1146		`# Actual <tags>`
1147	1	`$text = preg_replace( '/ < .*? > /x', '', $text );`
1148
1149		`# Normalize &entities and whitespace`
1150	1	`$text = Sanitizer::normalizeAttributeValue( $text );`
1151
1152		`# Will be placed into "double-quoted" attributes,`
1153		`# make sure remaining bits are safe.`
1154	1	`$text = str_replace(`
1155		`array('<', '>', '"'),`
1156		`array('&lt;', '&gt;', '&quot;'),`
1157	1	`$text );`
1158
1159	1	`return $text;`
1160		`}`
1161
1162		`/**`
1163		`* Hack up a private DOCTYPE with HTML's standard entity declarations.`
1164		`* PHP 4 seemed to know these if you gave it an HTML doctype, but`
1165		`* PHP 5.1 doesn't.`
1166		`*`
1167		`* Use for passing XHTML fragments to PHP's XML parsing functions`
1168		`*`
1169		`* @return string`
1170		`* @static`
1171		`*/`
1172		`function hackDocType() {`
1173	1	`global $wgHtmlEntities;`
1174	1	`$out = "<!DOCTYPE html [\n";`
1175	1	`foreach( $wgHtmlEntities as $entity => $codepoint ) {`
1176	1	`$out .= "<!ENTITY $entity \"&#$codepoint;\">";`
1177		`}`
1178	1	`$out .= "]>\n";`
1179	1	`return $out;`
1180		`}`
1181
1182		`}`
1183
1184		`?>`