1: <?php
2:
3: 4: 5: 6: 7: 8: 9: 10: 11: 12:
13: class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
14: {
15:
16: public $tracksLineNumbers = true;
17:
18: 19: 20:
21: protected $_whitespace = "\x20\x09\x0D\x0A";
22:
23: 24: 25: 26:
/**
 * Callback for preg_replace_callback() that escapes the contents of a
 * matched <script> element while leaving its tags untouched.
 *
 * @param array $matches Regex groups: [1] opening tag, [2] script contents,
 *                       [3] closing tag.
 * @return string Opening tag, HTML-escaped contents, closing tag.
 */
protected function scriptCallback($matches) {
    $escaped_contents = htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8');
    return $matches[1] . $escaped_contents . $matches[3];
}
30:
/**
 * Splits an HTML string into a flat list of HTMLPurifier tokens.
 *
 * The input is scanned left to right using plain string searches for '<'
 * and '>'. Content outside angle brackets becomes a text token; content
 * inside is classified as a comment, an end tag, a start tag or an empty
 * (self-closing) tag. Malformed markup is kept as text rather than dropped.
 *
 * @param string $html    HTML to tokenize.
 * @param object $config  Configuration object; the directives HTML.Trusted,
 *                        Core.MaintainLineNumbers, Core.CollectErrors and
 *                        Core.DirectLexLineNumberSyncInterval are read.
 * @param object $context Context object; CurrentLine and CurrentCol are
 *                        registered for the duration of the call, and
 *                        ErrorCollector is fetched when errors are collected.
 * @return array List of HTMLPurifier_Token_* objects in document order.
 */
public function tokenizeHTML($html, $config, $context) {

    // For trusted documents, run the contents of every <script> element
    // through scriptCallback() (which HTML-escapes them) before lexing.
    if ($config->get('HTML.Trusted')) {
        $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
            array($this, 'scriptCallback'), $html);
    }

    $html = $this->normalize($html, $config, $context);

    $cursor = 0; // byte offset of the next character to examine
    $inside_tag = false; // true once the '<' opening a tag has been consumed
    $array = array(); // tokens collected so far

    // Line tracking is controlled by Core.MaintainLineNumbers; when that
    // directive is null it follows Core.CollectErrors instead.
    $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');
    if ($maintain_line_numbers === null) {
        $maintain_line_numbers = $config->get('Core.CollectErrors');
    }

    if ($maintain_line_numbers) {
        $current_line = 1;
        $current_col = 0;
        $length = strlen($html);
    } else {
        $current_line = false;
        $current_col = false;
        $length = false;
    }
    // These two variables are handed to the context and updated in place
    // below, so they must not be replaced by other variables.
    $context->register('CurrentLine', $current_line);
    $context->register('CurrentCol', $current_col);
    $nl = "\n";

    // Number of loop iterations between full recounts of the line number;
    // a falsy value disables the recount.
    $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

    $e = false;
    if ($config->get('Core.CollectErrors')) {
        $e =& $context->get('ErrorCollector');
    }

    $loops = 0; // iteration counter, only used for the periodic recount

    while (++$loops) {

        if ($maintain_line_numbers) {

            // "Real" position of the current token: inside a tag the cursor
            // has already moved past the '<', so step back onto it.
            $rcursor = $cursor - (int) $inside_tag;

            // Column is the distance from the last newline before $rcursor.
            // The negative offset restricts the backwards search to the part
            // of the string up to $rcursor.
            $nl_pos = strrpos($html, $nl, $rcursor - $length);
            $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

            // Periodically recompute the line number from scratch.
            if (
                $synchronize_interval &&
                $cursor > 0 &&
                $loops % $synchronize_interval === 0
            ) {
                $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
            }

        }

        $position_next_lt = strpos($html, '<', $cursor);
        $position_next_gt = strpos($html, '>', $cursor);

        // The cursor sits directly on a '<': consume it and enter tag mode.
        if ($position_next_lt === $cursor) {
            $inside_tag = true;
            $cursor++;
        }

        if (!$inside_tag && $position_next_lt !== false) {
            // Text running up to the next '<'.
            $token = new HTMLPurifier_Token_Text(
                $this->parseData(
                    substr($html, $cursor, $position_next_lt - $cursor)
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_lt + 1;
            $inside_tag = true;
            continue;
        } elseif (!$inside_tag) {
            // No more tags: the remainder of the document is text.

            // Nothing left at all.
            if ($cursor === strlen($html)) break;

            $token = new HTMLPurifier_Token_Text(
                $this->parseData(
                    substr($html, $cursor)
                )
            );
            if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
            $array[] = $token;
            break;
        } elseif ($inside_tag && $position_next_gt !== false) {
            // Inside a tag that has a closing '>': grab what lies between.
            $strlen_segment = $position_next_gt - $cursor;

            if ($strlen_segment < 1) {
                // "<>": nothing between the brackets. Emit the '<' as text
                // and leave tag mode; the cursor stays on the '>' so the
                // next iteration picks it up as ordinary text. (Previously
                // the token was built but never stored and tag mode was
                // left on, which mis-lexed whatever followed.)
                if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                $token = new HTMLPurifier_Token_Text('<');
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                }
                $array[] = $token;
                $inside_tag = false;
                continue;
            }

            $segment = substr($html, $cursor, $strlen_segment);

            if ($segment === false) {
                // Defensive: substr() could not produce the segment.
                break;
            }

            // Comment: runs to the next "-->", not merely to the next '>'.
            if (
                substr($segment, 0, 3) === '!--'
            ) {
                $position_comment_end = strpos($html, '-->', $cursor);
                if ($position_comment_end === false) {
                    // Unclosed comment: treat the rest of the document as
                    // the comment body.
                    if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
                    $position_comment_end = strlen($html);
                    $end = true;
                } else {
                    $end = false;
                }
                $strlen_segment = $position_comment_end - $cursor;
                $segment = substr($html, $cursor, $strlen_segment);
                $token = new HTMLPurifier_Token_Comment(
                    // strip the leading "!--"
                    substr(
                        $segment, 3, $strlen_segment - 3
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                }
                $array[] = $token;
                // skip the "-->" unless we ran off the end of the document
                $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                $inside_tag = false;
                continue;
            }

            // End tag: segment starts with '/'.
            $is_end_tag = (strpos($segment, '/') === 0);
            if ($is_end_tag) {
                $type = substr($segment, 1);
                $token = new HTMLPurifier_Token_End($type);
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // A tag name must start with a letter; otherwise the '<' was a
            // stray character and is emitted as text. The cursor is not
            // moved, so the segment is re-read as text next iteration.
            if (!ctype_alpha($segment[0])) {
                if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                $token = new HTMLPurifier_Token_Text('<');
                if ($maintain_line_numbers) {
                    // No line advance here: the cursor stays put, so the
                    // following iteration counts these newlines itself.
                    // Counting them here as well would count them twice.
                    $token->rawPosition($current_line, $current_col);
                }
                $array[] = $token;
                $inside_tag = false;
                continue;
            }

            // Self-closing tag: the last character of the segment is '/'.
            // Strip it so it does not end up in the name or attributes.
            $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
            if ($is_self_closing) {
                $strlen_segment--;
                $segment = substr($segment, 0, $strlen_segment);
            }

            // No whitespace in the segment means a tag without attributes.
            $position_first_space = strcspn($segment, $this->_whitespace);

            if ($position_first_space >= $strlen_segment) {
                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($segment);
                } else {
                    $token = new HTMLPurifier_Token_Start($segment);
                }
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Tag with attributes: name up to the first whitespace, the
            // rest is handed to parseAttributeString().
            $type = substr($segment, 0, $position_first_space);
            $attribute_string =
                trim(
                    substr(
                        $segment, $position_first_space
                    )
                );
            if ($attribute_string) {
                $attr = $this->parseAttributeString(
                    $attribute_string
                    , $config, $context
                );
            } else {
                $attr = array();
            }

            if ($is_self_closing) {
                $token = new HTMLPurifier_Token_Empty($type, $attr);
            } else {
                $token = new HTMLPurifier_Token_Start($type, $attr);
            }
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_gt + 1;
            $inside_tag = false;
            continue;
        } else {
            // Inside a tag but no '>' anywhere ahead: keep the rest of the
            // document as text, re-attaching the consumed '<'.
            if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
            $token = new HTMLPurifier_Token_Text(
                '<' .
                $this->parseData(
                    substr($html, $cursor)
                )
            );
            if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
            $array[] = $token;
            break;
        }
    }

    $context->destroy('CurrentLine');
    $context->destroy('CurrentCol');
    return $array;
}
319:
320: 321: 322:
/**
 * Counts occurrences of $needle inside a window of $haystack.
 *
 * On PHP versions older than 5.1 the window is cut out with substr() first
 * and counted with the two-argument form of substr_count(); otherwise the
 * offset and length are passed straight to substr_count().
 *
 * @param string $haystack String to search in.
 * @param string $needle   Substring to count.
 * @param int    $offset   Start of the window within $haystack.
 * @param int    $length   Length of the window.
 * @return int Number of occurrences of $needle inside the window.
 */
protected function substrCount($haystack, $needle, $offset, $length) {
    // The version check is done once and cached for later calls.
    static $needs_fallback;
    if ($needs_fallback === null) {
        $needs_fallback = version_compare(PHP_VERSION, '5.1', '<');
    }

    if (!$needs_fallback) {
        return substr_count($haystack, $needle, $offset, $length);
    }

    $window = substr($haystack, $offset, $length);
    return substr_count($window, $needle);
}
335:
336: 337: 338: 339: 340: 341:
/**
 * Parses the attribute portion of a tag into an associative array.
 *
 * Two fast paths handle a lone valueless attribute and a lone key=value
 * pair; anything containing whitespace or more than one '=' is walked
 * character by character.
 *
 * @param string $string  Attribute text, e.g. 'href="foo" title=bar checked'.
 * @param object $config  Configuration object; Core.CollectErrors is read.
 * @param object $context Context object; ErrorCollector is fetched from it
 *                        when error collection is enabled.
 * @return array Attribute name => value. An attribute written without a
 *               value maps to its own name; values are passed through
 *               parseData().
 */
public function parseAttributeString($string, $config, $context) {
    $string = (string) $string;

    if ($string == '') return array(); // no attributes

    $e = false;
    if ($config->get('Core.CollectErrors')) {
        $e =& $context->get('ErrorCollector');
    }

    // The fast paths are only valid when the string holds no whitespace at
    // all. Any character of the class whitespace set counts, at any
    // position (a plain strpos() for ' ' would miss tabs and newlines and
    // would read a match at offset 0 as "not found").
    $num_equal = substr_count($string, '=');
    $has_space = strcspn($string, $this->_whitespace) < strlen($string);
    if ($num_equal === 0 && !$has_space) {
        // single valueless attribute
        return array($string => $string);
    } elseif ($num_equal === 1 && !$has_space) {
        // single key=value pair
        list($key, $quoted_value) = explode('=', $string);
        $quoted_value = trim($quoted_value);
        if (!$key) {
            if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
            return array();
        }
        // Compare against '' explicitly: a bare truthiness test would also
        // discard the legitimate value "0" (as in border=0).
        if ($quoted_value === '') return array($key => '');
        $first_char = $quoted_value[0];
        $last_char = $quoted_value[strlen($quoted_value) - 1];

        $same_quote = ($first_char == $last_char);
        $open_quote = ($first_char == '"' || $first_char == "'");

        if ($same_quote && $open_quote) {
            // properly quoted: strip both quotes
            $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
        } else {
            if ($open_quote) {
                // opening quote without a matching closing quote
                if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
                $value = substr($quoted_value, 1);
            } else {
                // unquoted value
                $value = $quoted_value;
            }
        }
        if ($value === false) $value = '';
        return array($key => $this->parseData($value));
    }

    // General case: walk the string one attribute at a time.
    $array = array();
    $cursor = 0;
    $size = strlen($string);

    // Pad with one space so that an unquoted value at the very end is
    // terminated by whitespace like any other.
    $string .= ' ';

    while (true) {

        if ($cursor >= $size) {
            break;
        }

        // skip whitespace in front of the key
        $cursor += strspn($string, $this->_whitespace, $cursor);

        // Only whitespace was left: done. Without this guard the code below
        // would report a bogus missing key and search past the string end.
        if ($cursor >= $size) {
            break;
        }

        // the key runs up to the next whitespace or '='
        $key_begin = $cursor;
        $cursor += strcspn($string, $this->_whitespace . '=', $cursor);
        $key_end = $cursor;

        $key = substr($string, $key_begin, $key_end - $key_begin);

        if (!$key) {
            if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
            // Skip the offending character plus everything up to the next
            // whitespace. The "1 +" guarantees progress: without it, an '='
            // directly followed by whitespace would add 0 and loop forever.
            $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1);
            continue;
        }

        // skip whitespace between the key and a possible '='
        $cursor += strspn($string, $this->_whitespace, $cursor);

        if ($cursor >= $size) {
            // valueless attribute at the end of the string
            $array[$key] = $key;
            break;
        }

        $first_char = $string[$cursor];

        if ($first_char == '=') {
            // key=value

            $cursor++;
            $cursor += strspn($string, $this->_whitespace, $cursor);

            // '=' was the last thing in the string: empty value.
            if ($cursor >= $size) {
                $array[$key] = '';
                break;
            }

            $char = $string[$cursor];

            if ($char == '"' || $char == "'") {
                // quoted: the value runs to the matching quote
                $cursor++;
                $value_begin = $cursor;
                $cursor = strpos($string, $char, $cursor);
                $value_end = $cursor;
            } else {
                // unquoted: the value runs to the next whitespace
                $value_begin = $cursor;
                $cursor += strcspn($string, $this->_whitespace, $cursor);
                $value_end = $cursor;
            }

            // closing quote never found: take everything that is left
            if ($cursor === false) {
                $cursor = $size;
                $value_end = $cursor;
            }

            $value = substr($string, $value_begin, $value_end - $value_begin);
            if ($value === false) $value = '';
            $array[$key] = $this->parseData($value);
            // step over the closing quote / terminating whitespace
            $cursor++;

        } else {
            // Valueless attribute followed by more attributes. The key is
            // known to be non-empty here because of the guard above.
            $array[$key] = $key;
        }
    }
    return $array;
}
487:
488: }
489:
490:
491: