1: <?php
2:
3: /**
4: * Parser that uses PHP 5's DOM extension (part of the core).
5: *
6: * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
7: * It gives us a forgiving HTML parser, which we use to transform the HTML
* into a DOM, and then into the tokens. It is blazingly fast (for large
* documents, it performs twenty times faster than
* HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
11: *
* @note Any empty elements will have empty tokens associated with them, even if
* this is prohibited by the spec. This cannot be fixed until the spec
* comes into play.
15: *
16: * @note PHP's DOM extension does not actually parse any entities, we use
17: * our own function to do that.
18: *
* @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
* If this is a huge problem, due to the fact that HTML is hand
* edited and you are unable to get a parser cache that caches
* the output of HTML Purifier while keeping the original HTML lying
* around, you may want to run Tidy on the resulting output or use
* HTMLPurifier_Lexer_DirectLex.
25: */
26:
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{

    /**
     * Token factory used to build every token this lexer emits.
     */
    private $factory;

    /**
     * Runs the parent constructor and sets up the token factory.
     */
    public function __construct() {
        parent::__construct();
        $this->factory = new HTMLPurifier_TokenFactory();
    }

    /**
     * Tokenizes an HTML fragment by loading it into a DOMDocument and
     * walking the resulting tree.
     *
     * Steps, in order:
     *  1. the fragment is passed through $this->normalize() (inherited,
     *     not defined in this class);
     *  2. if Core.AggressivelyFixLt is enabled, every '<' that is followed
     *     by a character other than a letter, '!' or '/' is turned into
     *     the entity &lt;, except inside comments, which are restored to
     *     their original text afterwards;
     *  3. the fragment is wrapped in a full document by wrapHTML();
     *  4. the document is parsed with DOMDocument::loadHTML() while PHP
     *     errors are muted;
     *  5. the wrapper <div> is handed to tokenizeDOM().
     *
     * @param $html    String HTML fragment to tokenize.
     * @param $config  Configuration object; read for Core.AggressivelyFixLt
     *                 and passed on to normalize() and wrapHTML().
     * @param $context Context object, passed on to normalize() and wrapHTML().
     * @returns Array-list of tokens.
     */
    public function tokenizeHTML($html, $config, $context) {

        $html = $this->normalize($html, $config, $context);

        // attempt to armor stray angled brackets that cannot possibly
        // form tags and thus are probably being used as emoticons
        if ($config->get('Core.AggressivelyFixLt')) {
            $char = '[^a-z!\/]';
            $comment = "/<!--(.*?)(-->|\z)/is";
            // protect ampersands inside comments first, so that the undo
            // step below can tell our substitutions apart from entities
            // that were already present in the comment text
            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
            // a single pass consumes the character following '<', so a run
            // such as '<<<' needs several passes; repeat until stable
            do {
                $old = $html;
                $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
            } while ($html !== $old);
            $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
        }

        // preprocess html, essential for UTF-8
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

        // errors raised while loading are discarded by muteErrorHandler
        set_error_handler(array($this, 'muteErrorHandler'));
        $doc->loadHTML($html);
        restore_error_handler();

        $tokens = array();
        // descend to the <div> that wrapHTML() put around the fragment
        $this->tokenizeDOM(
            $doc->getElementsByTagName('html')->item(0)-> // <html>
                  getElementsByTagName('body')->item(0)-> //   <body>
                  getElementsByTagName('div')->item(0)    //     <div>
            , $tokens);
        return $tokens;
    }

    /**
     * Iterative function that tokenizes a node, putting it into an accumulator.
     * To iterate is human, to recurse divine - L. Peter Deutsch
     *
     * Pending nodes are kept in one FIFO queue per depth level. Elements
     * that need an end token are remembered per level and closed when the
     * walk comes back up to that level. The node passed in (level 0) gets
     * neither a start nor an end token; only its descendants are emitted.
     *
     * @param $node   DOMNode to be tokenized.
     * @param $tokens Array-list of already tokenized tokens, passed by
     *                reference; new tokens are appended to it.
     */
    protected function tokenizeDOM($node, &$tokens) {

        $level = 0;
        $nodes = array($level => array($node));
        $closingNodes = array();
        do {
            while (!empty($nodes[$level])) {
                $node = array_shift($nodes[$level]); // FIFO
                // level 0 is the wrapper node, which must not be emitted
                $collect = $level > 0;
                $needEndingTag = $this->createStartNode($node, $tokens, $collect);
                if ($needEndingTag) {
                    $closingNodes[$level][] = $node;
                }
                if ($node->childNodes && $node->childNodes->length) {
                    // go one level down and queue the children
                    $level++;
                    $nodes[$level] = array();
                    foreach ($node->childNodes as $childNode) {
                        array_push($nodes[$level], $childNode);
                    }
                }
            }
            // this level's queue is empty: go back up and close what was
            // opened there (the level-0 wrapper is skipped by the $level test)
            $level--;
            if ($level && isset($closingNodes[$level])) {
                while ($node = array_pop($closingNodes[$level])) {
                    $this->createEndNode($node, $tokens);
                }
            }
        } while ($level > 0);
    }

    /**
     * Appends the token that opens (or fully represents) a node.
     *
     * Text, CDATA and comment nodes become text/comment tokens. Any other
     * non-element node produces nothing. Elements produce an empty token
     * when they have no children and a start token otherwise.
     *
     * @param $node    DOMNode to be tokenized.
     * @param $tokens  Array-list of already tokenized tokens.
     * @param $collect Says whether or not start and empty tokens are
     *                 collected for elements; set to false at the first
     *                 level because it's the implicit DIV tag you're
     *                 dealing with.
     * @returns bool if the token needs an endtoken
     */
    protected function createStartNode($node, &$tokens, $collect) {
        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
        // those should have been preprocessed
        if ($node->nodeType === XML_TEXT_NODE) {
            $tokens[] = $this->factory->createText($node->data);
            return false;
        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // undo libxml's special treatment of <script> and <style> tags
            $last = end($tokens);
            $data = $node->data;
            // (note $node->tagname is already normalized)
            if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
                // strip a surrounding comment wrapper from the contents
                $new_data = trim($data);
                if (substr($new_data, 0, 4) === '<!--') {
                    $data = substr($new_data, 4);
                    if (substr($data, -3) === '-->') {
                        $data = substr($data, 0, -3);
                    } else {
                        // Highly suspicious! Not sure what to do...
                    }
                }
            }
            // parseData() is inherited and not defined in this class
            $tokens[] = $this->factory->createText($this->parseData($data));
            return false;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            // this code is only invoked for comments in script/style in versions
            // of libxml pre-2.6.28 (regular comments, of course, are still
            // handled regularly)
            $tokens[] = $this->factory->createComment($node->data);
            return false;
        } elseif (
            // not-well tested: there may be other nodes we have to grab
            $node->nodeType !== XML_ELEMENT_NODE
        ) {
            return false;
        }

        $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();

        // We still have to make sure that the element actually IS empty
        if (!$node->childNodes->length) {
            if ($collect) {
                $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
            }
            return false;
        } else {
            if ($collect) {
                // NOTE(review): the inline assignment is kept exactly as it
                // was; the original remark suggests it works around the tag
                // name being lost - confirm before removing
                $tokens[] = $this->factory->createStart(
                    $tag_name = $node->tagName, // somehow, it get's dropped
                    $attr
                );
            }
            return true;
        }
    }

    /**
     * Appends the end token for an element.
     *
     * @param $node   DOMNode whose tagName names the end token.
     * @param $tokens Array-list of already tokenized tokens.
     */
    protected function createEndNode($node, &$tokens) {
        $tokens[] = $this->factory->createEnd($node->tagName);
    }


    /**
     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
     *
     * @param $node_map DOMNamedNodeMap of DOMAttr objects.
     * @returns Associative array of attributes, attribute name => value.
     */
    protected function transformAttrToAssoc($node_map) {
        // NamedNodeMap is not documented very well, so we're using
        // undocumented features, namely, the fact that it implements
        // Iterator and has a ->length attribute
        if ($node_map->length === 0) return array();
        $array = array();
        foreach ($node_map as $attr) {
            $array[$attr->name] = $attr->value;
        }
        return $array;
    }

    /**
     * An error handler that mutes all errors. It is installed around
     * DOMDocument::loadHTML() in tokenizeHTML().
     */
    public function muteErrorHandler($errno, $errstr) {}

    /**
     * Callback function for undoing escaping of stray angled brackets
     * in comments. Reverses both the ampersand armoring done by
     * callbackArmorCommentEntities and the &lt; substitution done in
     * tokenizeHTML().
     *
     * @param $matches Regex matches: [1] is the comment text, [2] is the
     *                 comment terminator ('-->' or empty at end of input).
     * @returns String comment with its original text restored.
     */
    public function callbackUndoCommentSubst($matches) {
        return '<!--' . strtr($matches[1], array('&amp;'=>'&','&lt;'=>'<')) . $matches[2];
    }

    /**
     * Callback function that entity-izes ampersands in comments so that
     * callbackUndoCommentSubst doesn't clobber them
     *
     * @param $matches Regex matches: [1] is the comment text, [2] is the
     *                 comment terminator ('-->' or empty at end of input).
     * @returns String comment with every '&' replaced by '&amp;'.
     */
    public function callbackArmorCommentEntities($matches) {
        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
    }

    /**
     * Wraps an HTML fragment in the necessary HTML: an optional DOCTYPE
     * taken from the HTML definition, a head declaring UTF-8, and a body
     * holding the fragment inside a single <div>.
     *
     * @param $html    String HTML fragment to wrap.
     * @param $config  Configuration object; the 'HTML' definition is read
     *                 from it for the doctype identifiers.
     * @param $context Context object (not used here).
     * @returns String complete HTML document.
     */
    protected function wrapHTML($html, $config, $context) {
        $def = $config->getDefinition('HTML');
        $ret = '';

        // emit a DOCTYPE only when at least one identifier is known
        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
            $ret .= '<!DOCTYPE html ';
            if (!empty($def->doctype->dtdPublic)) $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
            if (!empty($def->doctype->dtdSystem)) $ret .= '"' . $def->doctype->dtdSystem . '" ';
            $ret .= '>';
        }

        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        // No protection if $html contains a stray </div>!
        $ret .= '</head><body><div>'.$html.'</div></body></html>';
        return $ret;
    }

}
242:
243: // vim: et sw=4 sts=4
244: