1: <?php
2:
// If we want to implement error collecting here, we'll need to use some sort
// of global data (probably trigger_error) because it's impossible to pass
// $config or $context to the callback functions.
6:
/**
 * Handles referencing and dereferencing character entities.
 *
 * Entities are split into two groups: the "special" ones that stand for
 * characters with syntactic meaning in markup (", &, ', <, >) and all the
 * others. Each group has its own substitution pass so that callers can
 * decode non-special entities up front and defer the special ones.
 */
class HTMLPurifier_EntityParser
{

    /**
     * Reference to entity lookup table; lazily fetched from
     * HTMLPurifier_EntityLookup::instance() on first named-entity lookup.
     */
    protected $_entity_lookup;

    /**
     * Callback regex string for parsing entities.
     *
     * Capture groups: 1. hex digits, 2. decimal digits (leading zeros
     * skipped), 3. entity name (XML style). The trailing semicolon is
     * optional.
     */
    protected $_substituteEntitiesRegex =
        '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';

    /**
     * Decimal to parsed string conversion table for special entities.
     */
    protected $_special_dec2str =
        array(
            34 => '"',
            38 => '&',
            39 => "'",
            60 => '<',
            62 => '>'
        );

    /**
     * Stripped entity names to decimal conversion table for special entities.
     */
    protected $_special_ent2dec =
        array(
            'quot' => 34,
            'amp'  => 38,
            'lt'   => 60,
            'gt'   => 62
        );

    /**
     * Substitutes non-special entities with their parsed equivalents.
     * Running this on every parsed text fragment would be wasteful, so it
     * is meant to be run once, before everything else.
     *
     * @param $string String to have non-special entities parsed.
     * @return Parsed string.
     */
    public function substituteNonSpecialEntities($string)
    {
        // it will try to detect missing semicolons, but don't rely on it
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'nonSpecialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteNonSpecialEntities() that does the work.
     *
     * @param $matches PCRE matches array, with 0 the entire match, and
     *                 either index 1, 2 or 3 set with a hex value, dec value,
     *                 or string (respectively).
     * @return Replacement string. Special entities and unknown names are
     *         returned untouched.
     */
    protected function nonSpecialEntityCallback($matches)
    {
        // replaces all but big five
        $entity = $matches[0];
        $is_num = isset($entity[1]) && $entity[1] === '#';
        if ($is_num) {
            $is_hex = isset($entity[2]) && $entity[2] === 'x';
            $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];

            // abort for special characters
            if (isset($this->_special_dec2str[$code])) {
                return $entity;
            }

            return HTMLPurifier_Encoder::unichr($code);
        }

        $name = $matches[3];
        // named special entities are left for substituteSpecialEntities()
        if (isset($this->_special_ent2dec[$name])) {
            return $entity;
        }
        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }
        if (isset($this->_entity_lookup->table[$name])) {
            return $this->_entity_lookup->table[$name];
        }
        return $entity;
    }

    /**
     * Substitutes only special entities with their parsed equivalents.
     *
     * @notice We try to avoid calling this function because otherwise, it
     *      would have to be called a lot (for every parsed section).
     *
     * @param $string String to have special entities parsed.
     * @return Parsed string.
     */
    public function substituteSpecialEntities($string)
    {
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'specialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteSpecialEntities() that does the work.
     *
     * This callback has same syntax as nonSpecialEntityCallback().
     *
     * @param $matches PCRE-style matches array, with 0 the entire match, and
     *                 either index 1, 2 or 3 set with a hex value, dec value,
     *                 or string (respectively).
     * @return Replacement string. Non-special entities are returned
     *         untouched.
     */
    protected function specialEntityCallback($matches)
    {
        $entity = $matches[0];
        $is_num = isset($entity[1]) && $entity[1] === '#';
        if ($is_num) {
            $is_hex = isset($entity[2]) && $entity[2] === 'x';
            $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
            return isset($this->_special_dec2str[$int]) ?
                $this->_special_dec2str[$int] :
                $entity;
        }

        $name = $matches[3];
        if (!isset($this->_special_ent2dec[$name])) {
            return $entity;
        }
        // _special_ent2dec only yields the code point; map it through
        // _special_dec2str to get the actual character (returning the
        // code point itself would turn "&amp;" into the text "38").
        return $this->_special_dec2str[$this->_special_ent2dec[$name]];
    }

}
143:
144: // vim: et sw=4 sts=4
145: