1: <?php
2:
3: /**
4: * Injector that auto paragraphs text in the root node based on
5: * double-spacing.
6: * @todo Ensure all states are unit tested, including variations as well.
7: * @todo Make a graph of the flow control for this Injector.
8: */
9: class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
10: {
11:
12: public $name = 'AutoParagraph';
13: public $needed = array('p');
14:
/**
 * Creates the opening <p> token that this injector inserts.
 *
 * The token's armor entry 'MakeWellFormed_TagClosedError' is switched on.
 * NOTE(review): presumably this stops MakeWellFormed from reporting an
 * error when it closes the tag on our behalf -- confirm against that
 * strategy.
 *
 * @return HTMLPurifier_Token_Start start token for a 'p' element
 */
private function _pStart() {
    $paragraphStart = new HTMLPurifier_Token_Start('p');
    $paragraphStart->armor['MakeWellFormed_TagClosedError'] = true;
    return $paragraphStart;
}
20:
/**
 * Processes a text token, wrapping it in a paragraph or splitting it into
 * several paragraphs where that is appropriate.
 *
 * $token is received by reference. When paragraph tags have to be added it
 * is replaced by an array of tokens; in every other state it is left as is.
 *
 * @param $token Text token under the cursor (its ->data and
 *               ->is_whitespace members are read)
 */
public function handleText(&$token) {
    $text = $token->data;

    // The current parent does not allow <p> tags.
    if (!$this->allowsElement('p')) {
        $insideParagraph =
            !empty($this->currentNesting) &&
            $this->currentNesting[count($this->currentNesting)-1]->name == 'p';

        if ($insideParagraph) {
            // State 3.1: ...<p>PAR1
            //                  ----

            // State 3.2: ...<p>PAR1\n\nPAR2
            //                  ------------
            // Start from an empty result so that _splitText knows no start
            // tag was added by us.
            $token = array();
            $this->_splitText($text, $token);
            // Abort!
        }
        // Otherwise:
        // State 4.1: ...<b>PAR1
        //                  ----

        // State 4.2: ...<b>PAR1\n\nPAR2
        //                  ------------
        // Nothing to do.
        return;
    }

    // The behaviour differs between text in the anonymous root node and
    // text inside a node of the document. If the text has a double-newline
    // the treatment is the same for both; if it does not, and we are inside
    // the document, the lookahead branch below decides.
    if (!empty($this->currentNesting) && strpos($text, "\n\n") === false) {
        // State 2:   <div>PAR1... (similar to 1.4)
        //                 ----

        // We're in an element that allows paragraph tags, but we're not
        // sure if we're going to need them.
        if ($this->_pLookAhead()) {
            // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
            //                 ----
            // Note: This will always be the first child, since any
            // previous inline element would have triggered this very
            // same routine, and found the double newline. One possible
            // exception would be a comment.
            $token = array($this->_pStart(), $token);
        }
        // Otherwise:
        // State 2.2.1: <div>PAR1<div>
        //                   ----

        // State 2.2.2: <div>PAR1<b>PAR1</b></div>
        //                   ----
        return;
    }

    // Root node, or text containing a double newline: peek at what follows.
    $i = $nesting = null;
    $hasFollowingToken = $this->forwardUntilEndToken($i, $current, $nesting);

    if (!$hasFollowingToken && $token->is_whitespace) {
        // State 1.1: ...    ^ (whitespace, then document end)
        //               ----
        // This is a degenerate case
        return;
    }

    if ($token->is_whitespace && !$this->_isInline($current)) {
        // State 1.5: \n<hr />
        //            --
        return;
    }

    // State 1.2: PAR1
    //            ----

    // State 1.3: PAR1\n\nPAR2
    //            ------------

    // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
    //                 ------------
    $token = array($this->_pStart());
    $this->_splitText($text, $token);
}
96:
/**
 * Processes an element token, prepending a <p> start token and/or a
 * double-newline text token where the surrounding context calls for it.
 *
 * $token is received by reference and is replaced by an array of tokens
 * whenever something has to be inserted in front of it.
 *
 * @param $token Element token under the cursor
 */
public function handleElement(&$token) {
    // We don't have to check if we're already in a <p> tag for block
    // tokens, because the tag would have been autoclosed by MakeWellFormed.
    if (!$this->allowsElement('p')) {
        // State 2.2: <ul><li>
        //                ----

        // State 2.4: <p><b>
        //               ---
        return;
    }

    if (empty($this->currentNesting)) {
        // We are directly in the root node.
        if ($this->_isInline($token)) {
            // State 3.1: <b>
            //            ---
            // This is where the {p} tag is inserted, not reflected in
            // inputTokens yet, however.
            $token = array($this->_pStart(), $token);
        }
        // Otherwise:
        // State 3.2: <div>
        //            -----

        $i = null;
        if ($this->backward($i, $prev) && !($prev instanceof HTMLPurifier_Token_Text)) {
            // State 3.1.1: ...</p>{p}<b>
            //                        ---

            // State 3.2.1: ...</p><div>
            //                     -----

            // Separate this element from the previous one with a
            // double newline.
            if (!is_array($token)) {
                $token = array($token);
            }
            array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
        }
        // When the previous token is text:
        // State 3.1.2: ...</p>\n\n{p}<b>
        //                            ---

        // State 3.2.2: ...</p>\n\n<div>
        //                         -----

        // Note: PAR<ELEM> cannot occur because PAR would have been
        // wrapped in <p> tags.
        return;
    }

    if (!$this->_isInline($token)) {
        // State 2.3: ...<div>
        //               -----
        return;
    }

    // State 1:   <div>...<b>
    //                    ---

    // Check if this token is adjacent to the parent token
    // (seek backwards until token isn't whitespace)
    $i = null;
    $this->backward($i, $prev);

    if ($prev instanceof HTMLPurifier_Token_Start) {
        // State 1.2.1: <div><b>
        //                   ---

        // Lookahead to see if <p> is needed.
        if ($this->_pLookAhead()) {
            // State 1.3.1: <div><b>PAR1\n\nPAR2
            //                   ---
            $token = array($this->_pStart(), $token);
        }
        // Otherwise:
        // State 1.3.2: <div><b>PAR1</b></div>
        //                   ---

        // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
        //                   ---
        return;
    }

    // Token wasn't adjacent
    $followsBlankLine =
        $prev instanceof HTMLPurifier_Token_Text &&
        substr($prev->data, -2) === "\n\n";

    if ($followsBlankLine) {
        // State 1.1.4: <div><p>PAR1</p>\n\n<b>
        //                                  ---

        // Quite frankly, this should be handled by splitText
        $token = array($this->_pStart(), $token);
    }
    // Otherwise:
    // State 1.1.1: <div><p>PAR1</p><b>
    //                              ---

    // State 1.1.2: <div><br /><b>
    //                         ---

    // State 1.1.3: <div>PAR<b>
    //                      ---
}
200:
/**
 * Splits up a text into paragraph tokens and appends them
 * to the result stream that will replace the original
 * @param $data String text data that will be processed
 *    into paragraphs
 * @param $result Reference to array of tokens that the
 *    tags will be appended onto
 */
private function _splitText($data, &$result) {
    $pieces = explode("\n\n", $data);
    $lastIndex = count($pieces) - 1;

    if ($lastIndex == 0) {
        // There were no double-newlines, so pass the text through as a
        // single text token. In theory this should never happen.
        $result[] = new HTMLPurifier_Token_Text($data);
        return;
    }

    // Double newline at the front of $data?
    $reopenParagraph = false;
    if (trim($pieces[0]) === '') {
        if (empty($result)) {
            // An empty result means the AutoParagraph injector did not
            // add a start paragraph token, i.e. we have been inside a
            // paragraph for a while and the newline ends it.
            $result[] = new HTMLPurifier_Token_End('p');
            $result[] = new HTMLPurifier_Token_Text("\n\n");
            // The matching start token is only added further down, and
            // only if real paragraphs follow. If there are none, the next
            // call to the injector takes care of the start tag.
            $reopenParagraph = true;
        } else {
            // We just started a new paragraph! Put the double-newline
            // back in front for presentation's sake, since it was in
            // the source code.
            array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
        }
    }

    // Double newline at the end of $data? Then the final paragraph gets
    // an explicit </p>.
    $closeFinalParagraph = (trim($pieces[$lastIndex]) === '');

    // Keep only the pieces that contain something besides whitespace.
    $paragraphs = array();
    foreach ($pieces as $piece) {
        if (trim($piece) !== '') {
            $paragraphs[] = $piece;
        }
    }

    // Just a giant blob of whitespace: nothing more to emit.
    if (empty($paragraphs)) {
        return;
    }

    // Add the start tag indicated by \n\n at the beginning of $data
    if ($reopenParagraph) {
        $result[] = $this->_pStart();
    }

    $finalKey = count($paragraphs) - 1;
    foreach ($paragraphs as $key => $paragraph) {
        $isFinal = ($key == $finalKey);

        $result[] = new HTMLPurifier_Token_Text($paragraph);

        // Every paragraph but the final one is closed here. The final one
        // is closed only when $data ended in a double newline; otherwise
        // MakeWellFormed closes it later.
        if (!$isFinal || $closeFinalParagraph) {
            $result[] = new HTMLPurifier_Token_End('p');
            $result[] = new HTMLPurifier_Token_Text("\n\n");
        }

        // No start token after the final paragraph: the injector will
        // handle this later if it was indeed needed. This avoids a
        // lookahead here, at the cost of a lookbehind later.
        if (!$isFinal) {
            $result[] = $this->_pStart();
        }
    }
}
290:
291: /**
292: * Returns true if passed token is inline (and, ergo, allowed in
293: * paragraph tags)
294: */
private function _isInline($token) {
    // A token counts as inline when its element name is listed among the
    // child elements of the 'p' definition.
    $tagName = $token->name;
    return isset($this->htmlDefinition->info['p']->child->elements[$tagName]);
}
298:
299: /**
300: * Looks ahead in the token list and determines whether or not we need
301: * to insert a <p> tag.
302: */
private function _pLookAhead() {
    // Fetch the token under the cursor. When it is a start tag, begin
    // with a nesting count of 1, otherwise 0.
    $this->current($i, $current);
    $nesting = ($current instanceof HTMLPurifier_Token_Start) ? 1 : 0;

    // Walk forward. The first token that gives a definite answer
    // (true or false) ends the scan; null means "keep looking".
    while ($this->forwardUntilEndToken($i, $current, $nesting)) {
        $verdict = $this->_checkNeedsP($current);
        if ($verdict !== null) {
            return $verdict;
        }
    }

    // Ran out of tokens without a definite answer: no <p> needed.
    return false;
}
317:
/**
 * Determines if a particular token requires an earlier inline token
 * to get a paragraph. This should be used with forwardUntilEndToken()
 */
private function _checkNeedsP($current) {
    if ($current instanceof HTMLPurifier_Token_Start) {
        // <div>PAR1<div>
        //          ----
        // A block element ends the search with "no paragraph needed";
        // an inline start tag gives no answer yet.
        return $this->_isInline($current) ? null : false;
    }

    if (
        $current instanceof HTMLPurifier_Token_Text &&
        strpos($current->data, "\n\n") !== false
    ) {
        // <div>PAR1<b>PAR1\n\nPAR2
        //              ----
        return true;
    }

    // <div>PAR1<b>PAR1...
    //              ----
    // Text without a double newline, or any other token: undecided.
    return null;
}
342:
343: }
344:
345: // vim: et sw=4 sts=4
346: