Elgg  Version 2.3
ElggAutoP.php
Go to the documentation of this file.
1 <?php
2 
/**
 * Create wrapper P and BR elements in HTML depending on newlines.
 *
 * The input is parsed into a DOM, runs of text/inline nodes are moved into
 * temporary <autop> elements, newlines are replaced by unique placeholder
 * tokens, and after a second parse/serialize round-trip the <autop> elements
 * are either dropped or converted into <p> elements.
 */
class ElggAutoP {

	/**
	 * @var string Character set declared in the wrapper document handed to the HTML parser
	 */
	public $encoding = 'UTF-8';

	/**
	 * @var DOMDocument|null Document currently being processed
	 */
	protected $_doc = null;

	/**
	 * @var DOMXPath|null XPath helper bound to $_doc
	 */
	protected $_xpath = null;

	/**
	 * @var string|string[] Names of elements treated as block-level. Declared as a
	 *                      whitespace-separated string and split into an array by the constructor.
	 */
	protected $_blocks = 'address article area aside blockquote caption col colgroup dd
		details div dl dt fieldset figure figcaption footer form h1 h2 h3 h4 h5 h6 header
		hr hgroup legend map math menu nav noscript p pre section select style summary
		table tbody td tfoot th thead tr ul ol option li';

	/**
	 * @var string|string[] Names of inline elements. Split into an array by the constructor.
	 */
	protected $_inlines = 'a abbr audio b button canvas caption cite code command datalist
		del dfn em embed i iframe img input ins kbd keygen label map mark meter object
		output progress q rp rt ruby s samp script select small source span strong style
		sub sup textarea time var video wbr';

	/**
	 * @var string|string[] Block elements whose children are queued for processing by
	 *                      addParagraphs(). Split into an array by the constructor.
	 */
	protected $_descendList = 'article aside blockquote body details div footer form
		header section';

	/**
	 * @var string|string[] Elements whose text/inline children are moved into AUTOP
	 *                      wrappers. Split into an array by the constructor.
	 */
	protected $_alterList = 'article aside blockquote body details div footer header
		section';

	/**
	 * @var string Prefix of the placeholder tokens (…AMP, …NL, …BR) injected into the markup
	 */
	protected $_unique = '';

	/**
	 * Constructor. Splits the element name lists into arrays and builds the
	 * placeholder token prefix.
	 */
	public function __construct() {
		$this->_blocks = preg_split('@\\s+@', $this->_blocks);
		$this->_descendList = preg_split('@\\s+@', $this->_descendList);
		$this->_alterList = preg_split('@\\s+@', $this->_alterList);
		$this->_inlines = preg_split('@\\s+@', $this->_inlines);
		$this->_unique = md5(__FILE__);
	}

	/**
	 * Create wrapper P and BR elements in HTML depending on newlines.
	 *
	 * @param string $html HTML fragment to process
	 *
	 * @return string|false Processed HTML, or false if the markup could not be parsed
	 */
	public function process($html) {
		// normalize whitespace
		$html = str_replace(array("\r\n", "\r"), "\n", $html);

		// allows preserving entities untouched
		$html = str_replace('&', $this->_unique . 'AMP', $html);

		$this->_doc = new DOMDocument();

		// wrap the fragment in a document that declares the encoding
		$wrapped = "<html><meta http-equiv='content-type' "
			. "content='text/html; charset={$this->encoding}'><body>{$html}</body>"
			. "</html>";
		if (!$this->parseHtml($wrapped)) {
			return false;
		}

		$this->_xpath = new DOMXPath($this->_doc);

		// start processing at the BODY element
		$body = $this->_xpath->query('//body[1]')->item(0);
		if (!($body instanceof DOMElement)) {
			return false;
		}
		$this->addParagraphs($body);

		// serialize back to HTML
		$html = $this->_doc->saveHTML();

		// Note: we create <autop> elements, which will later be converted to paragraphs

		// split AUTOPs into multiples at /\n\n+/
		$html = preg_replace('/(' . $this->_unique . 'NL){2,}/', '</autop><autop>', $html);
		$html = str_replace(array($this->_unique . 'BR', $this->_unique . 'NL', '<br>'),
			'<br />',
			$html);
		$html = str_replace('<br /></autop>', '</autop>', $html);

		// re-parse so we can handle new AUTOP elements
		if (!$this->parseHtml($html)) {
			return false;
		}

		// must re-create XPath object after DOM load
		$this->_xpath = new DOMXPath($this->_doc);

		// strip AUTOPs that only have comments/whitespace
		foreach ($this->_xpath->query('//autop') as $autop) {
			/* @var DOMElement $autop */
			$hasContent = false;
			if (trim($autop->textContent) !== '') {
				$hasContent = true;
			} else {
				foreach ($autop->childNodes as $node) {
					if ($node->nodeType === XML_ELEMENT_NODE) {
						$hasContent = true;
						break;
					}
				}
			}
			if (!$hasContent) {
				// mark to be later replaced w/ preg_replace (faster than moving nodes out)
				$autop->setAttribute("r", "1");
			}
		}

		// If a DIV contains a single AUTOP, remove it
		foreach ($this->_xpath->query('//div') as $el) {
			/* @var DOMElement $el */
			$autops = $this->_xpath->query('./autop', $el);
			if ($autops->length === 1) {
				$firstAutop = $autops->item(0);
				/* @var DOMElement $firstAutop */
				$firstAutop->setAttribute("r", "1");
			}
		}

		$html = $this->_doc->saveHTML();

		// trim to the contents of BODY
		$bodyStart = strpos($html, '<body>');
		if ($bodyStart === false) {
			return false;
		}
		$bodyEnd = strpos($html, '</body>', $bodyStart + 6);
		if ($bodyEnd === false) {
			return false;
		}
		$html = substr($html, $bodyStart + 6, $bodyEnd - $bodyStart - 6);

		// strip AUTOPs that should be removed
		$html = preg_replace('@<autop r="1">(.*?)</autop>@', '\\1', $html);

		// commit to converting AUTOPs to Ps
		$html = str_replace('<autop>', "\n<p>", $html);
		$html = str_replace('</autop>', "</p>\n", $html);

		$html = str_replace('<br>', '<br />', $html);
		$html = str_replace($this->_unique . 'AMP', '&', $html);
		return $html;
	}

	/**
	 * Load markup into $this->_doc with libxml warnings suppressed, restoring
	 * the previous libxml settings afterwards regardless of the outcome.
	 *
	 * @param string $html Complete HTML document to parse
	 *
	 * @return bool Result of DOMDocument::loadHTML()
	 */
	protected function parseHtml($html) {
		// parse to DOM, suppressing loadHTML warnings
		// http://www.php.net/manual/en/domdocument.loadhtml.php#95463
		$use_internal_errors = libxml_use_internal_errors(true);

		// Do not load entities. May be unnecessary, better safe than sorry.
		// libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is
		// only toggled on older runtimes.
		$toggle_entity_loader = (PHP_VERSION_ID < 80000);
		$disable_load_entities = false;
		if ($toggle_entity_loader) {
			$disable_load_entities = libxml_disable_entity_loader(true);
		}

		$loaded = $this->_doc->loadHTML($html);

		libxml_use_internal_errors($use_internal_errors);
		if ($toggle_entity_loader) {
			libxml_disable_entity_loader($disable_load_entities);
		}

		return (bool) $loaded;
	}

	/**
	 * Add P and BR elements as necessary.
	 *
	 * Text and inline nodes found directly inside elements of the alter list are
	 * moved into temporary AUTOP elements; newlines in text are replaced with
	 * placeholder tokens that process() later turns into paragraph splits or BRs.
	 *
	 * @param DOMElement $el Element to start at
	 *
	 * @return void
	 */
	protected function addParagraphs(DOMElement $el) {
		// no need to call recursively, just queue up
		$elsToProcess = array($el);
		$inlinesToProcess = array();
		while ($el = array_shift($elsToProcess)) {
			// if true, we can alter all child nodes, if not, we'll just call
			// addParagraphs on each element in the descendInto list
			$alterInline = in_array($el->nodeName, $this->_alterList, true);

			// inside affected elements, we want to trim leading whitespace from
			// the first text node
			$ltrimFirstTextNode = true;

			// should we open a new AUTOP element to move inline elements into?
			$openP = true;
			$autop = null;

			// after BR, ignore a newline
			$isFollowingBr = false;

			$node = $el->firstChild;
			while (null !== $node) {
				if ($alterInline && $openP) {
					$openP = false;
					// create a P to move inline content into (this may be removed later)
					$autop = $el->insertBefore($this->_doc->createElement('autop'), $node);
				}

				$isElement = ($node->nodeType === XML_ELEMENT_NODE);
				if ($isElement) {
					$isBlock = in_array($node->nodeName, $this->_blocks, true);
					if (!$isBlock) {
						// if we start with an inline element we don't need to do this
						$ltrimFirstTextNode = false;
					}
				} else {
					$isBlock = false;
				}

				if ($alterInline) {
					$isText = ($node->nodeType === XML_TEXT_NODE);
					$isLastInline = (!$node->nextSibling
						|| ($node->nextSibling->nodeType === XML_ELEMENT_NODE
							&& in_array($node->nextSibling->nodeName, $this->_blocks, true)));
					if ($isElement) {
						$isFollowingBr = ($node->nodeName === 'br');
					}

					if ($isText) {
						$nodeText = $node->nodeValue;

						if ($ltrimFirstTextNode) {
							// we're at the beginning of a sequence of text/inline elements
							$nodeText = ltrim($nodeText);
							$ltrimFirstTextNode = false;
						}
						if ($isFollowingBr && preg_match('@^[ \\t]*\\n[ \\t]*@', $nodeText, $m)) {
							// if a user ends a line with <br>, don't add a second BR
							$nodeText = substr($nodeText, strlen($m[0]));
						}
						if ($isLastInline) {
							// we're at the end of a sequence of text/inline elements
							$nodeText = rtrim($nodeText);
						}
						$nodeText = str_replace("\n", $this->_unique . 'NL', $nodeText);
						$tmpNode = $node;
						$node = $node->nextSibling; // move loop to next node

						// alter node in place, then move into AUTOP
						$tmpNode->nodeValue = $nodeText;
						$autop->appendChild($tmpNode);

						continue;
					}
				}
				if ($isBlock || !$node->nextSibling) {
					if ($isBlock && in_array($node->nodeName, $this->_descendList, true)) {
						$elsToProcess[] = $node;
					}
					$openP = true;
					$ltrimFirstTextNode = true;
				}
				if ($alterInline && !$isBlock) {
					$tmpNode = $node;
					if ($isElement && false !== strpos($tmpNode->textContent, "\n")) {
						$inlinesToProcess[] = $tmpNode;
					}
					// advance before moving the node, otherwise its sibling link is lost
					$node = $node->nextSibling;
					$autop->appendChild($tmpNode);
					continue;
				}

				$node = $node->nextSibling;
			}
		}

		// handle inline nodes
		// no need to recurse, just queue up
		while ($el = array_shift($inlinesToProcess)) {
			$ignoreLeadingNewline = false;
			foreach ($el->childNodes as $node) {
				if ($node->nodeType === XML_ELEMENT_NODE) {
					// Compare the tag name: an element's nodeValue is its text
					// content, which is never 'BR' for a <br> element.
					if ($node->nodeName === 'br') {
						$ignoreLeadingNewline = true;
					} else {
						$ignoreLeadingNewline = false;
						if (false !== strpos($node->textContent, "\n")) {
							$inlinesToProcess[] = $node;
						}
					}
					continue;
				} elseif ($node->nodeType === XML_TEXT_NODE) {
					$text = $node->nodeValue;
					// guard against empty text before inspecting the first byte
					if ($ignoreLeadingNewline && $text !== '' && $text[0] === "\n") {
						// the user ended the line with <br>, don't add a second BR
						$text = substr($text, 1);
						$ignoreLeadingNewline = false;
					}
					$node->nodeValue = str_replace("\n", $this->_unique . 'BR', $text);
				}
			}
		}
	}
}
$m
Definition: metadata.php:11
$autop
Definition: longtext.php:24
process($html)
Create wrapper P and BR elements in HTML depending on newlines.
Definition: ElggAutoP.php:82
$text
Definition: default.php:25
foreach($emails as $email) $html
Definition: exceptions.php:34
addParagraphs(DOMElement $el)
Add P and BR elements as necessary.
Definition: ElggAutoP.php:204
__construct()
Constructor.
Definition: ElggAutoP.php:63