Elgg  Version 3.0
ElggAutoP.php
Go to the documentation of this file.
1 <?php
2 
/**
 * Wraps runs of text/inline content in P elements and converts newlines to BR
 * elements, based on the newlines present in an HTML fragment.
 *
 * The input is parsed with DOMDocument, inline content is gathered into temporary
 * AUTOP elements, and the serialized result is post-processed with string
 * replacements to turn the AUTOP elements into paragraphs.
 */
class ElggAutoP {

	/**
	 * @var string Character set declared in the META element wrapped around the input
	 */
	public $encoding = 'UTF-8';

	/**
	 * @var DOMDocument|null Document currently being processed
	 */
	protected $_doc = null;

	/**
	 * @var DOMXPath|null XPath helper bound to $_doc
	 */
	protected $_xpath = null;

	/**
	 * @var string|string[] Names of elements treated as blocks (whitespace-separated
	 *                      string until the constructor splits it into an array)
	 */
	protected $_blocks = 'address article area aside blockquote caption col colgroup dd
		details div dl dt fieldset figure figcaption footer form h1 h2 h3 h4 h5 h6 header
		hr hgroup legend map math menu nav noscript p pre section select style summary
		table tbody td tfoot th thead tr ul ol option li';

	/**
	 * @var string|string[] Names of inline elements (split into an array by the constructor)
	 */
	protected $_inlines = 'a abbr audio b button canvas caption cite code command datalist
		del dfn em embed i iframe img input ins kbd keygen label map mark meter object
		output progress q rp rt ruby s samp script select small source span strong style
		sub sup textarea time var video wbr';

	/**
	 * @var string|string[] Block elements whose children are queued for processing by
	 *                      addParagraphs() (split into an array by the constructor)
	 */
	protected $_descendList = 'article aside blockquote body details div footer form
		header section';

	/**
	 * @var string|string[] Elements whose direct text/inline children are moved into
	 *                      AUTOP elements (split into an array by the constructor)
	 */
	protected $_alterList = 'article aside blockquote body details div footer header
		section';

	/**
	 * @var string Token used as a prefix for the temporary AMP/NL/BR placeholders
	 */
	protected $_unique = '';

	/**
	 * Constructor. Splits the whitespace-separated element lists into arrays and
	 * derives the placeholder token from this file's path.
	 */
	public function __construct() {
		$this->_blocks = preg_split('@\\s+@', $this->_blocks);
		$this->_descendList = preg_split('@\\s+@', $this->_descendList);
		$this->_alterList = preg_split('@\\s+@', $this->_alterList);
		$this->_inlines = preg_split('@\\s+@', $this->_inlines);
		$this->_unique = md5(__FILE__);
	}

	/**
	 * Create wrapper P and BR elements in HTML depending on newlines.
	 *
	 * @param string $html HTML fragment (the contents of a BODY element)
	 *
	 * @return string|false Processed HTML, or false if the markup could not be parsed
	 *
	 * @throws \RuntimeException If the XPath query for BODY keeps returning a text node
	 */
	public function process($html) {
		// normalize line endings to "\n"
		$html = str_replace(["\r\n", "\r"], "\n", $html);

		// hide ampersands behind a placeholder so entities survive the DOM round trips untouched
		$html = str_replace('&', $this->_unique . 'AMP', $html);

		$this->_doc = new DOMDocument();

		// wrap the fragment in a document that declares the encoding
		$wrapped = "<html><meta http-equiv='content-type' "
			. "content='text/html; charset={$this->encoding}'><body>{$html}</body>"
			. "</html>";
		if (!$this->loadHtmlQuietly($wrapped)) {
			return false;
		}

		$this->_xpath = new DOMXPath($this->_doc);

		// start processing at the BODY element
		$nodeList = $this->_xpath->query('//body[1]');
		if ($nodeList->item(0) instanceof DOMText) {
			// May be https://github.com/facebook/hhvm/issues/7745
			// retry once with a fresh XPath object
			$this->_xpath = new DOMXPath($this->_doc);
			$nodeList = $this->_xpath->query('//body[1]');

			if ($nodeList->item(0) instanceof DOMText) {
				// not going to work
				throw new \RuntimeException('DOMXPath::query for BODY element returned a text node');
			}
		}
		$this->addParagraphs($nodeList->item(0));

		// serialize back to HTML
		$html = $this->_doc->saveHTML();

		// Note: we create <autop> elements, which will later be converted to paragraphs

		// split AUTOPs into multiples at /\n\n+/
		$html = preg_replace('/(' . $this->_unique . 'NL){2,}/', '</autop><autop>', $html);
		$html = str_replace(
			[$this->_unique . 'BR', $this->_unique . 'NL', '<br>'],
			'<br />',
			$html
		);
		$html = str_replace('<br /></autop>', '</autop>', $html);

		// re-parse so the AUTOP elements created by the string replacements become DOM nodes
		if (!$this->loadHtmlQuietly($html)) {
			return false;
		}

		// must re-create XPath object after DOM load
		$this->_xpath = new DOMXPath($this->_doc);

		// mark AUTOPs that hold neither text nor elements (only comments/whitespace)
		foreach ($this->_xpath->query('//autop') as $autop) {
			/* @var DOMElement $autop */
			$hasContent = false;
			if (trim($autop->textContent) !== '') {
				$hasContent = true;
			} else {
				foreach ($autop->childNodes as $node) {
					if ($node->nodeType === XML_ELEMENT_NODE) {
						$hasContent = true;
						break;
					}
				}
			}
			if (!$hasContent) {
				// mark to be later replaced w/ preg_replace (faster than moving nodes out)
				$autop->setAttribute("r", "1");
			}
		}

		// if a DIV has exactly one AUTOP child, mark that AUTOP for removal
		foreach ($this->_xpath->query('//div') as $el) {
			/* @var DOMElement $el */
			$autops = $this->_xpath->query('./autop', $el);
			if ($autops->length === 1) {
				$firstAutop = $autops->item(0);
				/* @var DOMElement $firstAutop */
				$firstAutop->setAttribute("r", "1");
			}
		}

		$html = $this->_doc->saveHTML();

		// trim to the contents of BODY
		$bodyStart = strpos($html, '<body>');
		$bodyEnd = strpos($html, '</body>', $bodyStart + 6);
		$html = substr($html, $bodyStart + 6, $bodyEnd - $bodyStart - 6);

		// unwrap AUTOPs marked for removal; the "s" modifier lets the content span
		// newlines (e.g. a multi-line comment), which would otherwise leave the
		// opening tag behind
		$html = preg_replace('@<autop r="1">(.*?)</autop>@s', '\\1', $html);

		// commit to converting AUTOPs to Ps
		$html = str_replace('<autop>', "\n<p>", $html);
		$html = str_replace('</autop>', "</p>\n", $html);

		$html = str_replace('<br>', '<br />', $html);
		$html = str_replace($this->_unique . 'AMP', '&', $html);
		return $html;
	}

	/**
	 * Load markup into $this->_doc with libxml warnings suppressed, restoring the
	 * previous libxml settings afterwards.
	 *
	 * @see http://www.php.net/manual/en/domdocument.loadhtml.php#95463
	 *
	 * @param string $html Markup to parse
	 *
	 * @return bool Whether DOMDocument::loadHTML() succeeded
	 */
	private function loadHtmlQuietly($html) {
		$use_internal_errors = libxml_use_internal_errors(true);

		// Do not load entities. May be unnecessary, better safe than sorry.
		// libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is only
		// called on older runtimes to avoid a deprecation notice on every parse.
		$toggle_entity_loader = (PHP_VERSION_ID < 80000);
		$disable_load_entities = false;
		if ($toggle_entity_loader) {
			$disable_load_entities = libxml_disable_entity_loader(true);
		}

		try {
			return (bool) $this->_doc->loadHTML($html);
		} finally {
			libxml_use_internal_errors($use_internal_errors);
			if ($toggle_entity_loader) {
				libxml_disable_entity_loader($disable_load_entities);
			}
		}
	}

	/**
	 * Add P and BR elements as necessary.
	 *
	 * Works through a queue instead of recursing: elements named in $_descendList
	 * are queued for the block pass, and inline elements containing newlines are
	 * queued for a second pass that replaces newlines with BR placeholders.
	 *
	 * @param DOMElement $el DOM element to start from
	 *
	 * @return void
	 */
	protected function addParagraphs(DOMElement $el) {
		// no need to call recursively, just queue up
		$elsToProcess = [$el];
		$inlinesToProcess = [];
		while ($el = array_shift($elsToProcess)) {
			// if true, we can alter all child nodes, if not, we'll just queue
			// each child element that is in the descend list
			$alterInline = in_array($el->nodeName, $this->_alterList, true);

			// inside affected elements, we want to trim leading whitespace from
			// the first text node
			$ltrimFirstTextNode = true;

			// should we open a new AUTOP element to move inline elements into?
			$openP = true;
			$autop = null;

			// after BR, ignore a newline
			$isFollowingBr = false;

			$node = $el->firstChild;
			while (null !== $node) {
				if ($alterInline && $openP) {
					$openP = false;
					// create a P to move inline content into (this may be removed later)
					$autop = $el->insertBefore($this->_doc->createElement('autop'), $node);
				}

				$isElement = ($node->nodeType === XML_ELEMENT_NODE);
				if ($isElement) {
					$isBlock = in_array($node->nodeName, $this->_blocks, true);
					if (!$isBlock) {
						// if we start with an inline element we don't need to do this
						$ltrimFirstTextNode = false;
					}
				} else {
					$isBlock = false;
				}

				if ($alterInline) {
					$isText = ($node->nodeType === XML_TEXT_NODE);
					$isLastInline = (!$node->nextSibling
						|| ($node->nextSibling->nodeType === XML_ELEMENT_NODE
							&& in_array($node->nextSibling->nodeName, $this->_blocks, true)));
					if ($isElement) {
						$isFollowingBr = ($node->nodeName === 'br');
					}

					if ($isText) {
						$nodeText = $node->nodeValue;

						if ($ltrimFirstTextNode) {
							// we're at the beginning of a sequence of text/inline elements
							$nodeText = ltrim($nodeText);
							$ltrimFirstTextNode = false;
						}
						if ($isFollowingBr && preg_match('@^[ \\t]*\\n[ \\t]*@', $nodeText, $m)) {
							// if a user ends a line with <br>, don't add a second BR
							$nodeText = substr($nodeText, strlen($m[0]));
						}
						if ($isLastInline) {
							// we're at the end of a sequence of text/inline elements
							$nodeText = rtrim($nodeText);
						}
						$nodeText = str_replace("\n", $this->_unique . 'NL', $nodeText);
						$tmpNode = $node;
						$node = $node->nextSibling; // advance before the node is moved

						// alter node in place, then move into AUTOP
						$tmpNode->nodeValue = $nodeText;
						$autop->appendChild($tmpNode);

						continue;
					}
				}
				if ($isBlock || !$node->nextSibling) {
					if ($isBlock && in_array($node->nodeName, $this->_descendList, true)) {
						$elsToProcess[] = $node;
					}
					// whatever follows a block starts a new AUTOP
					$openP = true;
					$ltrimFirstTextNode = true;
				}
				if ($alterInline && !$isBlock) {
					$tmpNode = $node;
					if ($isElement && false !== strpos($tmpNode->textContent, "\n")) {
						$inlinesToProcess[] = $tmpNode;
					}
					$node = $node->nextSibling; // advance before the node is moved
					$autop->appendChild($tmpNode);
					continue;
				}

				$node = $node->nextSibling;
			}
		}

		// handle inline nodes
		// no need to recurse, just queue up
		while ($el = array_shift($inlinesToProcess)) {
			$ignoreLeadingNewline = false;
			foreach ($el->childNodes as $node) {
				if ($node->nodeType === XML_ELEMENT_NODE) {
					// compare the tag name: an element's nodeValue is its text
					// content and is never the tag name
					if ($node->nodeName === 'br') {
						$ignoreLeadingNewline = true;
					} else {
						$ignoreLeadingNewline = false;
						if (false !== strpos($node->textContent, "\n")) {
							$inlinesToProcess[] = $node;
						}
					}
					continue;
				} elseif ($node->nodeType === XML_TEXT_NODE) {
					$text = $node->nodeValue;
					// strncmp() is safe on an empty string, unlike $text[0]
					if ($ignoreLeadingNewline && strncmp($text, "\n", 1) === 0) {
						// the user ended the line with <br>; don't add a second BR
						$text = substr($text, 1);
						$ignoreLeadingNewline = false;
					}
					$node->nodeValue = str_replace("\n", $this->_unique . 'BR', $text);
				}
			}
		}
	}
}
process($html)
Create wrapper P and BR elements in HTML depending on newlines.
Definition: ElggAutoP.php:90
$html
Definition: section.php:10
$text
Definition: default.php:28
if($item instanceof\ElggEntity) elseif($item instanceof\ElggRiverItem) elseif(is_callable([$item, 'getType']))
Definition: item.php:39
addParagraphs(DOMElement $el)
Add P and BR elements as necessary.
Definition: ElggAutoP.php:223
__construct()
Constructor.
Definition: ElggAutoP.php:71