Elgg  Version 5.1
AutoParagraph.php
Go to the documentation of this file.
1 <?php
2 
3 namespace Elgg\Views;
4 
6 
16 
/**
 * @var string Character set name interpolated into the content-type META tag
 *             of the wrapper document handed to DOMDocument::loadHTML()
 */
20  public $encoding = 'UTF-8';
21 
/**
 * @var \DOMDocument|null Document being processed; created in process()
 */
25  protected $_doc = null;
26 
/**
 * @var \DOMXPath|null XPath helper bound to $_doc; re-created after each loadHTML()
 */
30  protected $_xpath = null;
31 
/**
 * @var string|string[] Names of elements treated as blocks. Declared as a
 *                      whitespace-separated string and split into an array by
 *                      the constructor.
 */
35  protected $_blocks = 'address article area aside blockquote caption col colgroup dd
36  details div dl dt fieldset figure figcaption footer form h1 h2 h3 h4 h5 h6 header
37  hr hgroup legend map math menu nav noscript p pre section select style summary
38  table tbody td tfoot th thead tr ul ol option li';
39 
/**
 * @var string|string[] Names of inline elements. Split into an array by the
 *                      constructor; not referenced by the methods visible in
 *                      this listing.
 */
43  protected $_inlines = 'a abbr audio b button canvas caption cite code command datalist
44  del dfn em embed i iframe img input ins kbd keygen label map mark meter object
45  output progress q rp rt ruby s samp script select small source span strong style
46  sub sup textarea time var video wbr';
47 
/**
 * @var string|string[] Block elements whose children addParagraphs() will also
 *                      visit. Split into an array by the constructor.
 */
53  protected $_descendList = 'article aside blockquote body details div footer form
54  header section';
55 
/**
 * @var string|string[] Elements whose text/inline children addParagraphs() moves
 *                      into AUTOP wrappers. Split into an array by the constructor.
 */
61  protected $_alterList = 'article aside blockquote body details div footer header
62  section';
63 
/**
 * @var string Prefix for the temporary placeholder tokens (AMP, NL, BR) inserted
 *             into the markup; set to md5(__FILE__) by the constructor
 */
67  protected $_unique = '';
68 
/**
 * Constructor.
 *
 * Turns each whitespace-separated element-name list into an array and
 * derives the placeholder prefix used while processing.
 */
public function __construct() {
	$listProperties = ['_blocks', '_descendList', '_alterList', '_inlines'];
	foreach ($listProperties as $property) {
		// each list is declared as one whitespace-separated string
		$this->{$property} = preg_split('@\\s+@', $this->{$property});
	}

	// prefix for the temporary AMP/NL/BR tokens
	$this->_unique = md5(__FILE__);
}
79 
/**
 * Create wrapper P and BR elements in HTML depending on newlines.
 *
 * The markup is parsed into a DOM, text/inline runs are moved into temporary
 * AUTOP elements (see addParagraphs()), the document is serialized and
 * re-parsed so that empty or redundant AUTOPs can be flagged, and finally the
 * remaining AUTOPs are converted to P elements.
 *
 * @param string $html HTML snippet to process
 *
 * @return string|false Processed HTML ('' when $html is null), or false when
 *                      DOMDocument::loadHTML() reports failure
 *
 * @throws RuntimeException When the XPath lookup of BODY yields a text node twice
 */
public function process($html) {
	if (!isset($html)) {
		return '';
	}

	// normalize whitespace
	$html = str_replace(["\r\n", "\r"], "\n", $html);

	// allows preserving entities untouched: hide every "&" from the parser
	// behind a placeholder that is restored just before returning
	$html = str_replace('&', $this->_unique . 'AMP', $html);

	$this->_doc = new \DOMDocument();

	// parse to DOM, suppressing loadHTML warnings
	// http://www.php.net/manual/en/domdocument.loadhtml.php#95463
	$use_internal_errors = libxml_use_internal_errors(true);

	$load_result = $this->_doc->loadHTML("<html><meta http-equiv='content-type' content='text/html; charset={$this->encoding}'><body>{$html}</body></html>", LIBXML_NOBLANKS);
	// restore warnings
	libxml_use_internal_errors($use_internal_errors);
	if (!$load_result) {
		return false;
	}

	$this->_xpath = new \DOMXPath($this->_doc);

	// start processing recursively at the BODY element
	$nodeList = $this->_xpath->query('//body[1]');
	if ($nodeList->item(0) instanceof \DOMText) {
		// May be https://github.com/facebook/hhvm/issues/7745
		// Um... try again?
		$this->_xpath = new \DOMXPath($this->_doc);
		$nodeList = $this->_xpath->query('//body[1]');

		if ($nodeList->item(0) instanceof \DOMText) {
			// not going to work
			throw new RuntimeException('DOMXPath::query for BODY element returned a text node');
		}
	}

	$this->addParagraphs($nodeList->item(0));

	// serialize back to HTML
	$html = $this->_doc->saveHTML();

	// Note: we create <autop> elements, which will later be converted to paragraphs

	// split AUTOPs into multiples at /\n\n+/
	$html = preg_replace('/(' . $this->_unique . 'NL){2,}/', '</autop><autop>', $html);
	// remaining single newline markers and BR markers become line breaks
	$html = str_replace(
		[$this->_unique . 'BR', $this->_unique . 'NL', '<br>'],
		'<br />',
		$html
	);
	// no line break directly before the end of a paragraph
	$html = str_replace('<br /></autop>', '</autop>', $html);

	// re-parse so we can handle new AUTOP elements

	// parse to DOM, suppressing loadHTML warnings
	// http://www.php.net/manual/en/domdocument.loadhtml.php#95463
	$use_internal_errors = libxml_use_internal_errors(true);

	$load_result = $this->_doc->loadHTML($html);
	// restore warnings
	libxml_use_internal_errors($use_internal_errors);
	if (!$load_result) {
		return false;
	}

	// must re-create XPath object after DOM load
	$this->_xpath = new \DOMXPath($this->_doc);

	// strip AUTOPs that only have comments/whitespace
	foreach ($this->_xpath->query('//autop') as $autop) {
		/* @var \DOMElement $autop */
		$hasContent = false;
		if (trim($autop->textContent) !== '') {
			$hasContent = true;
		} else {
			foreach ($autop->childNodes as $node) {
				if ($node->nodeType === XML_ELEMENT_NODE) {
					$hasContent = true;
					break;
				}
			}
		}

		if (!$hasContent) {
			// mark to be later replaced w/ preg_replace (faster than moving nodes out)
			$autop->setAttribute('r', '1');
		}
	}

	// If a DIV contains a single AUTOP, remove it
	foreach ($this->_xpath->query('//div') as $el) {
		/* @var \DOMElement $el */
		$autops = $this->_xpath->query('./autop', $el);
		if ($autops->length === 1) {
			$firstAutop = $autops->item(0);
			/* @var \DOMElement $firstAutop */
			$firstAutop->setAttribute('r', '1');
		}
	}

	$html = $this->_doc->saveHTML();

	// trim to the contents of BODY (6 === length of '<body>')
	$bodyStart = elgg_strpos($html, '<body>');
	$bodyEnd = elgg_strpos($html, '</body>', $bodyStart + 6);
	$html = elgg_substr($html, $bodyStart + 6, $bodyEnd - $bodyStart - 6);

	// strip AUTOPs that should be removed, keeping their contents.
	// The "s" modifier lets "." match newlines: a flagged AUTOP may hold a
	// multi-line comment, and without it the wrapper would be left in place
	// while its closing tag is turned into </p> below.
	$html = preg_replace('@<autop r="1">(.*?)</autop>@s', '\\1', $html);

	// commit to converting AUTOPs to Ps
	$html = str_replace('<autop>', "\n<p>", $html);
	$html = str_replace('</autop>', "</p>\n", $html);

	$html = str_replace('<br>', '<br />', $html);

	// restore the ampersands hidden at the start
	return str_replace($this->_unique . 'AMP', '&', $html);
}
212 
/**
 * Add P and BR elements as necessary.
 *
 * Walks the given element and, through a queue, every descendant block listed
 * in $_descendList. Inside elements listed in $_alterList, runs of text and
 * non-block nodes are moved into temporary AUTOP elements and newlines in
 * text are replaced by placeholder tokens that process() later converts.
 * Newlines inside non-block elements are handled in a second pass.
 *
 * @param \DOMElement $el DOM element to start from
 *
 * @return void
 */
protected function addParagraphs(\DOMElement $el) {
	// no need to call recursively, just queue up
	$elsToProcess = [$el];
	$inlinesToProcess = [];
	while ($el = array_shift($elsToProcess)) {
		// if true, we can alter all child nodes, if not, we'll just call
		// addParagraphs on each element in the descendInto list
		$alterInline = in_array($el->nodeName, $this->_alterList, true);

		// inside affected elements, we want to trim leading whitespace from
		// the first text node
		$ltrimFirstTextNode = true;

		// should we open a new AUTOP element to move inline elements into?
		$openP = true;
		$autop = null;

		// after BR, ignore a newline
		$isFollowingBr = false;

		$node = $el->firstChild;
		while (isset($node)) {
			if ($alterInline && $openP) {
				$openP = false;
				// create a P to move inline content into (this may be removed later)
				$autop = $el->insertBefore($this->_doc->createElement('autop'), $node);
			}

			$isElement = ($node->nodeType === XML_ELEMENT_NODE);
			$isBlock = $isElement && in_array($node->nodeName, $this->_blocks, true);
			if ($isElement && !$isBlock) {
				// if we start with an inline element we don't need to do this
				$ltrimFirstTextNode = false;
			}

			if ($alterInline) {
				$isText = ($node->nodeType === XML_TEXT_NODE);
				// last node of an inline run: nothing follows, or a block follows
				$isLastInline = (!$node->nextSibling
					|| ($node->nextSibling->nodeType === XML_ELEMENT_NODE
						&& in_array($node->nextSibling->nodeName, $this->_blocks, true)));
				if ($isElement) {
					$isFollowingBr = ($node->nodeName === 'br');
				}

				if ($isText) {
					$nodeText = $node->nodeValue;

					if ($ltrimFirstTextNode) {
						// we're at the beginning of a sequence of text/inline elements
						$nodeText = ltrim($nodeText);
						$ltrimFirstTextNode = false;
					}

					$matches = [];
					if ($isFollowingBr && preg_match('@^[ \\t]*\\n[ \\t]*@', $nodeText, $matches)) {
						// if a user ends a line with <br>, don't add a second BR
						$nodeText = elgg_substr($nodeText, elgg_strlen($matches[0]));
					}

					if ($isLastInline) {
						// we're at the end of a sequence of text/inline elements
						$nodeText = rtrim($nodeText);
					}

					$nodeText = str_replace("\n", $this->_unique . 'NL', $nodeText);
					$tmpNode = $node;
					$node = $node->nextSibling; // move loop to next node

					// alter node in place, then move into AUTOP
					$tmpNode->nodeValue = $nodeText;
					$autop->appendChild($tmpNode);

					continue;
				}
			}

			if ($isBlock || !$node->nextSibling) {
				if ($isBlock && in_array($node->nodeName, $this->_descendList, true)) {
					// queue instead of recursing
					$elsToProcess[] = $node;
				}

				// whatever follows starts a new inline run
				$openP = true;
				$ltrimFirstTextNode = true;
			}

			if ($alterInline && !$isBlock) {
				$tmpNode = $node;
				if ($isElement && elgg_strpos($tmpNode->textContent, "\n") !== false) {
					// newlines inside this element are handled in the second pass
					$inlinesToProcess[] = $tmpNode;
				}

				// advance before moving, the move changes the node's siblings
				$node = $node->nextSibling;
				$autop->appendChild($tmpNode);
				continue;
			}

			$node = $node->nextSibling;
		}
	}

	// handle inline nodes
	// no need to recurse, just queue up
	while ($el = array_shift($inlinesToProcess)) {
		$ignoreLeadingNewline = false;
		foreach ($el->childNodes as $node) {
			if ($node->nodeType === XML_ELEMENT_NODE) {
				// compare the tag name (as the first pass does), not the
				// element's text content
				if ($node->nodeName === 'br') {
					$ignoreLeadingNewline = true;
				} else {
					$ignoreLeadingNewline = false;
					if (elgg_strpos($node->textContent, "\n") !== false) {
						$inlinesToProcess[] = $node;
					}
				}
			} elseif ($node->nodeType === XML_TEXT_NODE) {
				$text = $node->nodeValue;
				// the emptiness check avoids an "Uninitialized string offset" warning
				if ($ignoreLeadingNewline && $text !== '' && $text[0] === "\n") {
					// a newline right after a BR must not produce a second BR
					$text = substr($text, 1);
					$ignoreLeadingNewline = false;
				}

				$node->nodeValue = str_replace("\n", $this->_unique . 'BR', $text);
			}
		}
	}
}
357 }
__construct()
Constructor.
Exception thrown if an error which can only be found on runtime occurs.
Create wrapper P and BR elements in HTML depending on newlines.
if($item instanceof\ElggEntity) elseif($item instanceof\ElggRiverItem) elseif($item instanceof\ElggRelationship) elseif(is_callable([$item, 'getType']))
Definition: item.php:48
elgg_strlen()
Wrapper function for mb_strlen().
Definition: mb_wrapper.php:53
$html
Definition: section.php:10
elgg_strpos()
Wrapper function for mb_strpos().
Definition: mb_wrapper.php:71
addParagraphs(\DOMElement $el)
Add P and BR elements as necessary.
elgg_substr()
Wrapper function for mb_substr().
Definition: mb_wrapper.php:230
$text
Definition: button.php:33
process($html)
Create wrapper P and BR elements in HTML depending on newlines.