Elgg  Version master
AutoParagraph.php
Go to the documentation of this file.
1 <?php
2 
3 namespace Elgg\Views;
4 
6 
16 
/**
 * Character encoding; process() interpolates it into the charset of the
 * wrapper document's content-type META element before parsing.
 */
20  public $encoding = 'UTF-8';
21 
/**
 * DOM document for the HTML being processed; created in process().
 */
25  protected $_doc = null;
26 
/**
 * XPath helper bound to $_doc; re-created in process() after every DOM load.
 */
30  protected $_xpath = null;
31 
/**
 * Whitespace-separated names of block-level elements. The constructor splits
 * this into an array; addParagraphs() tests node names against it to decide
 * where a run of inline content ends.
 *
 * NOTE(review): the numbers at the start of the continuation lines of this
 * and the following list literals look like listing line numbers captured
 * inside the string - confirm against the original file.
 */
35  protected $_blocks = 'address article area aside blockquote caption col colgroup dd
36  details div dl dt fieldset figure figcaption footer form h1 h2 h3 h4 h5 h6 header
37  hr hgroup legend map math menu nav noscript p pre section select style summary
38  table tbody td tfoot th thead tr ul ol option li';
39 
/**
 * Whitespace-separated names of inline elements. Split into an array by the
 * constructor; not read by any other method visible in this listing.
 */
43  protected $_inlines = 'a abbr audio b button canvas caption cite code command datalist
44  del dfn em embed i iframe img input ins kbd keygen label map mark meter object
45  output progress q rp rt ruby s samp script select small source span strong style
46  sub sup textarea time var video wbr';
47 
/**
 * Whitespace-separated names of block elements that addParagraphs() queues
 * for processing when it meets them as children. Split into an array by the
 * constructor.
 */
53  protected $_descendList = 'article aside blockquote body details div footer form
54  header section';
55 
/**
 * Whitespace-separated names of elements whose child nodes addParagraphs()
 * is allowed to move into AUTOP wrappers. Split into an array by the
 * constructor.
 */
61  protected $_alterList = 'article aside blockquote body details div footer header
62  section';
63 
/**
 * Prefix for the temporary placeholder tokens (AMP, NL, BR) inserted into
 * the markup during processing; set to md5(__FILE__) by the constructor.
 */
67  protected $_unique = '';
68 
/**
 * Constructor.
 *
 * Turns the whitespace-separated element name lists into arrays and sets up
 * the prefix used for the temporary placeholder tokens.
 */
public function __construct() {
	// every list is declared as one whitespace-separated string
	foreach (['_blocks', '_descendList', '_alterList', '_inlines'] as $listProperty) {
		$this->$listProperty = preg_split('@\\s+@', $this->$listProperty);
	}

	$this->_unique = md5(__FILE__);
}
79 
/**
 * Create wrapper P and BR elements in HTML depending on newlines.
 *
 * Inline content found inside the elements of the alter list is first moved
 * into temporary AUTOP elements. These are split at blank lines, single
 * newlines become BR elements, and the remaining AUTOPs are finally rewritten
 * as P elements. Ampersands are masked during processing so that entities
 * come back untouched.
 *
 * @param string $html HTML fragment to process
 *
 * @return string|false the processed HTML ('' if $html is not set), or false
 *                      if the markup could not be loaded into a DOM
 * @throws RuntimeException if the XPath query for BODY yields a text node
 *                          even after re-creating the XPath object
 */
public function process($html) {
	if (!isset($html)) {
		return '';
	}

	// normalize whitespace
	$html = str_replace(["\r\n", "\r"], "\n", $html);

	// allows preserving entities untouched
	$html = str_replace('&', $this->_unique . 'AMP', $html);

	$this->_doc = new \DOMDocument();

	// parse to DOM, suppressing loadHTML warnings
	// http://www.php.net/manual/en/domdocument.loadhtml.php#95463
	$use_internal_errors = libxml_use_internal_errors(true);

	$load_result = $this->_doc->loadHTML("<html><meta http-equiv='content-type' content='text/html; charset={$this->encoding}'><body>{$html}</body></html>", LIBXML_NOBLANKS);

	// drop the errors libxml buffered while parsing (otherwise they pile up
	// from call to call), then restore warnings
	libxml_clear_errors();
	libxml_use_internal_errors($use_internal_errors);
	if (!$load_result) {
		return false;
	}

	$this->_xpath = new \DOMXPath($this->_doc);

	// start processing recursively at the BODY element
	$nodeList = $this->_xpath->query('//body[1]');
	if ($nodeList->item(0) instanceof \DOMText) {
		// May be https://github.com/facebook/hhvm/issues/7745
		// Um... try again?
		$this->_xpath = new \DOMXPath($this->_doc);
		$nodeList = $this->_xpath->query('//body[1]');

		if ($nodeList->item(0) instanceof \DOMText) {
			// not going to work
			throw new RuntimeException('DOMXPath::query for BODY element returned a text node');
		}
	}

	$this->addParagraphs($nodeList->item(0));

	// serialize back to HTML
	$html = $this->_doc->saveHTML();

	// Note: we create <autop> elements, which will later be converted to paragraphs

	// split AUTOPs into multiples at /\n\n+/
	$html = preg_replace('/(' . $this->_unique . 'NL){2,}/', '</autop><autop>', $html);
	$html = str_replace(
		[$this->_unique . 'BR', $this->_unique . 'NL', '<br>'],
		'<br />',
		$html
	);
	$html = str_replace('<br /></autop>', '</autop>', $html);

	// re-parse so we can handle new AUTOP elements

	// parse to DOM, suppressing loadHTML warnings
	// http://www.php.net/manual/en/domdocument.loadhtml.php#95463
	$use_internal_errors = libxml_use_internal_errors(true);

	$load_result = $this->_doc->loadHTML($html);

	// drop buffered errors, then restore warnings
	libxml_clear_errors();
	libxml_use_internal_errors($use_internal_errors);
	if (!$load_result) {
		return false;
	}

	// must re-create XPath object after DOM load
	$this->_xpath = new \DOMXPath($this->_doc);

	// strip AUTOPs that only have comments/whitespace
	foreach ($this->_xpath->query('//autop') as $autop) {
		/* @var \DOMElement $autop */
		$hasContent = false;
		if (trim($autop->textContent) !== '') {
			$hasContent = true;
		} else {
			foreach ($autop->childNodes as $node) {
				if ($node->nodeType === XML_ELEMENT_NODE) {
					$hasContent = true;
					break;
				}
			}
		}

		if (!$hasContent) {
			// mark to be later replaced w/ preg_replace (faster than moving nodes out)
			$autop->setAttribute('r', '1');
		}
	}

	// If a DIV contains a single AUTOP, remove it
	foreach ($this->_xpath->query('//div') as $el) {
		/* @var \DOMElement $el */
		$autops = $this->_xpath->query('./autop', $el);
		if ($autops->length === 1) {
			$firstAutop = $autops->item(0);
			/* @var \DOMElement $firstAutop */
			$firstAutop->setAttribute('r', '1');
		}
	}

	$html = $this->_doc->saveHTML();

	// trim to the contents of BODY (6 is the length of '<body>')
	$bodyStart = elgg_strpos($html, '<body>');
	$bodyEnd = elgg_strpos($html, '</body>', $bodyStart + 6);
	$html = elgg_substr($html, $bodyStart + 6, $bodyEnd - $bodyStart - 6);

	// strip AUTOPs that should be removed; the "s" modifier lets "." span
	// newlines so that a marked AUTOP holding multi-line content (e.g. a
	// multi-line comment) is unwrapped too instead of leaving a stray tag
	$html = preg_replace('@<autop r="1">(.*?)</autop>@s', '\\1', $html);

	// commit to converting AUTOPs to Ps
	$html = str_replace('<autop>', "\n<p>", $html);
	$html = str_replace('</autop>', "</p>\n", $html);

	$html = str_replace('<br>', '<br />', $html);

	// restore the ampersands masked at the start
	return str_replace($this->_unique . 'AMP', '&', $html);
}
213 
/**
 * Add P and BR elements as necessary.
 *
 * Works through a queue instead of recursing. For every queued element that
 * is in the alter list, runs of text and inline nodes are moved into
 * temporary AUTOP elements and newlines in text nodes are replaced by NL
 * placeholder tokens. Block children that are in the descend list are queued
 * in turn. Afterwards, newlines inside inline elements are replaced by BR
 * placeholder tokens. process() turns the AUTOPs and tokens into real markup.
 *
 * @param \DOMElement $el element to start at (process() passes BODY)
 *
 * @return void
 */
protected function addParagraphs(\DOMElement $el) {
	// no need to call recursively, just queue up
	$elsToProcess = [$el];
	$inlinesToProcess = [];
	while ($el = array_shift($elsToProcess)) {
		// if true, we can alter all child nodes, if not, we'll just call
		// addParagraphs on each element in the descendInto list
		$alterInline = in_array($el->nodeName, $this->_alterList, true);

		// inside affected elements, we want to trim leading whitespace from
		// the first text node
		$ltrimFirstTextNode = true;

		// should we open a new AUTOP element to move inline elements into?
		$openP = true;
		$autop = null;

		// after BR, ignore a newline
		$isFollowingBr = false;

		$node = $el->firstChild;
		while (isset($node)) {
			if ($alterInline && $openP) {
				$openP = false;
				// create a P to move inline content into (this may be removed later)
				$autop = $el->insertBefore($this->_doc->createElement('autop'), $node);
			}

			$isElement = ($node->nodeType === XML_ELEMENT_NODE);
			if ($isElement) {
				$isBlock = in_array($node->nodeName, $this->_blocks, true);
				if (!$isBlock) {
					// if we start with an inline element we don't need to do this
					$ltrimFirstTextNode = false;
				}
			} else {
				$isBlock = false;
			}

			if ($alterInline) {
				$isText = ($node->nodeType === XML_TEXT_NODE);
				$isLastInline = (!$node->nextSibling
					|| ($node->nextSibling->nodeType === XML_ELEMENT_NODE
						&& in_array($node->nextSibling->nodeName, $this->_blocks, true)));
				if ($isElement) {
					$isFollowingBr = ($node->nodeName === 'br');
				}

				if ($isText) {
					$nodeText = $node->nodeValue;

					if ($ltrimFirstTextNode) {
						// we're at the beginning of a sequence of text/inline elements
						$nodeText = ltrim($nodeText);
						$ltrimFirstTextNode = false;
					}

					$matches = [];
					if ($isFollowingBr && preg_match('@^[ \\t]*\\n[ \\t]*@', $nodeText, $matches)) {
						// if a user ends a line with <br>, don't add a second BR
						$nodeText = elgg_substr($nodeText, elgg_strlen($matches[0]));
					}

					if ($isLastInline) {
						// we're at the end of a sequence of text/inline elements
						$nodeText = rtrim($nodeText);
					}

					$nodeText = str_replace("\n", $this->_unique . 'NL', $nodeText);
					$tmpNode = $node;
					$node = $node->nextSibling; // move loop to next node

					// alter node in place, then move into AUTOP
					$tmpNode->nodeValue = $nodeText;
					$autop->appendChild($tmpNode);

					continue;
				}
			}

			if ($isBlock || !$node->nextSibling) {
				if ($isBlock && in_array($node->nodeName, $this->_descendList, true)) {
					// queue the block instead of recursing into it
					$elsToProcess[] = $node;
				}

				$openP = true;
				$ltrimFirstTextNode = true;
			}

			if ($alterInline && !$isBlock) {
				$tmpNode = $node;
				if ($isElement && elgg_strpos($tmpNode->textContent, "\n") !== false) {
					// newlines inside this inline element are handled below
					$inlinesToProcess[] = $tmpNode;
				}

				// advance before moving, appendChild() changes the node's siblings
				$node = $node->nextSibling;
				$autop->appendChild($tmpNode);
				continue;
			}

			$node = $node->nextSibling;
		}
	}

	// handle inline nodes
	// no need to recurse, just queue up
	while ($el = array_shift($inlinesToProcess)) {
		$ignoreLeadingNewline = false;
		foreach ($el->childNodes as $node) {
			if ($node->nodeType === XML_ELEMENT_NODE) {
				// Elements are identified by nodeName (lower case here, as in the
				// BR check of the block-level pass above). The nodeValue of a BR
				// element is never 'BR', so testing it could not detect a BR.
				if ($node->nodeName === 'br') {
					$ignoreLeadingNewline = true;
				} else {
					$ignoreLeadingNewline = false;
					if (elgg_strpos($node->textContent, "\n") !== false) {
						$inlinesToProcess[] = $node;
					}
				}
			} elseif ($node->nodeType === XML_TEXT_NODE) {
				$text = $node->nodeValue;
				// guard against an empty text node before looking at its first byte
				if ($ignoreLeadingNewline && $text !== '' && $text[0] === "\n") {
					// the newline directly after a BR must not become a second BR
					$text = substr($text, 1);
					$ignoreLeadingNewline = false;
				}

				$node->nodeValue = str_replace("\n", $this->_unique . 'BR', $text);
			}
		}
	}
}
358 }
__construct()
Constructor.
Exception thrown if an error which can only be found on runtime occurs.
Create wrapper P and BR elements in HTML depending on newlines.
if($item instanceof\ElggEntity) elseif($item instanceof\ElggRiverItem) elseif($item instanceof\ElggRelationship) elseif(is_callable([$item, 'getType']))
Definition: item.php:48
elgg_strlen()
Wrapper function for mb_strlen().
Definition: mb_wrapper.php:53
$html
A wrapper to render a section of the page shell.
Definition: section.php:9
elgg_strpos()
Wrapper function for mb_strpos().
Definition: mb_wrapper.php:71
addParagraphs(\DOMElement $el)
Add P and BR elements as necessary.
elgg_substr()
Wrapper function for mb_substr().
Definition: mb_wrapper.php:194
$text
Definition: button.php:33
process($html)
Create wrapper P and BR elements in HTML depending on newlines.