Elgg  Version 4.x
ElggAutoP.php
Go to the documentation of this file.
1 <?php
2 
/**
 * Create wrapper P and BR elements in HTML depending on newlines.
 *
 * The input fragment is parsed into a DOM, runs of text/inline nodes inside
 * selected container elements are moved into temporary <autop> elements,
 * newlines are turned into placeholder tokens, and the serialized result is
 * finally rewritten so that <autop> becomes <p> and the tokens become <br />.
 */
class ElggAutoP {

    /**
     * @var string Character set written into the META tag of the wrapper
     *             document that is handed to DOMDocument::loadHTML()
     */
    public $encoding = 'UTF-8';

    /**
     * @var DOMDocument|null Document currently being processed (set by process())
     */
    protected $_doc = null;

    /**
     * @var DOMXPath|null XPath helper bound to $_doc (re-created after every load)
     */
    protected $_xpath = null;

    /**
     * @var string|string[] Names of elements treated as blocks. Declared as a
     *                      whitespace-separated string, split into an array by
     *                      the constructor.
     */
    protected $_blocks = 'address article area aside blockquote caption col colgroup dd
        details div dl dt fieldset figure figcaption footer form h1 h2 h3 h4 h5 h6 header
        hr hgroup legend map math menu nav noscript p pre section select style summary
        table tbody td tfoot th thead tr ul ol option li';

    /**
     * @var string|string[] Names of inline elements. Split into an array by the
     *                      constructor; not consulted by the methods of this class.
     */
    protected $_inlines = 'a abbr audio b button canvas caption cite code command datalist
        del dfn em embed i iframe img input ins kbd keygen label map mark meter object
        output progress q rp rt ruby s samp script select small source span strong style
        sub sup textarea time var video wbr';

    /**
     * @var string|string[] Block elements whose children are queued for
     *                      processing by addParagraphs(). Split into an array
     *                      by the constructor.
     */
    protected $_descendList = 'article aside blockquote body details div footer form
        header section';

    /**
     * @var string|string[] Elements whose text/inline children are moved into
     *                      AUTOP wrappers. Split into an array by the constructor.
     */
    protected $_alterList = 'article aside blockquote body details div footer header
        section';

    /**
     * @var string Prefix of the placeholder tokens (AMP, NL, BR) inserted into the markup
     */
    protected $_unique = '';

    /**
     * Constructor.
     *
     * Splits the whitespace-separated element lists into arrays and derives
     * the placeholder prefix from the MD5 hash of this file's path.
     */
    public function __construct() {
        $this->_blocks = preg_split('@\\s+@', $this->_blocks);
        $this->_descendList = preg_split('@\\s+@', $this->_descendList);
        $this->_alterList = preg_split('@\\s+@', $this->_alterList);
        $this->_inlines = preg_split('@\\s+@', $this->_inlines);
        $this->_unique = md5(__FILE__);
    }

    /**
     * Create wrapper P and BR elements in HTML depending on newlines.
     *
     * @param string $html HTML fragment to process
     *
     * @return string|false Processed markup ('' when $html is null), or false
     *                      when DOMDocument::loadHTML() reports failure
     *
     * @throws \RuntimeException When the XPath query for BODY keeps returning a text node
     */
    public function process($html) {
        if (!isset($html)) {
            return '';
        }

        // normalize whitespace
        $html = str_replace(["\r\n", "\r"], "\n", $html);

        // hide ampersands behind a token so entities pass through the parser untouched
        $html = str_replace('&', $this->_unique . 'AMP', $html);

        $this->_doc = new DOMDocument();

        $wrapped = "<html><meta http-equiv='content-type' "
            . "content='text/html; charset={$this->encoding}'><body>{$html}</body>"
            . "</html>";
        if (!$this->loadMarkup($wrapped, LIBXML_NOBLANKS)) {
            return false;
        }

        $this->_xpath = new DOMXPath($this->_doc);

        // start processing at the BODY element
        $nodeList = $this->_xpath->query('//body[1]');
        if ($nodeList->item(0) instanceof DOMText) {
            // May be https://github.com/facebook/hhvm/issues/7745
            // Retry once with a fresh XPath object.
            $this->_xpath = new DOMXPath($this->_doc);
            $nodeList = $this->_xpath->query('//body[1]');

            if ($nodeList->item(0) instanceof DOMText) {
                // not going to work
                throw new \RuntimeException('DOMXPath::query for BODY element returned a text node');
            }
        }
        $this->addParagraphs($nodeList->item(0));

        // serialize back to HTML
        $html = $this->_doc->saveHTML();

        // Note: we create <autop> elements, which will later be converted to paragraphs

        // split AUTOPs into multiples at /\n\n+/
        $html = preg_replace('/(' . $this->_unique . 'NL){2,}/', '</autop><autop>', $html);
        $html = str_replace(
            [$this->_unique . 'BR', $this->_unique . 'NL', '<br>'],
            '<br />',
            $html
        );
        $html = str_replace('<br /></autop>', '</autop>', $html);

        // re-parse so we can handle the new AUTOP elements
        if (!$this->loadMarkup($html)) {
            return false;
        }

        // must re-create XPath object after DOM load
        $this->_xpath = new DOMXPath($this->_doc);

        // flag AUTOPs that only have comments/whitespace
        foreach ($this->_xpath->query('//autop') as $autop) {
            /* @var DOMElement $autop */
            $hasContent = (trim($autop->textContent) !== '');
            if (!$hasContent) {
                foreach ($autop->childNodes as $child) {
                    if ($child->nodeType === XML_ELEMENT_NODE) {
                        $hasContent = true;
                        break;
                    }
                }
            }
            if (!$hasContent) {
                // mark to be later replaced w/ preg_replace (faster than moving nodes out)
                $autop->setAttribute("r", "1");
            }
        }

        // If a DIV contains a single AUTOP, flag that AUTOP for removal
        foreach ($this->_xpath->query('//div') as $div) {
            /* @var DOMElement $div */
            $autops = $this->_xpath->query('./autop', $div);
            if ($autops->length === 1) {
                /* @var DOMElement $onlyAutop */
                $onlyAutop = $autops->item(0);
                $onlyAutop->setAttribute("r", "1");
            }
        }

        $html = $this->_doc->saveHTML();

        // trim to the contents of BODY (6 === length of '<body>')
        $bodyStart = elgg_strpos($html, '<body>');
        $bodyEnd = elgg_strpos($html, '</body>', $bodyStart + 6);
        $html = elgg_substr($html, $bodyStart + 6, $bodyEnd - $bodyStart - 6);

        // Unwrap flagged AUTOPs. The "s" modifier lets "." span newlines: comment
        // nodes keep their raw newlines, and without it a flagged AUTOP holding a
        // multi-line comment was never unwrapped and leaked into the output.
        $html = preg_replace('@<autop r="1">(.*?)</autop>@s', '\\1', $html);

        // commit to converting AUTOPs to Ps
        $html = str_replace('<autop>', "\n<p>", $html);
        $html = str_replace('</autop>', "</p>\n", $html);

        $html = str_replace('<br>', '<br />', $html);
        $html = str_replace($this->_unique . 'AMP', '&', $html);

        return $html;
    }

    /**
     * Load markup into $_doc while libxml warnings are collected internally.
     *
     * The previous libxml error-handling mode is restored on every exit path.
     * See http://www.php.net/manual/en/domdocument.loadhtml.php#95463
     *
     * @param string $markup  Markup handed to DOMDocument::loadHTML()
     * @param int    $options libxml option bitmask (0 for none)
     *
     * @return bool Whether DOMDocument::loadHTML() reported success
     */
    private function loadMarkup($markup, $options = 0) {
        $previous = libxml_use_internal_errors(true);
        try {
            return (bool) $this->_doc->loadHTML($markup, $options);
        } finally {
            libxml_use_internal_errors($previous);
        }
    }

    /**
     * Add P and BR elements as necessary.
     *
     * Walks $el and every queued descendant from the descend list. Inside
     * elements from the alter list, consecutive text/inline nodes are moved
     * into <autop> wrappers and newlines in text nodes become NL tokens.
     * Inline elements containing newlines are collected and handled afterwards.
     *
     * @param DOMElement $el Element to start at (process() passes BODY)
     *
     * @return void
     */
    protected function addParagraphs(DOMElement $el) {
        // no need to call recursively, just queue up
        $elsToProcess = [$el];
        $inlinesToProcess = [];

        while ($el = array_shift($elsToProcess)) {
            // if true, we can alter all child nodes; if not, we only queue the
            // children that are in the descend list
            $alterInline = in_array($el->nodeName, $this->_alterList, true);

            // inside affected elements, we want to trim leading whitespace from
            // the first text node
            $ltrimFirstTextNode = true;

            // should we open a new AUTOP element to move inline elements into?
            $openP = true;
            $autop = null;

            // after BR, ignore a newline
            $isFollowingBr = false;

            $node = $el->firstChild;
            while (null !== $node) {
                if ($alterInline && $openP) {
                    $openP = false;
                    // create a P to move inline content into (this may be removed later)
                    $autop = $el->insertBefore($this->_doc->createElement('autop'), $node);
                }

                $isElement = ($node->nodeType === XML_ELEMENT_NODE);
                $isBlock = $isElement && in_array($node->nodeName, $this->_blocks, true);
                if ($isElement && !$isBlock) {
                    // if we start with an inline element we don't need to trim
                    $ltrimFirstTextNode = false;
                }

                if ($alterInline) {
                    $isText = ($node->nodeType === XML_TEXT_NODE);
                    $next = $node->nextSibling;
                    $isLastInline = (!$next
                        || ($next->nodeType === XML_ELEMENT_NODE
                            && in_array($next->nodeName, $this->_blocks, true)));
                    if ($isElement) {
                        $isFollowingBr = ($node->nodeName === 'br');
                    }

                    if ($isText) {
                        $nodeText = $node->nodeValue;

                        if ($ltrimFirstTextNode) {
                            // we're at the beginning of a sequence of text/inline elements
                            $nodeText = ltrim($nodeText);
                            $ltrimFirstTextNode = false;
                        }
                        if ($isFollowingBr && preg_match('@^[ \\t]*\\n[ \\t]*@', $nodeText, $m)) {
                            // if a user ends a line with <br>, don't add a second BR
                            $nodeText = elgg_substr($nodeText, elgg_strlen($m[0]));
                        }
                        if ($isLastInline) {
                            // we're at the end of a sequence of text/inline elements
                            $nodeText = rtrim($nodeText);
                        }
                        $nodeText = str_replace("\n", $this->_unique . 'NL', $nodeText);

                        // advance the loop before the node is moved out of $el
                        $tmpNode = $node;
                        $node = $node->nextSibling;

                        // alter node in place, then move into AUTOP
                        $tmpNode->nodeValue = $nodeText;
                        $autop->appendChild($tmpNode);

                        continue;
                    }
                }

                if ($isBlock || !$node->nextSibling) {
                    if ($isBlock && in_array($node->nodeName, $this->_descendList, true)) {
                        $elsToProcess[] = $node;
                    }
                    $openP = true;
                    $ltrimFirstTextNode = true;
                }

                if ($alterInline && !$isBlock) {
                    $tmpNode = $node;
                    if ($isElement && false !== elgg_strpos($tmpNode->textContent, "\n")) {
                        $inlinesToProcess[] = $tmpNode;
                    }
                    // advance the loop before the node is moved out of $el
                    $node = $node->nextSibling;
                    $autop->appendChild($tmpNode);
                    continue;
                }

                $node = $node->nextSibling;
            }
        }

        $this->convertInlineNewlines($inlinesToProcess);
    }

    /**
     * Replace newlines in the text nodes of queued inline elements with BR tokens.
     *
     * Child elements that themselves contain newlines are appended to the
     * queue, so no recursion is needed.
     *
     * @param DOMNode[] $inlinesToProcess Inline elements whose text contains "\n"
     *
     * @return void
     */
    private function convertInlineNewlines(array $inlinesToProcess) {
        while ($el = array_shift($inlinesToProcess)) {
            $ignoreLeadingNewline = false;
            foreach ($el->childNodes as $node) {
                if ($node->nodeType === XML_ELEMENT_NODE) {
                    // Compare the tag name: the text content of a BR element is
                    // always empty, so testing nodeValue could never match.
                    if ($node->nodeName === 'br') {
                        $ignoreLeadingNewline = true;
                    } else {
                        $ignoreLeadingNewline = false;
                        if (false !== elgg_strpos($node->textContent, "\n")) {
                            $inlinesToProcess[] = $node;
                        }
                    }
                    continue;
                }

                if ($node->nodeType === XML_TEXT_NODE) {
                    $text = (string) $node->nodeValue;
                    // strncmp() is safe on an empty string, unlike $text[0]
                    if ($ignoreLeadingNewline && strncmp($text, "\n", 1) === 0) {
                        // the author already ended the line with <br>
                        $text = substr($text, 1);
                        $ignoreLeadingNewline = false;
                    }
                    $node->nodeValue = str_replace("\n", $this->_unique . 'BR', $text);
                }
            }
        }
    }
}
process($html)
Create wrapper P and BR elements in HTML depending on newlines.
Definition: ElggAutoP.php:87
elgg_strlen()
Wrapper function for mb_strlen().
Definition: mb_wrapper.php:52
$html
Definition: section.php:10
elgg_strpos()
Wrapper function for mb_strpos().
Definition: mb_wrapper.php:69
elgg_substr()
Wrapper function for mb_substr().
Definition: mb_wrapper.php:219
if($item instanceof\ElggEntity) elseif($item instanceof\ElggRiverItem) elseif($item instanceof ElggRelationship) elseif(is_callable([$item, 'getType']))
Definition: item.php:48
addParagraphs(DOMElement $el)
Add P and BR elements as necessary.
Definition: ElggAutoP.php:214
ElggAutoP
Create wrapper P and BR elements in HTML depending on newlines.
Definition: ElggAutoP.php:11
$text
Definition: button.php:32
__construct()
Constructor.
Definition: ElggAutoP.php:68