Elgg  Version 1.9
ElggAutoP.php
Go to the documentation of this file.
1 <?php
2 
/**
 * Create wrapper P and BR elements in HTML depending on newlines.
 *
 * The input is parsed into a DOM, inline content is gathered into temporary
 * <autop> elements, newlines are replaced by unique placeholder tokens, and
 * the serialized result is finally rewritten so that the <autop> elements
 * become <p> elements and the newline placeholders become <br /> elements.
 */
class ElggAutoP {

	/**
	 * @var string Character set declared to the HTML parser in the wrapper document
	 */
	public $encoding = 'UTF-8';

	/**
	 * @var DOMDocument Document currently being processed
	 */
	protected $_doc = null;

	/**
	 * @var DOMXPath XPath helper bound to $_doc
	 */
	protected $_xpath = null;

	/**
	 * @var string|array Element names treated as blocks (split into an array by the constructor)
	 */
	protected $_blocks = 'address article area aside blockquote caption col colgroup dd
		details div dl dt fieldset figure figcaption footer form h1 h2 h3 h4 h5 h6 header
		hr hgroup legend map math menu nav noscript p pre section select style summary
		table tbody td tfoot th thead tr ul ol option li';

	/**
	 * @var string|array Element names listed as inline (split into an array by the constructor)
	 */
	protected $_inlines = 'a abbr audio b button canvas caption cite code command datalist
		del dfn em embed i iframe img input ins kbd keygen label map mark meter object
		output progress q rp rt ruby s samp script select small source span strong style
		sub sup textarea time var video wbr';

	/**
	 * @var string|array Block elements whose children are queued for processing
	 *                   (split into an array by the constructor)
	 */
	protected $_descendList = 'article aside blockquote body details div footer form
		header section';

	/**
	 * @var string|array Elements whose direct inline children may be moved into AUTOP
	 *                   elements (split into an array by the constructor)
	 */
	protected $_alterList = 'article aside blockquote body details div footer header
		section';

	/**
	 * @var string Token prefix used to build the AMP/NL/BR placeholders
	 */
	protected $_unique = '';

	/**
	 * Constructor. Splits the whitespace-separated element lists into arrays and
	 * derives the placeholder prefix.
	 */
	public function __construct() {
		$this->_blocks = preg_split('@\\s+@', $this->_blocks);
		$this->_descendList = preg_split('@\\s+@', $this->_descendList);
		$this->_alterList = preg_split('@\\s+@', $this->_alterList);
		$this->_inlines = preg_split('@\\s+@', $this->_inlines);
		$this->_unique = md5(__FILE__);
	}

	/**
	 * Create wrapper P and BR elements in HTML depending on newlines.
	 *
	 * @param string $html HTML fragment (it is wrapped in a BODY element before parsing)
	 *
	 * @return string|false Processed fragment, or false if the markup could not be parsed
	 */
	public function process($html) {
		// normalize whitespace
		$html = str_replace(array("\r\n", "\r"), "\n", $html);

		// allows preserving entities untouched
		$html = str_replace('&', $this->_unique . 'AMP', $html);

		$this->_doc = new DOMDocument();

		// parse to DOM, suppressing loadHTML warnings. The previous setting is
		// remembered so this global libxml switch is not left altered.
		// http://www.php.net/manual/en/domdocument.loadhtml.php#95463
		$useInternalErrors = libxml_use_internal_errors(true);

		$wrapped = "<html><meta http-equiv='content-type' "
			. "content='text/html; charset={$this->encoding}'><body>{$html}</body>"
			. "</html>";

		if (!$this->loadHtmlSafely($wrapped)) {
			$this->restoreLibxmlErrors($useInternalErrors);
			return false;
		}

		$this->_xpath = new DOMXPath($this->_doc);
		// start processing at the BODY element
		$nodeList = $this->_xpath->query('//body[1]');
		$this->addParagraphs($nodeList->item(0));

		// serialize back to HTML
		$html = $this->_doc->saveHTML();

		// Note: we create <autop> elements, which will later be converted to paragraphs

		// split AUTOPs into multiples at /\n\n+/
		$html = preg_replace('/(' . $this->_unique . 'NL){2,}/', '</autop><autop>', $html);
		$html = str_replace(
			array($this->_unique . 'BR', $this->_unique . 'NL', '<br>'),
			'<br />',
			$html
		);
		$html = str_replace('<br /></autop>', '</autop>', $html);

		// re-parse so we can handle new AUTOP elements
		$reloaded = $this->loadHtmlSafely($html);

		// no further parsing happens below, so hand the libxml setting back now
		$this->restoreLibxmlErrors($useInternalErrors);

		if (!$reloaded) {
			return false;
		}

		// must re-create XPath object after DOM load
		$this->_xpath = new DOMXPath($this->_doc);

		// strip AUTOPs that only have comments/whitespace
		foreach ($this->_xpath->query('//autop') as $autop) {
			/* @var DOMElement $autop */
			$hasContent = false;
			if (trim($autop->textContent) !== '') {
				$hasContent = true;
			} else {
				foreach ($autop->childNodes as $node) {
					if ($node->nodeType === XML_ELEMENT_NODE) {
						$hasContent = true;
						break;
					}
				}
			}
			if (!$hasContent) {
				// mark to be later replaced w/ preg_replace (faster than moving nodes out)
				$autop->setAttribute("r", "1");
			}
		}

		// If a DIV contains a single AUTOP, remove it
		foreach ($this->_xpath->query('//div') as $el) {
			/* @var DOMElement $el */
			$autops = $this->_xpath->query('./autop', $el);
			if ($autops->length === 1) {
				$firstAutop = $autops->item(0);
				/* @var DOMElement $firstAutop */
				$firstAutop->setAttribute("r", "1");
			}
		}

		$html = $this->_doc->saveHTML();

		// trim to the contents of BODY
		$bodyStart = strpos($html, '<body>');
		$bodyEnd = strpos($html, '</body>', $bodyStart + 6);
		$html = substr($html, $bodyStart + 6, $bodyEnd - $bodyStart - 6);

		// strip AUTOPs that should be removed. The "s" modifier lets "." span
		// newlines, which survive inside comments (only text nodes get their
		// newlines replaced by placeholders).
		$html = preg_replace('@<autop r="1">(.*?)</autop>@s', '\\1', $html);

		// commit to converting AUTOPs to Ps
		$html = str_replace('<autop>', "\n<p>", $html);
		$html = str_replace('</autop>', "</p>\n", $html);

		$html = str_replace('<br>', '<br />', $html);
		$html = str_replace($this->_unique . 'AMP', '&', $html);
		return $html;
	}

	/**
	 * Load markup into $_doc with the libxml entity loader disabled where that
	 * switch is still supported.
	 *
	 * libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is only
	 * called on older versions; the previous setting is always restored.
	 *
	 * @param string $html Markup to parse
	 *
	 * @return bool Result of DOMDocument::loadHTML()
	 */
	protected function loadHtmlSafely($html) {
		$canToggle = function_exists('libxml_disable_entity_loader')
			&& version_compare(PHP_VERSION, '8.0.0', '<');

		$previous = false;
		if ($canToggle) {
			// Do not load entities. May be unnecessary, better safe than sorry
			$previous = libxml_disable_entity_loader(true);
		}

		$loaded = $this->_doc->loadHTML($html);

		if ($canToggle) {
			libxml_disable_entity_loader($previous);
		}

		return (bool) $loaded;
	}

	/**
	 * Put libxml's internal error handling back to the state found on entry.
	 *
	 * @param bool $previous Value returned by libxml_use_internal_errors(true)
	 *
	 * @return void
	 */
	protected function restoreLibxmlErrors($previous) {
		if (!$previous) {
			// buffering was switched on by this class only, so the collected
			// parse errors are ours to discard
			libxml_clear_errors();
		}
		libxml_use_internal_errors($previous);
	}

	/**
	 * Add P and BR elements as necessary.
	 *
	 * Works through a queue instead of recursing: elements from the descend
	 * list are queued for the same treatment, and inline elements containing
	 * newlines are queued for a second pass that turns newlines into BR
	 * placeholders.
	 *
	 * @param DOMElement $el Element to start from
	 *
	 * @return void
	 */
	protected function addParagraphs(DOMElement $el) {
		// no need to call recursively, just queue up
		$elsToProcess = array($el);
		$inlinesToProcess = array();
		while ($el = array_shift($elsToProcess)) {
			// if true, we can alter all child nodes, if not, we'll just call
			// addParagraphs on each element in the descendInto list
			$alterInline = in_array($el->nodeName, $this->_alterList, true);

			// inside affected elements, we want to trim leading whitespace from
			// the first text node
			$ltrimFirstTextNode = true;

			// should we open a new AUTOP element to move inline elements into?
			$openP = true;
			$autop = null;

			// after BR, ignore a newline
			$isFollowingBr = false;

			$node = $el->firstChild;
			while (null !== $node) {
				if ($alterInline) {
					if ($openP) {
						$openP = false;
						// create a P to move inline content into (this may be removed later)
						$autop = $el->insertBefore($this->_doc->createElement('autop'), $node);
					}
				}

				$isElement = ($node->nodeType === XML_ELEMENT_NODE);
				if ($isElement) {
					$isBlock = in_array($node->nodeName, $this->_blocks, true);
				} else {
					$isBlock = false;
				}

				if ($alterInline) {
					$isText = ($node->nodeType === XML_TEXT_NODE);
					$isLastInline = (! $node->nextSibling
						|| ($node->nextSibling->nodeType === XML_ELEMENT_NODE
							&& in_array($node->nextSibling->nodeName, $this->_blocks, true)));
					if ($isElement) {
						$isFollowingBr = ($node->nodeName === 'br');
					}

					if ($isText) {
						$nodeText = $node->nodeValue;
						if ($ltrimFirstTextNode) {
							$nodeText = ltrim($nodeText);
							$ltrimFirstTextNode = false;
						}
						if ($isFollowingBr && preg_match('@^[ \\t]*\\n[ \\t]*@', $nodeText, $m)) {
							// if a user ends a line with <br>, don't add a second BR
							$nodeText = substr($nodeText, strlen($m[0]));
						}
						if ($isLastInline) {
							$nodeText = rtrim($nodeText);
						}
						$nodeText = str_replace("\n", $this->_unique . 'NL', $nodeText);
						$tmpNode = $node;
						$node = $node->nextSibling; // move loop to next node

						// alter node in place, then move into AUTOP
						$tmpNode->nodeValue = $nodeText;
						$autop->appendChild($tmpNode);

						continue;
					}
				}
				if ($isBlock || ! $node->nextSibling) {
					if ($isBlock) {
						if (in_array($node->nodeName, $this->_descendList, true)) {
							$elsToProcess[] = $node;
						}
					}
					$openP = true;
					$ltrimFirstTextNode = true;
				}
				if ($alterInline) {
					if (! $isBlock) {
						$tmpNode = $node;
						if ($isElement && false !== strpos($tmpNode->textContent, "\n")) {
							$inlinesToProcess[] = $tmpNode;
						}
						// advance before moving, the move changes the node's siblings
						$node = $node->nextSibling;
						$autop->appendChild($tmpNode);
						continue;
					}
				}

				$node = $node->nextSibling;
			}
		}

		// handle inline nodes
		// no need to recurse, just queue up
		while ($el = array_shift($inlinesToProcess)) {
			$ignoreLeadingNewline = false;
			foreach ($el->childNodes as $node) {
				if ($node->nodeType === XML_ELEMENT_NODE) {
					// compare the tag name: an element's nodeValue is its text
					// content and is never "BR"
					if ($node->nodeName === 'br') {
						$ignoreLeadingNewline = true;
					} else {
						$ignoreLeadingNewline = false;
						if (false !== strpos($node->textContent, "\n")) {
							$inlinesToProcess[] = $node;
						}
					}
					continue;
				} elseif ($node->nodeType === XML_TEXT_NODE) {
					$text = $node->nodeValue;
					// strncmp() is safe on an empty string, unlike $text[0]
					if ($ignoreLeadingNewline && strncmp($text, "\n", 1) === 0) {
						// the user already wrote a BR, don't add a second one
						$text = (string) substr($text, 1);
						$ignoreLeadingNewline = false;
					}
					$node->nodeValue = str_replace("\n", $this->_unique . 'BR', $text);
				}
			}
		}
	}
}
$m
Definition: metadata.php:11
process($html)
Create wrapper P and BR elements in HTML depending on newlines.
Definition: ElggAutoP.php:82
$text
Definition: default.php:25
foreach($emails as $email) $html
Definition: exceptions.php:34
addParagraphs(DOMElement $el)
Add P and BR elements as necessary.
Definition: ElggAutoP.php:196
__construct()
Constructor.
Definition: ElggAutoP.php:63