Elgg  Version 2.2
 All Classes Namespaces Files Functions Variables Pages
ElggAutoP.php
Go to the documentation of this file.
1 <?php
2 
/**
 * Creates wrapper P and BR elements in HTML depending on newlines.
 *
 * The markup is parsed into a DOMDocument, runs of text/inline nodes are moved
 * into temporary <autop> elements, and newlines are replaced with placeholder
 * tokens. The serialized result is then post-processed with string/regex
 * replacements that turn <autop> into <p> and the tokens into <br />.
 */
class ElggAutoP {

	/**
	 * @var string Character set declared in the META tag of the wrapper
	 *             document handed to DOMDocument::loadHTML()
	 */
	public $encoding = 'UTF-8';

	/**
	 * @var DOMDocument|null Document currently being processed
	 */
	protected $_doc = null;

	/**
	 * @var DOMXPath|null XPath helper bound to $_doc
	 */
	protected $_xpath = null;

	/**
	 * @var string|string[] Element names treated as block-level. Declared as a
	 *                      whitespace-separated string and split into an array
	 *                      by the constructor.
	 */
	protected $_blocks = 'address article area aside blockquote caption col colgroup dd
		details div dl dt fieldset figure figcaption footer form h1 h2 h3 h4 h5 h6 header
		hr hgroup legend map math menu nav noscript p pre section select style summary
		table tbody td tfoot th thead tr ul ol option li';

	/**
	 * @var string|string[] Element names treated as inline. Split into an array
	 *                      by the constructor.
	 */
	protected $_inlines = 'a abbr audio b button canvas caption cite code command datalist
		del dfn em embed i iframe img input ins kbd keygen label map mark meter object
		output progress q rp rt ruby s samp script select small source span strong style
		sub sup textarea time var video wbr';

	/**
	 * @var string|string[] Block elements whose children are queued for
	 *                      processing by addParagraphs(). Split into an array
	 *                      by the constructor.
	 */
	protected $_descendList = 'article aside blockquote body details div footer form
		header section';

	/**
	 * @var string|string[] Elements whose text/inline children get moved into
	 *                      <autop> wrappers. Split into an array by the
	 *                      constructor.
	 */
	protected $_alterList = 'article aside blockquote body details div footer header
		section';

	/**
	 * @var string Prefix for the placeholder tokens (AMP, NL, BR) inserted into
	 *             the markup; set to md5(__FILE__) by the constructor
	 */
	protected $_unique = '';

	/**
	 * Constructor.
	 *
	 * Splits the whitespace-separated element lists into arrays and computes
	 * the placeholder token prefix.
	 */
	public function __construct() {
		$this->_blocks = preg_split('@\\s+@', $this->_blocks);
		$this->_descendList = preg_split('@\\s+@', $this->_descendList);
		$this->_alterList = preg_split('@\\s+@', $this->_alterList);
		$this->_inlines = preg_split('@\\s+@', $this->_inlines);
		$this->_unique = md5(__FILE__);
	}

	/**
	 * Create wrapper P and BR elements in HTML depending on newlines.
	 *
	 * @param string $html HTML fragment to process
	 *
	 * @return string|false Processed markup, or false if the markup could not
	 *                      be loaded into a DOM document
	 */
	public function process($html) {
		// normalize whitespace
		$html = str_replace(array("\r\n", "\r"), "\n", $html);

		// allows preserving entities untouched: the parser never sees a "&",
		// and the token is swapped back just before returning
		$html = str_replace('&', $this->_unique . 'AMP', $html);

		$this->_doc = new DOMDocument();

		$wrapped = "<html><meta http-equiv='content-type' "
			. "content='text/html; charset={$this->encoding}'><body>{$html}</body>"
			. "</html>";
		if (!$this->loadHtml($wrapped)) {
			return false;
		}

		$this->_xpath = new DOMXPath($this->_doc);

		// start processing at the BODY element
		$nodeList = $this->_xpath->query('//body[1]');
		$body = ($nodeList === false) ? null : $nodeList->item(0);
		if (!$body instanceof DOMElement) {
			// addParagraphs() requires an element; nothing sensible to do
			return false;
		}
		$this->addParagraphs($body);

		// serialize back to HTML
		$html = $this->_doc->saveHTML();

		// Note: we create <autop> elements, which will later be converted to paragraphs

		// split AUTOPs into multiples at /\n\n+/
		$html = preg_replace('/(' . $this->_unique . 'NL){2,}/', '</autop><autop>', $html);
		$html = str_replace(array($this->_unique . 'BR', $this->_unique . 'NL', '<br>'),
			'<br />',
			$html);
		$html = str_replace('<br /></autop>', '</autop>', $html);

		// re-parse so we can handle new AUTOP elements
		if (!$this->loadHtml($html)) {
			return false;
		}

		// must re-create XPath object after DOM load
		$this->_xpath = new DOMXPath($this->_doc);

		// strip AUTOPs that only have comments/whitespace
		foreach ($this->_xpath->query('//autop') as $autop) {
			/* @var DOMElement $autop */
			$hasContent = false;
			if (trim($autop->textContent) !== '') {
				$hasContent = true;
			} else {
				foreach ($autop->childNodes as $node) {
					if ($node->nodeType === XML_ELEMENT_NODE) {
						$hasContent = true;
						break;
					}
				}
			}
			if (!$hasContent) {
				// mark to be later replaced w/ preg_replace (faster than moving nodes out)
				$autop->setAttribute("r", "1");
			}
		}

		// If a DIV contains a single AUTOP, remove it
		foreach ($this->_xpath->query('//div') as $el) {
			/* @var DOMElement $el */
			$autops = $this->_xpath->query('./autop', $el);
			if ($autops->length === 1) {
				$firstAutop = $autops->item(0);
				/* @var DOMElement $firstAutop */
				$firstAutop->setAttribute("r", "1");
			}
		}

		$html = $this->_doc->saveHTML();

		// trim to the contents of BODY
		$bodyStart = strpos($html, '<body>');
		$bodyEnd = strpos($html, '</body>', $bodyStart + 6);
		$html = substr($html, $bodyStart + 6, $bodyEnd - $bodyStart - 6);

		// strip AUTOPs that should be removed. The "s" modifier lets "." match
		// newlines, so wrappers holding e.g. a multi-line comment are unwrapped
		// too instead of being left behind as a dangling <autop r="1">.
		$html = preg_replace('@<autop r="1">(.*?)</autop>@s', '\\1', $html);

		// commit to converting AUTOPs to Ps
		$html = str_replace('<autop>', "\n<p>", $html);
		$html = str_replace('</autop>', "</p>\n", $html);

		$html = str_replace('<br>', '<br />', $html);
		$html = str_replace($this->_unique . 'AMP', '&', $html);
		return $html;
	}

	/**
	 * Load markup into $this->_doc without leaking libxml global state.
	 *
	 * Parser warnings are collected internally instead of being emitted, and
	 * the previous error-handling mode is restored afterwards. On PHP < 8.0 the
	 * entity loader is disabled for the duration of the parse;
	 * libxml_disable_entity_loader() is deprecated from PHP 8.0 so it is not
	 * called there.
	 *
	 * @param string $html Complete HTML document to parse
	 *
	 * @return bool Result of DOMDocument::loadHTML()
	 */
	private function loadHtml($html) {
		// parse to DOM, suppressing loadHTML warnings
		// http://www.php.net/manual/en/domdocument.loadhtml.php#95463
		$previousErrorMode = libxml_use_internal_errors(true);

		// Do not load entities. May be unnecessary, better safe than sorry
		$restoreEntityLoader = (PHP_VERSION_ID < 80000);
		$previousEntityLoader = false;
		if ($restoreEntityLoader) {
			$previousEntityLoader = libxml_disable_entity_loader(true);
		}

		$loaded = $this->_doc->loadHTML($html);

		if ($restoreEntityLoader) {
			libxml_disable_entity_loader($previousEntityLoader);
		}

		if (!$previousErrorMode) {
			// we switched buffering on ourselves, so the buffered errors are
			// ours to discard; if it was already on, leave them for its owner
			libxml_clear_errors();
		}
		libxml_use_internal_errors($previousErrorMode);

		return $loaded;
	}

	/**
	 * Add P and BR elements as necessary.
	 *
	 * Works through a queue instead of recursing: the given element is
	 * processed first, and block children named in $_descendList are appended
	 * to the queue. Inline elements containing newlines are handled in a
	 * second pass.
	 *
	 * @param DOMElement $el Element to start from
	 *
	 * @return void
	 */
	protected function addParagraphs(DOMElement $el) {
		// no need to call recursively, just queue up
		$elsToProcess = array($el);
		$inlinesToProcess = array();
		while ($el = array_shift($elsToProcess)) {
			// if true, we can alter all child nodes, if not, we'll just call
			// addParagraphs on each element in the descendInto list
			$alterInline = in_array($el->nodeName, $this->_alterList, true);

			// inside affected elements, we want to trim leading whitespace from
			// the first text node
			$ltrimFirstTextNode = true;

			// should we open a new AUTOP element to move inline elements into?
			$openP = true;
			$autop = null;

			// after BR, ignore a newline
			$isFollowingBr = false;

			$node = $el->firstChild;
			while (null !== $node) {
				if ($alterInline) {
					if ($openP) {
						$openP = false;
						// create a P to move inline content into (this may be removed later)
						$autop = $el->insertBefore($this->_doc->createElement('autop'), $node);
					}
				}

				$isElement = ($node->nodeType === XML_ELEMENT_NODE);
				if ($isElement) {
					$isBlock = in_array($node->nodeName, $this->_blocks, true);
					if (!$isBlock) {
						// if we start with an inline element we don't need to do this
						$ltrimFirstTextNode = false;
					}
				} else {
					$isBlock = false;
				}

				if ($alterInline) {
					$isText = ($node->nodeType === XML_TEXT_NODE);
					$isLastInline = (! $node->nextSibling
						|| ($node->nextSibling->nodeType === XML_ELEMENT_NODE
							&& in_array($node->nextSibling->nodeName, $this->_blocks, true)));
					if ($isElement) {
						$isFollowingBr = ($node->nodeName === 'br');
					}

					if ($isText) {
						$nodeText = $node->nodeValue;

						if ($ltrimFirstTextNode) {
							// we're at the beginning of a sequence of text/inline elements
							$nodeText = ltrim($nodeText);
							$ltrimFirstTextNode = false;
						}
						if ($isFollowingBr && preg_match('@^[ \\t]*\\n[ \\t]*@', $nodeText, $m)) {
							// if a user ends a line with <br>, don't add a second BR
							$nodeText = substr($nodeText, strlen($m[0]));
						}
						if ($isLastInline) {
							// we're at the end of a sequence of text/inline elements
							$nodeText = rtrim($nodeText);
						}
						$nodeText = str_replace("\n", $this->_unique . 'NL', $nodeText);
						$tmpNode = $node;
						$node = $node->nextSibling; // move loop to next node

						// alter node in place, then move into AUTOP
						$tmpNode->nodeValue = $nodeText;
						$autop->appendChild($tmpNode);

						continue;
					}
				}
				if ($isBlock || ! $node->nextSibling) {
					if ($isBlock) {
						if (in_array($node->nodeName, $this->_descendList, true)) {
							$elsToProcess[] = $node;
						}
					}
					$openP = true;
					$ltrimFirstTextNode = true;
				}
				if ($alterInline) {
					if (! $isBlock) {
						$tmpNode = $node;
						if ($isElement && false !== strpos($tmpNode->textContent, "\n")) {
							$inlinesToProcess[] = $tmpNode;
						}
						// advance before moving: appendChild() detaches the node
						// from $el, which would lose its nextSibling
						$node = $node->nextSibling;
						$autop->appendChild($tmpNode);
						continue;
					}
				}

				$node = $node->nextSibling;
			}
		}

		// handle inline nodes
		// no need to recurse, just queue up
		while ($el = array_shift($inlinesToProcess)) {
			$ignoreLeadingNewline = false;
			foreach ($el->childNodes as $node) {
				if ($node->nodeType === XML_ELEMENT_NODE) {
					// compare the tag name: an element's nodeValue is its text
					// content and can never identify a BR
					if ($node->nodeName === 'br') {
						$ignoreLeadingNewline = true;
					} else {
						$ignoreLeadingNewline = false;
						if (false !== strpos($node->textContent, "\n")) {
							$inlinesToProcess[] = $node;
						}
					}
					continue;
				} elseif ($node->nodeType === XML_TEXT_NODE) {
					$text = $node->nodeValue;
					// guard against an empty text node before indexing into it
					if ($ignoreLeadingNewline && $text !== '' && $text[0] === "\n") {
						$text = substr($text, 1);
						$ignoreLeadingNewline = false;
					}
					$node->nodeValue = str_replace("\n", $this->_unique . 'BR', $text);
				}
			}
		}
	}
}
$m
Definition: metadata.php:11
process($html)
Create wrapper P and BR elements in HTML depending on newlines.
Definition: ElggAutoP.php:82
$text
Definition: default.php:25
foreach($emails as $email) $html
Definition: exceptions.php:34
addParagraphs(DOMElement $el)
Add P and BR elements as necessary.
Definition: ElggAutoP.php:196
__construct()
Constructor.
Definition: ElggAutoP.php:63