1
0
mirror of https://github.com/BookStackApp/BookStack.git synced 2025-08-07 23:03:00 +03:00

HTML: Aligned and standardised DOMDocument usage

Adds a thin wrapper for DOMDocument to simplify and align usage within
all areas of BookStack.
Also means we move away from old depreacted mb_convert_encoding usage.

Closes #4638
This commit is contained in:
Dan Brown
2023-11-14 15:46:32 +00:00
parent 3a6f50e668
commit db7b11fe93
8 changed files with 214 additions and 163 deletions

View File

@@ -3,10 +3,8 @@
namespace BookStack\Util;
use DOMAttr;
use DOMDocument;
use DOMElement;
use DOMNodeList;
use DOMXPath;
class HtmlContentFilter
{
@@ -19,54 +17,44 @@ class HtmlContentFilter
return $html;
}
$html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
libxml_use_internal_errors(true);
$doc = new DOMDocument();
$doc->loadHTML($html);
$xPath = new DOMXPath($doc);
$doc = new HtmlDocument($html);
// Remove standard script tags
$scriptElems = $xPath->query('//script');
$scriptElems = $doc->queryXPath('//script');
static::removeNodes($scriptElems);
// Remove clickable links to JavaScript URI
$badLinks = $xPath->query('//*[' . static::xpathContains('@href', 'javascript:') . ']');
$badLinks = $doc->queryXPath('//*[' . static::xpathContains('@href', 'javascript:') . ']');
static::removeNodes($badLinks);
// Remove forms with calls to JavaScript URI
$badForms = $xPath->query('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']');
$badForms = $doc->queryXPath('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']');
static::removeNodes($badForms);
// Remove meta tag to prevent external redirects
$metaTags = $xPath->query('//meta[' . static::xpathContains('@content', 'url') . ']');
$metaTags = $doc->queryXPath('//meta[' . static::xpathContains('@content', 'url') . ']');
static::removeNodes($metaTags);
// Remove data or JavaScript iFrames
$badIframes = $xPath->query('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]');
$badIframes = $doc->queryXPath('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]');
static::removeNodes($badIframes);
// Remove attributes, within svg children, hiding JavaScript or data uris.
// A bunch of svg element and attribute combinations expose xss possibilities.
// For example, SVG animate tag can exploit javascript in values.
$badValuesAttrs = $xPath->query('//svg//@*[' . static::xpathContains('.', 'data:') . '] | //svg//@*[' . static::xpathContains('.', 'javascript:') . ']');
$badValuesAttrs = $doc->queryXPath('//svg//@*[' . static::xpathContains('.', 'data:') . '] | //svg//@*[' . static::xpathContains('.', 'javascript:') . ']');
static::removeAttributes($badValuesAttrs);
// Remove elements with a xlink:href attribute
// Used in SVG but deprecated anyway, so we'll be a bit more heavy-handed here.
$xlinkHrefAttributes = $xPath->query('//@*[contains(name(), \'xlink:href\')]');
$xlinkHrefAttributes = $doc->queryXPath('//@*[contains(name(), \'xlink:href\')]');
static::removeAttributes($xlinkHrefAttributes);
// Remove 'on*' attributes
$onAttributes = $xPath->query('//@*[starts-with(name(), \'on\')]');
$onAttributes = $doc->queryXPath('//@*[starts-with(name(), \'on\')]');
static::removeAttributes($onAttributes);
$html = '';
$topElems = $doc->documentElement->childNodes->item(0)->childNodes;
foreach ($topElems as $child) {
$html .= $doc->saveHTML($child);
}
return $html;
return $doc->getBodyInnerHtml();
}
/**