mirror of
https://github.com/BookStackApp/BookStack.git
synced 2025-08-07 23:03:00 +03:00
HTML: Aligned and standardised DOMDocument usage
Adds a thin wrapper for DOMDocument to simplify and align usage within all areas of BookStack. This also means we move away from the old, deprecated mb_convert_encoding usage. Closes #4638
This commit is contained in:
@@ -3,10 +3,8 @@
|
||||
namespace BookStack\Util;
|
||||
|
||||
use DOMAttr;
|
||||
use DOMDocument;
|
||||
use DOMElement;
|
||||
use DOMNodeList;
|
||||
use DOMXPath;
|
||||
|
||||
class HtmlContentFilter
|
||||
{
|
||||
@@ -19,54 +17,44 @@ class HtmlContentFilter
|
||||
return $html;
|
||||
}
|
||||
|
||||
$html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
|
||||
libxml_use_internal_errors(true);
|
||||
$doc = new DOMDocument();
|
||||
$doc->loadHTML($html);
|
||||
$xPath = new DOMXPath($doc);
|
||||
$doc = new HtmlDocument($html);
|
||||
|
||||
// Remove standard script tags
|
||||
$scriptElems = $xPath->query('//script');
|
||||
$scriptElems = $doc->queryXPath('//script');
|
||||
static::removeNodes($scriptElems);
|
||||
|
||||
// Remove clickable links to JavaScript URI
|
||||
$badLinks = $xPath->query('//*[' . static::xpathContains('@href', 'javascript:') . ']');
|
||||
$badLinks = $doc->queryXPath('//*[' . static::xpathContains('@href', 'javascript:') . ']');
|
||||
static::removeNodes($badLinks);
|
||||
|
||||
// Remove forms with calls to JavaScript URI
|
||||
$badForms = $xPath->query('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']');
|
||||
$badForms = $doc->queryXPath('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']');
|
||||
static::removeNodes($badForms);
|
||||
|
||||
// Remove meta tag to prevent external redirects
|
||||
$metaTags = $xPath->query('//meta[' . static::xpathContains('@content', 'url') . ']');
|
||||
$metaTags = $doc->queryXPath('//meta[' . static::xpathContains('@content', 'url') . ']');
|
||||
static::removeNodes($metaTags);
|
||||
|
||||
// Remove data or JavaScript iFrames
|
||||
$badIframes = $xPath->query('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]');
|
||||
$badIframes = $doc->queryXPath('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]');
|
||||
static::removeNodes($badIframes);
|
||||
|
||||
// Remove attributes, within svg children, hiding JavaScript or data uris.
|
||||
// A bunch of svg element and attribute combinations expose xss possibilities.
|
||||
// For example, SVG animate tag can exploit javascript in values.
|
||||
$badValuesAttrs = $xPath->query('//svg//@*[' . static::xpathContains('.', 'data:') . '] | //svg//@*[' . static::xpathContains('.', 'javascript:') . ']');
|
||||
$badValuesAttrs = $doc->queryXPath('//svg//@*[' . static::xpathContains('.', 'data:') . '] | //svg//@*[' . static::xpathContains('.', 'javascript:') . ']');
|
||||
static::removeAttributes($badValuesAttrs);
|
||||
|
||||
// Remove elements with a xlink:href attribute
|
||||
// Used in SVG but deprecated anyway, so we'll be a bit more heavy-handed here.
|
||||
$xlinkHrefAttributes = $xPath->query('//@*[contains(name(), \'xlink:href\')]');
|
||||
$xlinkHrefAttributes = $doc->queryXPath('//@*[contains(name(), \'xlink:href\')]');
|
||||
static::removeAttributes($xlinkHrefAttributes);
|
||||
|
||||
// Remove 'on*' attributes
|
||||
$onAttributes = $xPath->query('//@*[starts-with(name(), \'on\')]');
|
||||
$onAttributes = $doc->queryXPath('//@*[starts-with(name(), \'on\')]');
|
||||
static::removeAttributes($onAttributes);
|
||||
|
||||
$html = '';
|
||||
$topElems = $doc->documentElement->childNodes->item(0)->childNodes;
|
||||
foreach ($topElems as $child) {
|
||||
$html .= $doc->saveHTML($child);
|
||||
}
|
||||
|
||||
return $html;
|
||||
return $doc->getBodyInnerHtml();
|
||||
}
|
||||
|
||||
/**
|
||||
|
152
app/Util/HtmlDocument.php
Normal file
152
app/Util/HtmlDocument.php
Normal file
@@ -0,0 +1,152 @@
|
||||
<?php
|
||||
|
||||
namespace BookStack\Util;
|
||||
|
||||
use DOMDocument;
|
||||
use DOMElement;
|
||||
use DOMNode;
|
||||
use DOMNodeList;
|
||||
use DOMXPath;
|
||||
|
||||
/**
|
||||
* HtmlDocument is a thin wrapper around DOMDocument built
|
||||
* specifically for loading, querying and generating HTML content.
|
||||
*/
|
||||
class HtmlDocument
{
    /**
     * Underlying DOM document that every operation is delegated to.
     */
    protected DOMDocument $document;

    /**
     * Lazily created XPath helper for the current document.
     * Reset to null whenever new HTML is loaded so that a fresh
     * instance is built on the next query.
     */
    protected ?DOMXPath $xpath = null;

    /**
     * libxml option flags passed to each loadHTML() call.
     */
    protected int $loadOptions;

    /**
     * Create a new document, optionally loading partial (body-level) HTML.
     *
     * Note: this switches libxml to internal error handling for the
     * process (libxml_use_internal_errors(true)), so that parse problems
     * in the provided HTML do not surface as PHP warnings.
     *
     * @param string $partialHtml Body-level HTML to load; nothing is loaded when empty.
     * @param int    $loadOptions libxml option flags used for all loads on this instance.
     */
    public function __construct(string $partialHtml = '', int $loadOptions = 0)
    {
        libxml_use_internal_errors(true);
        $this->document = new DOMDocument();
        $this->loadOptions = $loadOptions;

        // Compare against the empty string explicitly: a plain truthiness
        // check would treat the valid content "0" as empty and skip loading.
        if ($partialHtml !== '') {
            $this->loadPartialHtml($partialHtml);
        }
    }

    /**
     * Load some HTML content that's part of a document (e.g. body content)
     * into the current document.
     */
    public function loadPartialHtml(string $html): void
    {
        // The XML encoding prefix makes the parser treat the input as UTF-8,
        // and the body wrapper provides a known container for the content.
        $html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
        $this->document->loadHTML($html, $this->loadOptions);

        // Drop any cached XPath instance so the next query builds a new one.
        $this->xpath = null;
    }

    /**
     * Load a complete page of HTML content into the document.
     */
    public function loadCompleteHtml(string $html): void
    {
        // The XML encoding prefix makes the parser treat the input as UTF-8.
        $html = '<?xml encoding="utf-8" ?>' . $html;
        $this->document->loadHTML($html, $this->loadOptions);

        // Drop any cached XPath instance so the next query builds a new one.
        $this->xpath = null;
    }

    /**
     * Start an XPath query on the current document.
     *
     * @throws \InvalidArgumentException If the expression fails to execute.
     */
    public function queryXPath(string $expression): DOMNodeList
    {
        if (is_null($this->xpath)) {
            $this->xpath = new DOMXPath($this->document);
        }

        $result = $this->xpath->query($expression);
        if ($result === false) {
            throw new \InvalidArgumentException("XPath query for expression [$expression] failed to execute");
        }

        return $result;
    }

    /**
     * Create a new DOMElement instance within the document.
     * The element is created but not attached to the document tree.
     *
     * @throws \InvalidArgumentException If the element could not be created.
     */
    public function createElement(string $localName, string $value = ''): DOMElement
    {
        $element = $this->document->createElement($localName, $value);

        if ($element === false) {
            throw new \InvalidArgumentException("Failed to create element of name [$localName] and value [$value]");
        }

        return $element;
    }

    /**
     * Get an element within the document of the given ID.
     * Returns null when no matching element is found.
     */
    public function getElementById(string $elementId): ?DOMElement
    {
        return $this->document->getElementById($elementId);
    }

    /**
     * Get the DOMNode that represents the HTML body.
     *
     * The first "body" element of the document is used. If the document
     * has no body element (for example, when nothing has been loaded yet)
     * the lookup yields null, which violates the declared return type.
     */
    public function getBody(): DOMNode
    {
        return $this->document->getElementsByTagName('body')[0];
    }

    /**
     * Get the nodes that are a direct child of the body.
     * This is usually all the content nodes if loaded partially.
     */
    public function getBodyChildren(): DOMNodeList
    {
        return $this->getBody()->childNodes;
    }

    /**
     * Get the inner HTML content of the body.
     * This is usually all the content if loaded partially.
     */
    public function getBodyInnerHtml(): string
    {
        return $this->getNodeInnerHtml($this->getBody());
    }

    /**
     * Get the HTML content of the whole document.
     */
    public function getHtml(): string
    {
        return $this->document->saveHTML();
    }

    /**
     * Get the inner HTML for the given node.
     * Built by serialising each direct child of the node in order.
     */
    public function getNodeInnerHtml(DOMNode $node): string
    {
        $html = '';

        foreach ($node->childNodes as $childNode) {
            $html .= $this->document->saveHTML($childNode);
        }

        return $html;
    }

    /**
     * Get the outer HTML for the given node.
     */
    public function getNodeOuterHtml(DOMNode $node): string
    {
        return $this->document->saveHTML($node);
    }
}
|
@@ -2,14 +2,12 @@
|
||||
|
||||
namespace BookStack\Util;
|
||||
|
||||
use DOMDocument;
|
||||
use DOMElement;
|
||||
use DOMNodeList;
|
||||
use DOMXPath;
|
||||
|
||||
class HtmlNonceApplicator
|
||||
{
|
||||
protected static $placeholder = '[CSP_NONCE_VALUE]';
|
||||
protected static string $placeholder = '[CSP_NONCE_VALUE]';
|
||||
|
||||
/**
|
||||
* Prepare the given HTML content with nonce attributes including a placeholder
|
||||
@@ -21,28 +19,20 @@ class HtmlNonceApplicator
|
||||
return $html;
|
||||
}
|
||||
|
||||
$html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
|
||||
libxml_use_internal_errors(true);
|
||||
$doc = new DOMDocument();
|
||||
$doc->loadHTML($html, LIBXML_SCHEMA_CREATE);
|
||||
$xPath = new DOMXPath($doc);
|
||||
// LIBXML_SCHEMA_CREATE was found to be required here otherwise
|
||||
// the PHP DOMDocument handling will attempt to format/close
|
||||
// HTML tags within scripts and therefore change JS content.
|
||||
$doc = new HtmlDocument($html, LIBXML_SCHEMA_CREATE);
|
||||
|
||||
// Apply to scripts
|
||||
$scriptElems = $xPath->query('//script');
|
||||
$scriptElems = $doc->queryXPath('//script');
|
||||
static::addNonceAttributes($scriptElems, static::$placeholder);
|
||||
|
||||
// Apply to styles
|
||||
$styleElems = $xPath->query('//style');
|
||||
$styleElems = $doc->queryXPath('//style');
|
||||
static::addNonceAttributes($styleElems, static::$placeholder);
|
||||
|
||||
$returnHtml = '';
|
||||
$topElems = $doc->documentElement->childNodes->item(0)->childNodes;
|
||||
foreach ($topElems as $child) {
|
||||
$content = $doc->saveHTML($child);
|
||||
$returnHtml .= $content;
|
||||
}
|
||||
|
||||
return $returnHtml;
|
||||
return $doc->getBodyInnerHtml();
|
||||
}
|
||||
|
||||
/**
|
||||
|
Reference in New Issue
Block a user