mirror of
https://github.com/BookStackApp/BookStack.git
synced 2025-07-28 17:02:04 +03:00
HTML: Aligned and standardised DOMDocument usage
Adds a thin wrapper for DOMDocument to simplify and align usage within all areas of BookStack. Also means we move away from old deprecated mb_convert_encoding usage. Closes #4638
This commit is contained in:
@ -8,9 +8,8 @@ use BookStack\Entities\Models\Page;
|
||||
use BookStack\Entities\Tools\Markdown\HtmlToMarkdown;
|
||||
use BookStack\Uploads\ImageService;
|
||||
use BookStack\Util\CspService;
|
||||
use DOMDocument;
|
||||
use BookStack\Util\HtmlDocument;
|
||||
use DOMElement;
|
||||
use DOMXPath;
|
||||
use Exception;
|
||||
use Throwable;
|
||||
|
||||
@ -151,45 +150,36 @@ class ExportFormatter
|
||||
protected function htmlToPdf(string $html): string
|
||||
{
|
||||
$html = $this->containHtml($html);
|
||||
$html = $this->replaceIframesWithLinks($html);
|
||||
$html = $this->openDetailElements($html);
|
||||
$doc = new HtmlDocument();
|
||||
$doc->loadCompleteHtml($html);
|
||||
|
||||
return $this->pdfGenerator->fromHtml($html);
|
||||
$this->replaceIframesWithLinks($doc);
|
||||
$this->openDetailElements($doc);
|
||||
$cleanedHtml = $doc->getHtml();
|
||||
|
||||
return $this->pdfGenerator->fromHtml($cleanedHtml);
|
||||
}
|
||||
|
||||
/**
|
||||
* Within the given HTML content, Open any detail blocks.
|
||||
*/
|
||||
protected function openDetailElements(string $html): string
|
||||
protected function openDetailElements(HtmlDocument $doc): void
|
||||
{
|
||||
libxml_use_internal_errors(true);
|
||||
|
||||
$doc = new DOMDocument();
|
||||
$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
|
||||
$xPath = new DOMXPath($doc);
|
||||
|
||||
$details = $xPath->query('//details');
|
||||
$details = $doc->queryXPath('//details');
|
||||
/** @var DOMElement $detail */
|
||||
foreach ($details as $detail) {
|
||||
$detail->setAttribute('open', 'open');
|
||||
}
|
||||
|
||||
return $doc->saveHTML();
|
||||
}
|
||||
|
||||
/**
|
||||
* Within the given HTML content, replace any iframe elements
|
||||
* Within the given HTML document, replace any iframe elements
|
||||
* with anchor links within paragraph blocks.
|
||||
*/
|
||||
protected function replaceIframesWithLinks(string $html): string
|
||||
protected function replaceIframesWithLinks(HtmlDocument $doc): void
|
||||
{
|
||||
libxml_use_internal_errors(true);
|
||||
$iframes = $doc->queryXPath('//iframe');
|
||||
|
||||
$doc = new DOMDocument();
|
||||
$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
|
||||
$xPath = new DOMXPath($doc);
|
||||
|
||||
$iframes = $xPath->query('//iframe');
|
||||
/** @var DOMElement $iframe */
|
||||
foreach ($iframes as $iframe) {
|
||||
$link = $iframe->getAttribute('src');
|
||||
@ -203,8 +193,6 @@ class ExportFormatter
|
||||
$paragraph->appendChild($anchor);
|
||||
$iframe->parentNode->replaceChild($paragraph, $iframe);
|
||||
}
|
||||
|
||||
return $doc->saveHTML();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -10,11 +10,10 @@ use BookStack\Theming\ThemeEvents;
|
||||
use BookStack\Uploads\ImageRepo;
|
||||
use BookStack\Uploads\ImageService;
|
||||
use BookStack\Util\HtmlContentFilter;
|
||||
use DOMDocument;
|
||||
use BookStack\Util\HtmlDocument;
|
||||
use DOMElement;
|
||||
use DOMNode;
|
||||
use DOMNodeList;
|
||||
use DOMXPath;
|
||||
use Illuminate\Support\Str;
|
||||
|
||||
class PageContent
|
||||
@ -56,27 +55,17 @@ class PageContent
|
||||
return $htmlText;
|
||||
}
|
||||
|
||||
$doc = $this->loadDocumentFromHtml($htmlText);
|
||||
$container = $doc->documentElement;
|
||||
$body = $container->childNodes->item(0);
|
||||
$childNodes = $body->childNodes;
|
||||
$xPath = new DOMXPath($doc);
|
||||
$doc = new HtmlDocument($htmlText);
|
||||
|
||||
// Get all img elements with image data blobs
|
||||
$imageNodes = $xPath->query('//img[contains(@src, \'data:image\')]');
|
||||
$imageNodes = $doc->queryXPath('//img[contains(@src, \'data:image\')]');
|
||||
foreach ($imageNodes as $imageNode) {
|
||||
$imageSrc = $imageNode->getAttribute('src');
|
||||
$newUrl = $this->base64ImageUriToUploadedImageUrl($imageSrc);
|
||||
$imageNode->setAttribute('src', $newUrl);
|
||||
}
|
||||
|
||||
// Generate inner html as a string
|
||||
$html = '';
|
||||
foreach ($childNodes as $childNode) {
|
||||
$html .= $doc->saveHTML($childNode);
|
||||
}
|
||||
|
||||
return $html;
|
||||
return $doc->getBodyInnerHtml();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -172,27 +161,18 @@ class PageContent
|
||||
return $htmlText;
|
||||
}
|
||||
|
||||
$doc = $this->loadDocumentFromHtml($htmlText);
|
||||
$container = $doc->documentElement;
|
||||
$body = $container->childNodes->item(0);
|
||||
$childNodes = $body->childNodes;
|
||||
$xPath = new DOMXPath($doc);
|
||||
$doc = new HtmlDocument($htmlText);
|
||||
|
||||
// Map to hold used ID references
|
||||
$idMap = [];
|
||||
// Map to hold changing ID references
|
||||
$changeMap = [];
|
||||
|
||||
$this->updateIdsRecursively($body, 0, $idMap, $changeMap);
|
||||
$this->updateLinks($xPath, $changeMap);
|
||||
$this->updateIdsRecursively($doc->getBody(), 0, $idMap, $changeMap);
|
||||
$this->updateLinks($doc, $changeMap);
|
||||
|
||||
// Generate inner html as a string
|
||||
$html = '';
|
||||
foreach ($childNodes as $childNode) {
|
||||
$html .= $doc->saveHTML($childNode);
|
||||
}
|
||||
|
||||
// Perform required string-level tweaks
|
||||
// Generate inner html as a string & perform required string-level tweaks
|
||||
$html = $doc->getBodyInnerHtml();
|
||||
$html = str_replace(' ', ' ', $html);
|
||||
|
||||
return $html;
|
||||
@ -225,13 +205,13 @@ class PageContent
|
||||
* Update all the links in the given xpath to apply the required changes within the
|
||||
* given $changeMap array.
|
||||
*/
|
||||
protected function updateLinks(DOMXPath $xpath, array $changeMap): void
|
||||
protected function updateLinks(HtmlDocument $doc, array $changeMap): void
|
||||
{
|
||||
if (empty($changeMap)) {
|
||||
return;
|
||||
}
|
||||
|
||||
$links = $xpath->query('//body//*//*[@href]');
|
||||
$links = $doc->queryXPath('//body//*//*[@href]');
|
||||
/** @var DOMElement $domElem */
|
||||
foreach ($links as $domElem) {
|
||||
$href = ltrim($domElem->getAttribute('href'), '#');
|
||||
@ -321,11 +301,10 @@ class PageContent
|
||||
return [];
|
||||
}
|
||||
|
||||
$doc = $this->loadDocumentFromHtml($htmlContent);
|
||||
$xPath = new DOMXPath($doc);
|
||||
$headers = $xPath->query('//h1|//h2|//h3|//h4|//h5|//h6');
|
||||
$doc = new HtmlDocument($htmlContent);
|
||||
$headers = $doc->queryXPath('//h1|//h2|//h3|//h4|//h5|//h6');
|
||||
|
||||
return $headers ? $this->headerNodesToLevelList($headers) : [];
|
||||
return $headers->count() === 0 ? [] : $this->headerNodesToLevelList($headers);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -420,7 +399,7 @@ class PageContent
|
||||
protected function fetchSectionOfPage(Page $page, string $sectionId): string
|
||||
{
|
||||
$topLevelTags = ['table', 'ul', 'ol', 'pre'];
|
||||
$doc = $this->loadDocumentFromHtml($page->html);
|
||||
$doc = new HtmlDocument($page->html);
|
||||
|
||||
// Search included content for the id given and blank out if not exists.
|
||||
$matchingElem = $doc->getElementById($sectionId);
|
||||
@ -430,30 +409,11 @@ class PageContent
|
||||
|
||||
// Otherwise replace the content with the found content
|
||||
// Checks if the top-level wrapper should be included by matching on tag types
|
||||
$innerContent = '';
|
||||
$isTopLevel = in_array(strtolower($matchingElem->nodeName), $topLevelTags);
|
||||
if ($isTopLevel) {
|
||||
$innerContent .= $doc->saveHTML($matchingElem);
|
||||
} else {
|
||||
foreach ($matchingElem->childNodes as $childNode) {
|
||||
$innerContent .= $doc->saveHTML($childNode);
|
||||
}
|
||||
return $doc->getNodeOuterHtml($matchingElem);
|
||||
}
|
||||
libxml_clear_errors();
|
||||
|
||||
return $innerContent;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and load a DOMDocument from the given html content.
|
||||
*/
|
||||
protected function loadDocumentFromHtml(string $html): DOMDocument
|
||||
{
|
||||
libxml_use_internal_errors(true);
|
||||
$doc = new DOMDocument();
|
||||
$html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';
|
||||
$doc->loadHTML($html);
|
||||
|
||||
return $doc;
|
||||
return $doc->getNodeInnerHtml($matchingElem);
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user