1
0
mirror of https://github.com/BookStackApp/BookStack.git synced 2025-07-31 15:24:31 +03:00

Added page content parsing to up-rank header text in search

This adds parsing of page content so that headers apply a boost to
scores in the search term index.
Additionally, this merges title and content terms to slightly reduce the
number of stored terms.
Includes tests to cover these changes.
This commit is contained in:
Dan Brown
2021-11-12 13:47:23 +00:00
parent 820be162f5
commit f28daa01d9
8 changed files with 158 additions and 38 deletions

View File

@ -4,7 +4,10 @@ namespace BookStack\Entities\Tools;
use BookStack\Entities\EntityProvider;
use BookStack\Entities\Models\Entity;
use BookStack\Entities\Models\Page;
use BookStack\Entities\Models\SearchTerm;
use DOMDocument;
use DOMNode;
use Illuminate\Support\Collection;
class SearchIndex
@ -64,7 +67,8 @@ class SearchIndex
SearchTerm::query()->truncate();
foreach ($this->entityProvider->all() as $entityModel) {
$selectFields = ['id', 'name', $entityModel->textField];
$indexContentField = $entityModel instanceof Page ? 'html' : 'description';
$selectFields = ['id', 'name', $indexContentField];
$total = $entityModel->newQuery()->withTrashed()->count();
$chunkSize = 250;
$processed = 0;
@ -93,11 +97,70 @@ class SearchIndex
}
/**
* Create a scored term array from the given text.
* Create a scored term array from the given text, where the keys are the terms
* and the values are their scores.
*
* @returns array{term: string, score: float}
* @returns array<string, int>
*/
protected function generateTermArrayFromText(string $text, int $scoreAdjustment = 1): array
protected function generateTermScoreMapFromText(string $text, float $scoreAdjustment = 1): array
{
    // The adjustment is accepted as a float rather than an int: callers pass
    // values derived from $entity->searchFactor, and an int parameter would
    // truncate any fractional factor (or raise a deprecation on coercion)
    // before it is ever applied. Integer arguments are still accepted.
    // NOTE(review): scores are therefore numeric (int|float), not strictly int.
    $scoreMap = [];

    // Each term's score is its occurrence count multiplied by the adjustment.
    foreach ($this->textToTermCountMap($text) as $term => $count) {
        $scoreMap[$term] = $count * $scoreAdjustment;
    }

    return $scoreMap;
}
/**
 * Create a scored term array from the given HTML, where the keys are the terms
 * and the values are their scores.
 *
 * Only the top-level nodes of the given content are inspected. The terms found
 * in the text of each node are weighted by that node's name, so that text in
 * heading elements scores higher; any node name not listed uses a weight of 1.
 * Because the h6 weight is fractional, the resulting scores may be floats.
 *
 * @returns array<string, int|float>
 */
protected function generateTermScoreMapFromHtml(string $html): array
{
    // Compare against the empty string directly; empty() would also discard
    // content that consists solely of "0".
    if ($html === '') {
        return [];
    }

    $scoresByTerm = [];
    $elementScoreAdjustmentMap = [
        'h1' => 10,
        'h2' => 5,
        'h3' => 4,
        'h4' => 3,
        'h5' => 2,
        'h6' => 1.5,
    ];

    // Wrap in a body so the top-level content nodes have a known parent.
    $html = '<body>' . $html . '</body>';

    // Collect libxml parse warnings internally while loading, then clear the
    // collected errors and restore the caller's setting. Without this the
    // error buffer keeps growing across calls and the global libxml state is
    // left altered.
    $previousErrorSetting = libxml_use_internal_errors(true);
    try {
        $doc = new DOMDocument();
        // Non-ASCII characters are converted to numeric entities so the parser
        // does not need a declared charset. This replaces the deprecated
        // 'HTML-ENTITIES' pseudo-encoding of mb_convert_encoding().
        $doc->loadHTML(mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'));
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorSetting);
    }

    // Guard against a document that could not be built into html > body.
    $root = $doc->documentElement;
    $body = $root !== null ? $root->childNodes->item(0) : null;
    if ($body === null) {
        return [];
    }

    /** @var DOMNode $child */
    foreach ($body->childNodes as $child) {
        $adjustment = $elementScoreAdjustmentMap[$child->nodeName] ?? 1;
        $termCounts = $this->textToTermCountMap(trim($child->textContent));
        foreach ($termCounts as $term => $count) {
            $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + ($count * $adjustment);
        }
    }

    return $scoresByTerm;
}
/**
* For the given text, return an array where the keys are the unique term words
* and the values are the frequency of that term.
*
* @returns array<string, int>
*/
protected function textToTermCountMap(string $text): array
{
$tokenMap = []; // {TextToken => OccurrenceCount}
$splitChars = " \n\t.,!?:;()[]{}<>`'\"";
@ -111,34 +174,61 @@ class SearchIndex
$token = strtok($splitChars);
}
$terms = [];
foreach ($tokenMap as $token => $count) {
$terms[] = [
'term' => $token,
'score' => $count * $scoreAdjustment,
];
}
return $terms;
return $tokenMap;
}
/**
* For the given entity, Generate an array of term data details.
* Is the raw term data, not instances of SearchTerm models.
*
* @returns array{term: string, score: float}[]
* @returns array{term: string, score: float, entity_id: int, entity_type: string}[]
*/
protected function entityToTermDataArray(Entity $entity): array
{
// NOTE(review): the three statements below call generateTermArrayFromText() and build
// $termData; they look like the previous implementation left over from a diff view,
// sitting alongside the map-based statements that follow — confirm which lines are live.
$nameTerms = $this->generateTermArrayFromText($entity->name, 40 * $entity->searchFactor);
$bodyTerms = $this->generateTermArrayFromText($entity->getText(), 1 * $entity->searchFactor);
$termData = array_merge($nameTerms, $bodyTerms);
// Terms from the entity name are scored with an adjustment of 40 times the entity's search factor.
$nameTermsMap = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor);
// NOTE(review): this loop over $termData (and the `return $termData;` further down) also
// appears to belong to the previous implementation — confirm before relying on it.
foreach ($termData as $index => $term) {
$termData[$index]['entity_type'] = $entity->getMorphClass();
$termData[$index]['entity_id'] = $entity->id;
// Pages are scored from their HTML via generateTermScoreMapFromHtml(); no search factor
// is passed on that path. Other entities are scored from their description text, with
// the entity's search factor as the adjustment.
if ($entity instanceof Page) {
$bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html);
} else {
$bodyTermsMap = $this->generateTermScoreMapFromText($entity->description, $entity->searchFactor);
}
return $termData;
// Combine the name and body maps so a term present in both gets a single summed score.
$mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap);
$dataArray = [];
$entityId = $entity->id;
$entityType = $entity->getMorphClass();
// Build one raw data row per term, tagged with the owning entity's type and id.
foreach ($mergedScoreMap as $term => $score) {
$dataArray[] = [
'term' => $term,
'score' => $score,
'entity_type' => $entityType,
'entity_id' => $entityId,
];
}
return $dataArray;
}
/**
 * Combine any number of term score maps into a single map.
 * Where a term occurs in more than one of the given maps,
 * its scores are added together.
 *
 * @param array<string, int>[] ...$scoreMaps
 *
 * @returns array<string, int>
 */
protected function mergeTermScoreMaps(...$scoreMaps): array
{
    $totals = [];

    foreach ($scoreMaps as $termScores) {
        foreach ($termScores as $termKey => $termScore) {
            // Start a running total the first time a term is seen.
            if (!isset($totals[$termKey])) {
                $totals[$termKey] = 0;
            }

            $totals[$termKey] += $termScore;
        }
    }

    return $totals;
}
}