mirror of
https://github.com/BookStackApp/BookStack.git
synced 2025-07-31 15:24:31 +03:00
Added page content parsing to up-rank header text in search
This adds parsing of page content so that terms found in headers receive a score boost in the search term index. Additionally, this merges title and content terms to slightly reduce the number of stored terms. Includes tests to cover these changes.
This commit is contained in:
@ -4,7 +4,10 @@ namespace BookStack\Entities\Tools;
|
||||
|
||||
use BookStack\Entities\EntityProvider;
|
||||
use BookStack\Entities\Models\Entity;
|
||||
use BookStack\Entities\Models\Page;
|
||||
use BookStack\Entities\Models\SearchTerm;
|
||||
use DOMDocument;
|
||||
use DOMNode;
|
||||
use Illuminate\Support\Collection;
|
||||
|
||||
class SearchIndex
|
||||
@ -64,7 +67,8 @@ class SearchIndex
|
||||
SearchTerm::query()->truncate();
|
||||
|
||||
foreach ($this->entityProvider->all() as $entityModel) {
|
||||
$selectFields = ['id', 'name', $entityModel->textField];
|
||||
$indexContentField = $entityModel instanceof Page ? 'html' : 'description';
|
||||
$selectFields = ['id', 'name', $indexContentField];
|
||||
$total = $entityModel->newQuery()->withTrashed()->count();
|
||||
$chunkSize = 250;
|
||||
$processed = 0;
|
||||
@ -93,11 +97,70 @@ class SearchIndex
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a scored term array from the given text.
|
||||
* Create a scored term array from the given text, where the keys are the terms
|
||||
* and the values are their scores.
|
||||
*
|
||||
* @returns array{term: string, score: float}
|
||||
* @return array<string, int>
|
||||
*/
|
||||
protected function generateTermArrayFromText(string $text, int $scoreAdjustment = 1): array
|
||||
protected function generateTermScoreMapFromText(string $text, int $scoreAdjustment = 1): array
{
    // Each term's score is its number of occurrences in the text multiplied
    // by the given adjustment; the resulting map is keyed by the term itself.
    // NOTE(review): callers pass products of $entity->searchFactor here; if
    // that factor can be fractional the int type will not retain it - confirm.
    $scoreMap = [];

    foreach ($this->textToTermCountMap($text) as $word => $occurrences) {
        $scoreMap[$word] = $occurrences * $scoreAdjustment;
    }

    return $scoreMap;
}
|
||||
|
||||
/**
 * Create a scored term map from the given HTML, where the keys are the terms
 * and the values are their scores.
 *
 * Only the top-level nodes of the given HTML are inspected. The text content of
 * each is split into terms via textToTermCountMap() and every occurrence scores
 * one point, multiplied by a boost factor when the node is a header (h1-h6).
 * Headers nested inside other elements do not receive a boost.
 *
 * @param string $html HTML fragment to index; it is wrapped in a body element before parsing.
 *
 * @return array<string, int|float> Scores can be fractional since h6 carries a 1.5 boost.
 */
protected function generateTermScoreMapFromHtml(string $html): array
{
    // Compare against '' rather than using empty(): empty('0') is true, which
    // would silently skip content consisting only of "0".
    if ($html === '') {
        return [];
    }

    $scoresByTerm = [];
    $elementScoreAdjustmentMap = [
        'h1' => 10,
        'h2' => 5,
        'h3' => 4,
        'h4' => 3,
        'h5' => 2,
        'h6' => 1.5,
    ];

    $html = '<body>' . $html . '</body>';

    // Collect parser warnings internally instead of emitting them, since
    // user-authored HTML is frequently not well formed.
    libxml_use_internal_errors(true);
    $doc = new DOMDocument();

    // Non-ASCII characters are converted to numeric entities so the parser
    // does not mis-detect the encoding. This replaces the 'HTML-ENTITIES'
    // target of mb_convert_encoding(), which is deprecated as of PHP 8.2.
    $loaded = $doc->loadHTML(mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'));

    // Discard the collected errors so the libxml buffer does not keep
    // growing when many documents are parsed in one process.
    libxml_clear_errors();

    // Expected structure is html > body; bail out quietly if parsing failed.
    $body = ($loaded && $doc->documentElement !== null)
        ? $doc->documentElement->childNodes->item(0)
        : null;
    if ($body === null) {
        return [];
    }

    /** @var DOMNode $child */
    foreach ($body->childNodes as $child) {
        // Anything that is not a listed header element scores at face value.
        $boost = $elementScoreAdjustmentMap[$child->nodeName] ?? 1;
        $termCounts = $this->textToTermCountMap(trim($child->textContent));

        foreach ($termCounts as $term => $count) {
            $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + ($count * $boost);
        }
    }

    return $scoresByTerm;
}
|
||||
|
||||
/**
|
||||
* For the given text, return an array where the keys are the unique term words
|
||||
* and the values are the frequency of that term.
|
||||
*
|
||||
* @return array<string, int>
|
||||
*/
|
||||
protected function textToTermCountMap(string $text): array
|
||||
{
|
||||
$tokenMap = []; // {TextToken => OccurrenceCount}
|
||||
$splitChars = " \n\t.,!?:;()[]{}<>`'\"";
|
||||
@ -111,34 +174,61 @@ class SearchIndex
|
||||
$token = strtok($splitChars);
|
||||
}
|
||||
|
||||
$terms = [];
|
||||
foreach ($tokenMap as $token => $count) {
|
||||
$terms[] = [
|
||||
'term' => $token,
|
||||
'score' => $count * $scoreAdjustment,
|
||||
];
|
||||
}
|
||||
|
||||
return $terms;
|
||||
return $tokenMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* For the given entity, generate an array of term data details.
|
||||
* Is the raw term data, not instances of SearchTerm models.
|
||||
*
|
||||
* @returns array{term: string, score: float}[]
|
||||
* @return array{term: string, score: float, entity_id: int, entity_type: string}[]
|
||||
*/
|
||||
protected function entityToTermDataArray(Entity $entity): array
|
||||
{
|
||||
$nameTerms = $this->generateTermArrayFromText($entity->name, 40 * $entity->searchFactor);
|
||||
$bodyTerms = $this->generateTermArrayFromText($entity->getText(), 1 * $entity->searchFactor);
|
||||
$termData = array_merge($nameTerms, $bodyTerms);
|
||||
$nameTermsMap = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor);
|
||||
|
||||
foreach ($termData as $index => $term) {
|
||||
$termData[$index]['entity_type'] = $entity->getMorphClass();
|
||||
$termData[$index]['entity_id'] = $entity->id;
|
||||
if ($entity instanceof Page) {
|
||||
$bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html);
|
||||
} else {
|
||||
$bodyTermsMap = $this->generateTermScoreMapFromText($entity->description, $entity->searchFactor);
|
||||
}
|
||||
|
||||
return $termData;
|
||||
$mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap);
|
||||
|
||||
$dataArray = [];
|
||||
$entityId = $entity->id;
|
||||
$entityType = $entity->getMorphClass();
|
||||
foreach ($mergedScoreMap as $term => $score) {
|
||||
$dataArray[] = [
|
||||
'term' => $term,
|
||||
'score' => $score,
|
||||
'entity_type' => $entityType,
|
||||
'entity_id' => $entityId,
|
||||
];
|
||||
}
|
||||
|
||||
return $dataArray;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Combine any number of term score maps into a single map.
 * Where the same term appears in more than one map, its scores are summed.
 * Terms keep the order in which they are first encountered.
 *
 * @param array<string, int>[] ...$scoreMaps
 *
 * @return array<string, int>
 */
protected function mergeTermScoreMaps(...$scoreMaps): array
{
    $combined = [];

    foreach ($scoreMaps as $map) {
        foreach ($map as $word => $value) {
            // Start unseen terms from zero so every term is accumulated the same way.
            if (!isset($combined[$word])) {
                $combined[$word] = 0;
            }

            $combined[$word] += $value;
        }
    }

    return $combined;
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user