mirror of
				https://github.com/BookStackApp/BookStack.git
				synced 2025-10-26 17:31:27 +03:00 
			
		
		
		
	Added page content parsing to up-rank header text in search
This adds parsing of page content so that terms found within headers receive a boost to their scores in the search term index. Additionally, this merges title and content terms to slightly reduce the number of stored terms. Includes tests to cover these changes.
This commit is contained in:
		| @@ -24,7 +24,7 @@ class Book extends Entity implements HasCoverImage | |||||||
| { | { | ||||||
|     use HasFactory; |     use HasFactory; | ||||||
|  |  | ||||||
|     public $searchFactor = 2; |     public $searchFactor = 1.5; | ||||||
|  |  | ||||||
|     protected $fillable = ['name', 'description']; |     protected $fillable = ['name', 'description']; | ||||||
|     protected $hidden = ['restricted', 'pivot', 'image_id', 'deleted_at']; |     protected $hidden = ['restricted', 'pivot', 'image_id', 'deleted_at']; | ||||||
|   | |||||||
| @@ -13,7 +13,7 @@ class Bookshelf extends Entity implements HasCoverImage | |||||||
|  |  | ||||||
|     protected $table = 'bookshelves'; |     protected $table = 'bookshelves'; | ||||||
|  |  | ||||||
|     public $searchFactor = 3; |     public $searchFactor = 1.5; | ||||||
|  |  | ||||||
|     protected $fillable = ['name', 'description', 'image_id']; |     protected $fillable = ['name', 'description', 'image_id']; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -16,7 +16,7 @@ class Chapter extends BookChild | |||||||
| { | { | ||||||
|     use HasFactory; |     use HasFactory; | ||||||
|  |  | ||||||
|     public $searchFactor = 1.3; |     public $searchFactor = 1.5; | ||||||
|  |  | ||||||
|     protected $fillable = ['name', 'description', 'priority', 'book_id']; |     protected $fillable = ['name', 'description', 'priority', 'book_id']; | ||||||
|     protected $hidden = ['restricted', 'pivot', 'deleted_at']; |     protected $hidden = ['restricted', 'pivot', 'deleted_at']; | ||||||
|   | |||||||
| @@ -238,20 +238,12 @@ abstract class Entity extends Model implements Sluggable, Favouritable, Viewable | |||||||
|         return mb_substr($this->name, 0, $length - 3) . '...'; |         return mb_substr($this->name, 0, $length - 3) . '...'; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /** |  | ||||||
|      * Get the body text of this entity. |  | ||||||
|      */ |  | ||||||
|     public function getText(): string |  | ||||||
|     { |  | ||||||
|         return $this->{$this->textField} ?? ''; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     /** |     /** | ||||||
|      * Get an excerpt of this entity's descriptive content to the specified length. |      * Get an excerpt of this entity's descriptive content to the specified length. | ||||||
|      */ |      */ | ||||||
|     public function getExcerpt(int $length = 100): string |     public function getExcerpt(int $length = 100): string | ||||||
|     { |     { | ||||||
|         $text = $this->getText(); |         $text = $this->{$this->textField} ?? ''; | ||||||
|  |  | ||||||
|         if (mb_strlen($text) > $length) { |         if (mb_strlen($text) > $length) { | ||||||
|             $text = mb_substr($text, 0, $length - 3) . '...'; |             $text = mb_substr($text, 0, $length - 3) . '...'; | ||||||
|   | |||||||
| @@ -3,13 +3,13 @@ | |||||||
| namespace BookStack\Entities\Models; | namespace BookStack\Entities\Models; | ||||||
|  |  | ||||||
| use BookStack\Entities\Tools\PageContent; | use BookStack\Entities\Tools\PageContent; | ||||||
|  | use BookStack\Facades\Permissions; | ||||||
| use BookStack\Uploads\Attachment; | use BookStack\Uploads\Attachment; | ||||||
| use Illuminate\Database\Eloquent\Builder; | use Illuminate\Database\Eloquent\Builder; | ||||||
| use Illuminate\Database\Eloquent\Collection; | use Illuminate\Database\Eloquent\Collection; | ||||||
| use Illuminate\Database\Eloquent\Factories\HasFactory; | use Illuminate\Database\Eloquent\Factories\HasFactory; | ||||||
| use Illuminate\Database\Eloquent\Relations\BelongsTo; | use Illuminate\Database\Eloquent\Relations\BelongsTo; | ||||||
| use Illuminate\Database\Eloquent\Relations\HasMany; | use Illuminate\Database\Eloquent\Relations\HasMany; | ||||||
| use Permissions; |  | ||||||
|  |  | ||||||
| /** | /** | ||||||
|  * Class Page. |  * Class Page. | ||||||
| @@ -64,10 +64,8 @@ class Page extends BookChild | |||||||
|  |  | ||||||
|     /** |     /** | ||||||
|      * Check if this page has a chapter. |      * Check if this page has a chapter. | ||||||
|      * |  | ||||||
|      * @return bool |  | ||||||
|      */ |      */ | ||||||
|     public function hasChapter() |     public function hasChapter(): bool | ||||||
|     { |     { | ||||||
|         return $this->chapter()->count() > 0; |         return $this->chapter()->count() > 0; | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -157,8 +157,8 @@ class PageRepo | |||||||
|      */ |      */ | ||||||
|     public function publishDraft(Page $draft, array $input): Page |     public function publishDraft(Page $draft, array $input): Page | ||||||
|     { |     { | ||||||
|         $this->baseRepo->update($draft, $input); |  | ||||||
|         $this->updateTemplateStatusAndContentFromInput($draft, $input); |         $this->updateTemplateStatusAndContentFromInput($draft, $input); | ||||||
|  |         $this->baseRepo->update($draft, $input); | ||||||
|  |  | ||||||
|         $draft->draft = false; |         $draft->draft = false; | ||||||
|         $draft->revision_count = 1; |         $draft->revision_count = 1; | ||||||
|   | |||||||
| @@ -4,7 +4,10 @@ namespace BookStack\Entities\Tools; | |||||||
|  |  | ||||||
| use BookStack\Entities\EntityProvider; | use BookStack\Entities\EntityProvider; | ||||||
| use BookStack\Entities\Models\Entity; | use BookStack\Entities\Models\Entity; | ||||||
|  | use BookStack\Entities\Models\Page; | ||||||
| use BookStack\Entities\Models\SearchTerm; | use BookStack\Entities\Models\SearchTerm; | ||||||
|  | use DOMDocument; | ||||||
|  | use DOMNode; | ||||||
| use Illuminate\Support\Collection; | use Illuminate\Support\Collection; | ||||||
|  |  | ||||||
| class SearchIndex | class SearchIndex | ||||||
| @@ -64,7 +67,8 @@ class SearchIndex | |||||||
|         SearchTerm::query()->truncate(); |         SearchTerm::query()->truncate(); | ||||||
|  |  | ||||||
|         foreach ($this->entityProvider->all() as $entityModel) { |         foreach ($this->entityProvider->all() as $entityModel) { | ||||||
|             $selectFields = ['id', 'name', $entityModel->textField]; |             $indexContentField = $entityModel instanceof Page ? 'html' : 'description'; | ||||||
|  |             $selectFields = ['id', 'name', $indexContentField]; | ||||||
|             $total = $entityModel->newQuery()->withTrashed()->count(); |             $total = $entityModel->newQuery()->withTrashed()->count(); | ||||||
|             $chunkSize = 250; |             $chunkSize = 250; | ||||||
|             $processed = 0; |             $processed = 0; | ||||||
| @@ -93,11 +97,70 @@ class SearchIndex | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     /** |     /** | ||||||
|      * Create a scored term array from the given text. |      * Create a scored term array from the given text, where the keys are the terms | ||||||
|  |      * and the values are their scores. | ||||||
|      * |      * | ||||||
|      * @returns array{term: string, score: float} |      * @returns array<string, int> | ||||||
|      */ |      */ | ||||||
|     protected function generateTermArrayFromText(string $text, int $scoreAdjustment = 1): array |     protected function generateTermScoreMapFromText(string $text, int $scoreAdjustment = 1): array | ||||||
|  |     { | ||||||
|  |         $termMap = $this->textToTermCountMap($text); | ||||||
|  |  | ||||||
|  |         foreach ($termMap as $term => $count) { | ||||||
|  |             $termMap[$term] = $count * $scoreAdjustment; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         return $termMap; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * Create a scored term array from the given HTML, where the keys are the terms | ||||||
|  |      * and the values are their scores. | ||||||
|  |      * | ||||||
|  |      * @returns array<string, int> | ||||||
|  |      */ | ||||||
|  |     protected function generateTermScoreMapFromHtml(string $html): array | ||||||
|  |     { | ||||||
|  |         if (empty($html)) { | ||||||
|  |             return []; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         $scoresByTerm = []; | ||||||
|  |         $elementScoreAdjustmentMap = [ | ||||||
|  |             'h1' => 10, | ||||||
|  |             'h2' => 5, | ||||||
|  |             'h3' => 4, | ||||||
|  |             'h4' => 3, | ||||||
|  |             'h5' => 2, | ||||||
|  |             'h6' => 1.5, | ||||||
|  |         ]; | ||||||
|  |  | ||||||
|  |         $html = '<body>' . $html . '</body>'; | ||||||
|  |         libxml_use_internal_errors(true); | ||||||
|  |         $doc = new DOMDocument(); | ||||||
|  |         $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); | ||||||
|  |  | ||||||
|  |         $topElems = $doc->documentElement->childNodes->item(0)->childNodes; | ||||||
|  |         /** @var DOMNode $child */ | ||||||
|  |         foreach ($topElems as $child) { | ||||||
|  |             $nodeName = $child->nodeName; | ||||||
|  |             $termCounts = $this->textToTermCountMap(trim($child->textContent)); | ||||||
|  |             foreach ($termCounts as $term => $count) { | ||||||
|  |                 $scoreChange = $count * ($elementScoreAdjustmentMap[$nodeName] ?? 1); | ||||||
|  |                 $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + $scoreChange; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         return $scoresByTerm; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * For the given text, return an array where the keys are the unique term words | ||||||
|  |      * and the values are the frequency of that term. | ||||||
|  |      * | ||||||
|  |      * @returns array<string, int> | ||||||
|  |      */ | ||||||
|  |     protected function textToTermCountMap(string $text): array | ||||||
|     { |     { | ||||||
|         $tokenMap = []; // {TextToken => OccurrenceCount} |         $tokenMap = []; // {TextToken => OccurrenceCount} | ||||||
|         $splitChars = " \n\t.,!?:;()[]{}<>`'\""; |         $splitChars = " \n\t.,!?:;()[]{}<>`'\""; | ||||||
| @@ -111,34 +174,61 @@ class SearchIndex | |||||||
|             $token = strtok($splitChars); |             $token = strtok($splitChars); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         $terms = []; |         return $tokenMap; | ||||||
|         foreach ($tokenMap as $token => $count) { |  | ||||||
|             $terms[] = [ |  | ||||||
|                 'term'  => $token, |  | ||||||
|                 'score' => $count * $scoreAdjustment, |  | ||||||
|             ]; |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         return $terms; |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /** |     /** | ||||||
|      * For the given entity, Generate an array of term data details. |      * For the given entity, Generate an array of term data details. | ||||||
|      * Is the raw term data, not instances of SearchTerm models. |      * Is the raw term data, not instances of SearchTerm models. | ||||||
|      * |      * | ||||||
|      * @returns array{term: string, score: float}[] |      * @returns array{term: string, score: float, entity_id: int, entity_type: string}[] | ||||||
|      */ |      */ | ||||||
|     protected function entityToTermDataArray(Entity $entity): array |     protected function entityToTermDataArray(Entity $entity): array | ||||||
|     { |     { | ||||||
|         $nameTerms = $this->generateTermArrayFromText($entity->name, 40 * $entity->searchFactor); |         $nameTermsMap = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor); | ||||||
|         $bodyTerms = $this->generateTermArrayFromText($entity->getText(), 1 * $entity->searchFactor); |  | ||||||
|         $termData = array_merge($nameTerms, $bodyTerms); |  | ||||||
|  |  | ||||||
|         foreach ($termData as $index => $term) { |         if ($entity instanceof Page) { | ||||||
|             $termData[$index]['entity_type'] = $entity->getMorphClass(); |             $bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html); | ||||||
|             $termData[$index]['entity_id'] = $entity->id; |         } else { | ||||||
|  |             $bodyTermsMap = $this->generateTermScoreMapFromText($entity->description, $entity->searchFactor); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         return $termData; |         $mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap); | ||||||
|  |  | ||||||
|  |         $dataArray = []; | ||||||
|  |         $entityId = $entity->id; | ||||||
|  |         $entityType = $entity->getMorphClass(); | ||||||
|  |         foreach ($mergedScoreMap as $term => $score) { | ||||||
|  |             $dataArray[] = [ | ||||||
|  |                 'term' => $term, | ||||||
|  |                 'score' => $score, | ||||||
|  |                 'entity_type' => $entityType, | ||||||
|  |                 'entity_id' => $entityId, | ||||||
|  |             ]; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         return $dataArray; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * For the given term data arrays, Merge their contents by term | ||||||
|  |      * while combining any scores. | ||||||
|  |      * | ||||||
|  |      * @param array<string, int>[] ...$scoreMaps | ||||||
|  |      * | ||||||
|  |      * @returns array<string, int> | ||||||
|  |      */ | ||||||
|  |     protected function mergeTermScoreMaps(...$scoreMaps): array | ||||||
|  |     { | ||||||
|  |         $mergedMap = []; | ||||||
|  |  | ||||||
|  |         foreach ($scoreMaps as $scoreMap) { | ||||||
|  |             foreach ($scoreMap as $term => $score) { | ||||||
|  |                 $mergedMap[$term] = ($mergedMap[$term] ?? 0) + $score; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         return $mergedMap; | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -7,6 +7,7 @@ use BookStack\Entities\Models\Book; | |||||||
| use BookStack\Entities\Models\Bookshelf; | use BookStack\Entities\Models\Bookshelf; | ||||||
| use BookStack\Entities\Models\Chapter; | use BookStack\Entities\Models\Chapter; | ||||||
| use BookStack\Entities\Models\Page; | use BookStack\Entities\Models\Page; | ||||||
|  | use BookStack\Entities\Models\SearchTerm; | ||||||
| use Tests\TestCase; | use Tests\TestCase; | ||||||
|  |  | ||||||
| class EntitySearchTest extends TestCase | class EntitySearchTest extends TestCase | ||||||
| @@ -320,4 +321,43 @@ class EntitySearchTest extends TestCase | |||||||
|         $search->assertElementContains('.entity-list > .page', 'Test page B', 1); |         $search->assertElementContains('.entity-list > .page', 'Test page B', 1); | ||||||
|         $search->assertElementContains('.entity-list > .page', 'Test page A', 2); |         $search->assertElementContains('.entity-list > .page', 'Test page A', 2); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     public function test_terms_in_headers_have_an_adjusted_index_score() | ||||||
|  |     { | ||||||
|  |         $page = $this->newPage(['name' => 'Test page A', 'html' => ' | ||||||
|  |             <p>TermA</p> | ||||||
|  |             <h1>TermB <strong>TermNested</strong></h1> | ||||||
|  |             <h2>TermC</h2> | ||||||
|  |             <h3>TermD</h3> | ||||||
|  |             <h4>TermE</h4> | ||||||
|  |             <h5>TermF</h5> | ||||||
|  |             <h6>TermG</h6> | ||||||
|  |         ']); | ||||||
|  |  | ||||||
|  |         $entityRelationCols = ['entity_id' => $page->id, 'entity_type' => 'BookStack\\Page']; | ||||||
|  |         $scoreByTerm = SearchTerm::query()->where($entityRelationCols)->pluck('score', 'term'); | ||||||
|  |  | ||||||
|  |         $this->assertEquals(1, $scoreByTerm->get('TermA')); | ||||||
|  |         $this->assertEquals(10, $scoreByTerm->get('TermB')); | ||||||
|  |         $this->assertEquals(10, $scoreByTerm->get('TermNested')); | ||||||
|  |         $this->assertEquals(5, $scoreByTerm->get('TermC')); | ||||||
|  |         $this->assertEquals(4, $scoreByTerm->get('TermD')); | ||||||
|  |         $this->assertEquals(3, $scoreByTerm->get('TermE')); | ||||||
|  |         $this->assertEquals(2, $scoreByTerm->get('TermF')); | ||||||
|  |         // Is 1.5 but stored as integer, rounding up | ||||||
|  |         $this->assertEquals(2, $scoreByTerm->get('TermG')); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     public function test_name_and_content_terms_are_merged_to_single_score() | ||||||
|  |     { | ||||||
|  |         $page = $this->newPage(['name' => 'TermA', 'html' => ' | ||||||
|  |             <p>TermA</p> | ||||||
|  |         ']); | ||||||
|  |  | ||||||
|  |         $entityRelationCols = ['entity_id' => $page->id, 'entity_type' => 'BookStack\\Page']; | ||||||
|  |         $scoreByTerm = SearchTerm::query()->where($entityRelationCols)->pluck('score', 'term'); | ||||||
|  |  | ||||||
|  |         // Scores 40 for being in the name then 1 for being in the content | ||||||
|  |         $this->assertEquals(41, $scoreByTerm->get('TermA')); | ||||||
|  |     } | ||||||
| } | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user