1
0
mirror of https://github.com/BookStackApp/BookStack.git synced 2025-10-22 07:52:19 +03:00

Improved vector text chunking

This commit is contained in:
Dan Brown
2025-08-19 11:04:14 +01:00
parent e611b3239e
commit 54f883e815
4 changed files with 140 additions and 9 deletions

View File

@@ -141,8 +141,12 @@ class SearchController extends Controller
return view('entities.list-basic', ['entities' => $entities, 'style' => 'compact']); return view('entities.list-basic', ['entities' => $entities, 'style' => 'compact']);
} }
/**
* Perform a vector/LLM-based query search.
*/
public function searchQuery(Request $request, VectorSearchRunner $runner) public function searchQuery(Request $request, VectorSearchRunner $runner)
{ {
// TODO - Validate if query system is active
$query = $request->get('query', ''); $query = $request->get('query', '');
if ($query) { if ($query) {

View File

@@ -2,6 +2,7 @@
namespace BookStack\Search\Vectors; namespace BookStack\Search\Vectors;
use BookStack\Activity\Models\Tag;
use BookStack\Entities\Models\Entity; use BookStack\Entities\Models\Entity;
use BookStack\Search\Vectors\Services\VectorQueryService; use BookStack\Search\Vectors\Services\VectorQueryService;
use Illuminate\Support\Facades\DB; use Illuminate\Support\Facades\DB;
@@ -47,8 +48,10 @@ class EntityVectorGenerator
]; ];
} }
// TODO - Chunk inserts $chunks = array_chunk($toInsert, 500);
SearchVector::query()->insert($toInsert); foreach ($chunks as $chunk) {
SearchVector::query()->insert($chunk);
}
} }
/** /**
@@ -69,16 +72,16 @@ class EntityVectorGenerator
*/ */
protected function chunkText(string $text): array
{
    // Delimiters are tried in order: new lines first, then sentence ends,
    // then spaces. The trailing empty string lets the chunker fall back
    // to splitting at the size limit when no delimiter helps.
    $delimiters = ["\n", '.', ' ', ''];
    $chunker = new TextChunker(500, $delimiters);

    return $chunker->chunk($text);
}
/**
 * Build the plain text used for vector generation for the given entity.
 * The result is the entity name, then one "name: value" line per tag,
 * then the content of the entity's text field.
 */
protected function entityToPlainText(Entity $entity): string
{
    $tags = $entity->tags()->get();
    $tagText = $tags->map(function (Tag $tag) {
        return $tag->name . ': ' . $tag->value;
    })->join("\n"); // Double quotes are required: '\n' is a literal backslash + n, not a line break.

    return $entity->name . "\n{$tagText}\n" . $entity->{$entity->textField};
}
} }

View File

@@ -0,0 +1,77 @@
<?php
namespace BookStack\Search\Vectors;
use InvalidArgumentException;
/**
 * Splits a given string into smaller chunks based on specified delimiters
 * and a predefined maximum chunk size. This will work through the given delimiters
 * to break down text further and further to fit into the chunk size.
 *
 * The last delimiter is always an empty string to ensure text can always be broken down.
 *
 * Chunk sizes are measured in bytes. Every returned chunk is a verbatim slice
 * of the input text: adjacent pieces are only ever re-joined using the exact
 * separator that sat between them originally.
 */
class TextChunker
{
    /**
     * @param int      $chunkSize      Maximum size of a returned chunk, in bytes.
     * @param string[] $delimiterOrder Delimiters to split on, in order of preference.
     *                                 An empty string (split purely by size) is appended
     *                                 when the list does not already end with one.
     *
     * @throws InvalidArgumentException When the chunk size is below 1.
     */
    public function __construct(
        protected int $chunkSize,
        protected array $delimiterOrder,
    ) {
        if ($this->chunkSize < 1) {
            throw new InvalidArgumentException('Chunk size must be greater than 0');
        }

        // Re-index so positional lookups work even if a keyed/sparse array was given.
        $this->delimiterOrder = array_values($this->delimiterOrder);
        if (count($this->delimiterOrder) === 0 || $this->delimiterOrder[count($this->delimiterOrder) - 1] !== '') {
            $this->delimiterOrder[] = '';
        }
    }

    /**
     * Break the given text into chunks no larger than the configured chunk size.
     *
     * The text is split on the first delimiter; any part still too large is split
     * on the next delimiter, and so on. The resulting pieces are then packed
     * greedily, so each chunk holds as many adjacent pieces as will fit.
     * Separators at the very start or end of a chunk are dropped, and empty
     * chunks are never returned (empty input gives an empty array).
     *
     * @return string[]
     */
    public function chunk(string $text): array
    {
        $chunks = [];
        $current = '';          // Chunk being built
        $pendingSeparator = ''; // Original separator(s) between $current and the next piece

        foreach ($this->splitToSegments($text, 0, '') as [$content, $separator]) {
            if ($content === '') {
                // Produced by leading/consecutive delimiters. Keep the separator so runs
                // such as "..." survive inside a chunk, but never start a chunk with one.
                if ($current !== '') {
                    $pendingSeparator .= $separator;
                }
                continue;
            }

            $candidate = $current . $pendingSeparator . $content;
            if ($current !== '' && strlen($candidate) > $this->chunkSize) {
                // Does not fit: close the current chunk (without its trailing separator)
                // and start a new one from this piece.
                $chunks[] = $current;
                $candidate = $content;
            }

            $current = $candidate;
            $pendingSeparator = $separator;
        }

        if ($current !== '') {
            $chunks[] = $current;
        }

        return $chunks;
    }

    /**
     * Split text on the delimiter at the given level, descending to the next
     * delimiter for any part that is still larger than the chunk size.
     *
     * Yields [content, separator] pairs in text order, where separator is the
     * exact string that followed the content in the original text. The last part
     * of a split inherits $trailing, the separator that followed the whole text.
     *
     * @return iterable<array{0: string, 1: string}>
     */
    protected function splitToSegments(string $text, int $level, string $trailing): iterable
    {
        $delimiter = $this->delimiterOrder[$level] ?? '';
        $parts = ($delimiter === '') ? $this->splitBySize($text) : explode($delimiter, $text);
        $lastIndex = count($parts) - 1;

        foreach ($parts as $index => $part) {
            $separator = ($index === $lastIndex) ? $trailing : $delimiter;

            if ($delimiter !== '' && strlen($part) > $this->chunkSize) {
                yield from $this->splitToSegments($part, $level + 1, $separator);
            } else {
                yield [$part, $separator];
            }
        }
    }

    /**
     * Split text into pieces of at most the chunk size in bytes, without
     * cutting through the middle of a multi-byte UTF-8 character.
     *
     * Falls back to a plain byte cut when no character boundary exists within
     * the allowed size (chunk size smaller than one character, or non-UTF-8 data).
     *
     * @return string[]
     */
    protected function splitBySize(string $text): array
    {
        $pieces = [];
        $length = strlen($text);
        $offset = 0;

        while ($offset < $length) {
            $hardEnd = min($offset + $this->chunkSize, $length);
            $cut = $hardEnd;

            // 0b10xxxxxx marks a UTF-8 continuation byte. If the next piece would
            // start on one, move the cut back to the start of that character.
            while ($cut < $length && $cut > $offset && (ord($text[$cut]) & 0xC0) === 0x80) {
                $cut--;
            }

            if ($cut === $offset) {
                $cut = $hardEnd;
            }

            $pieces[] = substr($text, $offset, $cut - $offset);
            $offset = $cut;
        }

        return $pieces;
    }
}

View File

@@ -0,0 +1,47 @@
<?php
namespace Search;
use BookStack\Search\Vectors\TextChunker;
use Tests\TestCase;
/**
 * Covers the TextChunker: plain size-based splitting, constructor validation,
 * working through delimiters in order, and packing of adjacent pieces.
 */
class TextChunkerTest extends TestCase
{
    /**
     * With no delimiters given, text is cut purely by chunk size.
     */
    public function test_it_chunks_text()
    {
        $result = (new TextChunker(3, []))->chunk('123456789');

        $this->assertEquals(['123', '456', '789'], $result);
    }

    /**
     * A chunk size below 1 is rejected at construction time.
     */
    public function test_chunk_size_must_be_greater_than_zero()
    {
        $this->expectException(\InvalidArgumentException::class);

        new TextChunker(-5, []);
    }

    /**
     * Parts too large after one delimiter are broken down using the next.
     */
    public function test_it_works_through_given_delimiters()
    {
        $input = '12-3456.789abcdefg';
        $expected = ['12', '3456', '789ab', 'cdefg'];

        $result = (new TextChunker(5, ['-', '.', '']))->chunk($input);

        $this->assertEquals($expected, $result);
    }

    /**
     * Neighbouring pieces are combined while they fit in the chunk size.
     */
    public function test_it_attempts_to_pack_chunks()
    {
        $input = '123 456 789 abc def';
        $expected = ['123 456', '789 abc', 'def'];

        $result = (new TextChunker(8, [' ', '']))->chunk($input);

        $this->assertEquals($expected, $result);
    }

    /**
     * Pieces from a sub-split can be packed together with earlier content.
     */
    public function test_it_attempts_to_pack_using_subchunks()
    {
        $input = '123 456-789abc';
        $expected = ['123 456', '789abc'];

        $result = (new TextChunker(8, [' ', '-', '']))->chunk($input);

        $this->assertEquals($expected, $result);
    }
}