mirror of
https://github.com/BookStackApp/BookStack.git
synced 2025-10-22 07:52:19 +03:00
Improved vector text chunking
This commit is contained in:
@@ -141,8 +141,12 @@ class SearchController extends Controller
|
||||
return view('entities.list-basic', ['entities' => $entities, 'style' => 'compact']);
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform a vector/LLM-based query search.
|
||||
*/
|
||||
public function searchQuery(Request $request, VectorSearchRunner $runner)
|
||||
{
|
||||
// TODO - Validate if query system is active
|
||||
$query = $request->get('query', '');
|
||||
|
||||
if ($query) {
|
||||
|
@@ -2,6 +2,7 @@
|
||||
|
||||
namespace BookStack\Search\Vectors;
|
||||
|
||||
use BookStack\Activity\Models\Tag;
|
||||
use BookStack\Entities\Models\Entity;
|
||||
use BookStack\Search\Vectors\Services\VectorQueryService;
|
||||
use Illuminate\Support\Facades\DB;
|
||||
@@ -47,8 +48,10 @@ class EntityVectorGenerator
|
||||
];
|
||||
}
|
||||
|
||||
// TODO - Chunk inserts
|
||||
SearchVector::query()->insert($toInsert);
|
||||
$chunks = array_chunk($toInsert, 500);
|
||||
foreach ($chunks as $chunk) {
|
||||
SearchVector::query()->insert($chunk);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -69,16 +72,16 @@ class EntityVectorGenerator
|
||||
*/
|
||||
protected function chunkText(string $text): array
{
    // Delegate to TextChunker so that small adjacent sections are packed together
    // into chunks of up to 500 in size. Delimiters are tried in order: line breaks,
    // then sentence ends, then spaces, and finally a hard split (empty string).
    return (new TextChunker(500, ["\n", '.', ' ', '']))->chunk($text);
}
|
||||
|
||||
/**
 * Build the plain text used for vector generation from the given entity.
 * The result is the entity name, followed by one "name: value" line per tag,
 * followed by the content of the entity's text field.
 */
protected function entityToPlainText(Entity $entity): string
{
    $tags = $entity->tags()->get();
    $tagText = $tags->map(function (Tag $tag) {
        return $tag->name . ': ' . $tag->value;
    })->join("\n"); // Double quotes are required: '\n' would be a literal backslash + "n"

    return $entity->name . "\n{$tagText}\n" . $entity->{$entity->textField};
}
|
||||
}
|
||||
|
77
app/Search/Vectors/TextChunker.php
Normal file
77
app/Search/Vectors/TextChunker.php
Normal file
@@ -0,0 +1,77 @@
|
||||
<?php
|
||||
|
||||
namespace BookStack\Search\Vectors;
|
||||
|
||||
use InvalidArgumentException;
|
||||
|
||||
/**
 * Splits a given string into smaller chunks based on specified delimiters
 * and a predefined maximum chunk size. This will work through the given delimiters
 * to break down text further and further to fit into the chunk size, then packs
 * adjacent pieces back together where they fit within a single chunk.
 *
 * The last delimiter is always an empty string to ensure text can always be broken down.
 *
 * Lengths are measured in UTF-8 characters rather than bytes, so multibyte
 * characters are never cut in half by the final hard split.
 */
class TextChunker
{
    /**
     * @param int      $chunkSize      Maximum length of any returned chunk, in UTF-8 characters.
     * @param string[] $delimiterOrder Delimiters to split on, from most to least preferred.
     *                                 An empty string (hard split) is appended if not already last.
     *
     * @throws InvalidArgumentException When the chunk size is less than 1.
     */
    public function __construct(
        protected int $chunkSize,
        protected array $delimiterOrder,
    ) {
        if ($this->chunkSize < 1) {
            throw new InvalidArgumentException('Chunk size must be greater than 0');
        }

        // Normalise to a list so delimiters can be addressed by nesting level.
        $this->delimiterOrder = array_values($this->delimiterOrder);
        if (end($this->delimiterOrder) !== '') {
            $this->delimiterOrder[] = '';
        }
    }

    /**
     * Break the given text down into chunks no longer than the configured chunk size.
     * Delimiters found inside a chunk are kept as they were in the original text, while
     * delimiters at the start or end of a chunk are dropped. Empty chunks are never returned.
     *
     * @return string[]
     */
    public function chunk(string $text): array
    {
        $chunks = [];
        $current = [];      // Pieces gathered for the chunk being built
        $currentLength = 0; // Length of those pieces, including their trailing delimiters

        foreach ($this->splitIntoPieces($text, 0) as [$piece, $trailing]) {
            $pieceLength = mb_strlen($piece, 'UTF-8');

            // Never start a chunk with an empty piece, since that would only
            // contribute a leading delimiter.
            if ($current === [] && $pieceLength === 0) {
                continue;
            }

            if ($current !== [] && $currentLength + $pieceLength > $this->chunkSize) {
                $chunks[] = $this->joinPieces($current);
                $current = [];
                $currentLength = 0;

                if ($pieceLength === 0) {
                    continue;
                }
            }

            $current[] = [$piece, $trailing];
            $currentLength += $pieceLength + mb_strlen($trailing, 'UTF-8');
        }

        if ($current !== []) {
            $chunks[] = $this->joinPieces($current);
        }

        return $chunks;
    }

    /**
     * Split the text using the delimiter at the given level, descending to the next
     * delimiter for any part which is still longer than the chunk size.
     * Each returned piece is a [text, trailing delimiter] pair, where the trailing
     * delimiter is whatever followed that text in the original input ('' if nothing,
     * or if the piece was produced by a hard split).
     *
     * @return array<int, array{0: string, 1: string}>
     */
    protected function splitIntoPieces(string $text, int $level): array
    {
        $delimiter = $this->delimiterOrder[$level] ?? '';

        if ($delimiter === '') {
            return array_map(
                static fn (string $part): array => [$part, ''],
                mb_str_split($text, $this->chunkSize, 'UTF-8'),
            );
        }

        $parts = explode($delimiter, $text);
        $lastIndex = count($parts) - 1;
        $pieces = [];

        foreach ($parts as $index => $part) {
            $trailing = ($index < $lastIndex) ? $delimiter : '';

            if (mb_strlen($part, 'UTF-8') <= $this->chunkSize) {
                $pieces[] = [$part, $trailing];
                continue;
            }

            // The part is longer than the chunk size (hence non-empty), so the
            // deeper split always provides at least one piece. The final one of
            // those is followed, in the original text, by this level's delimiter.
            $subPieces = $this->splitIntoPieces($part, $level + 1);
            $subPieces[count($subPieces) - 1][1] = $trailing;
            array_push($pieces, ...$subPieces);
        }

        return $pieces;
    }

    /**
     * Join the pieces of a single chunk back into a string, restoring the original
     * delimiters between pieces while leaving off any trailing delimiters.
     * The first piece provided must be non-empty.
     *
     * @param array<int, array{0: string, 1: string}> $pieces
     */
    protected function joinPieces(array $pieces): string
    {
        // Empty pieces at the end would only contribute trailing delimiters.
        while ($pieces[array_key_last($pieces)][0] === '') {
            array_pop($pieces);
        }

        [$lastPiece] = array_pop($pieces);

        $joined = '';
        foreach ($pieces as [$piece, $trailing]) {
            $joined .= $piece . $trailing;
        }

        return $joined . $lastPiece;
    }
}
|
47
tests/Search/TextChunkerTest.php
Normal file
47
tests/Search/TextChunkerTest.php
Normal file
@@ -0,0 +1,47 @@
|
||||
<?php
|
||||
|
||||
namespace Search;
|
||||
|
||||
use BookStack\Search\Vectors\TextChunker;
|
||||
use Tests\TestCase;
|
||||
|
||||
/**
 * Covers the splitting and packing behaviour of the TextChunker.
 */
class TextChunkerTest extends TestCase
{
    /**
     * With no delimiters given, text is hard-split at the chunk size.
     */
    public function test_it_chunks_text(): void
    {
        $result = (new TextChunker(3, []))->chunk('123456789');

        $this->assertEquals(['123', '456', '789'], $result);
    }

    /**
     * Construction is refused for a chunk size below one.
     */
    public function test_chunk_size_must_be_greater_than_zero(): void
    {
        $this->expectException(\InvalidArgumentException::class);

        new TextChunker(-5, []);
    }

    /**
     * Each delimiter is applied in turn until the pieces fit the chunk size.
     */
    public function test_it_works_through_given_delimiters(): void
    {
        $result = (new TextChunker(5, ['-', '.', '']))->chunk('12-3456.789abcdefg');

        $this->assertEquals(['12', '3456', '789ab', 'cdefg'], $result);
    }

    /**
     * Neighbouring pieces are combined when they fit within a single chunk.
     */
    public function test_it_attempts_to_pack_chunks(): void
    {
        $result = (new TextChunker(8, [' ', '']))->chunk('123 456 789 abc def');

        $this->assertEquals(['123 456', '789 abc', 'def'], $result);
    }

    /**
     * Pieces produced by a deeper delimiter are packed with earlier pieces.
     */
    public function test_it_attempts_to_pack_using_subchunks(): void
    {
        $result = (new TextChunker(8, [' ', '-', '']))->chunk('123 456-789abc');

        $this->assertEquals(['123 456', '789abc'], $result);
    }
}
|
Reference in New Issue
Block a user