mirror of
https://github.com/BookStackApp/BookStack.git
synced 2025-10-22 07:52:19 +03:00
Improved vector text chunking
This commit is contained in:
@@ -141,8 +141,12 @@ class SearchController extends Controller
|
|||||||
return view('entities.list-basic', ['entities' => $entities, 'style' => 'compact']);
|
return view('entities.list-basic', ['entities' => $entities, 'style' => 'compact']);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Perform a vector/LLM-based query search.
|
||||||
|
*/
|
||||||
public function searchQuery(Request $request, VectorSearchRunner $runner)
|
public function searchQuery(Request $request, VectorSearchRunner $runner)
|
||||||
{
|
{
|
||||||
|
// TODO - Validate if query system is active
|
||||||
$query = $request->get('query', '');
|
$query = $request->get('query', '');
|
||||||
|
|
||||||
if ($query) {
|
if ($query) {
|
||||||
|
@@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
namespace BookStack\Search\Vectors;
|
namespace BookStack\Search\Vectors;
|
||||||
|
|
||||||
|
use BookStack\Activity\Models\Tag;
|
||||||
use BookStack\Entities\Models\Entity;
|
use BookStack\Entities\Models\Entity;
|
||||||
use BookStack\Search\Vectors\Services\VectorQueryService;
|
use BookStack\Search\Vectors\Services\VectorQueryService;
|
||||||
use Illuminate\Support\Facades\DB;
|
use Illuminate\Support\Facades\DB;
|
||||||
@@ -47,8 +48,10 @@ class EntityVectorGenerator
|
|||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO - Chunk inserts
|
$chunks = array_chunk($toInsert, 500);
|
||||||
SearchVector::query()->insert($toInsert);
|
foreach ($chunks as $chunk) {
|
||||||
|
SearchVector::query()->insert($chunk);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -69,16 +72,16 @@ class EntityVectorGenerator
|
|||||||
*/
|
*/
|
||||||
protected function chunkText(string $text): array
{
    // Split on the coarsest boundary first (lines, then sentences, then words),
    // with the empty-string delimiter as the final hard-split fallback.
    $delimiters = ["\n", '.', ' ', ''];
    $chunker = new TextChunker(500, $delimiters);

    return $chunker->chunk($text);
}
|
||||||
|
|
||||||
/**
 * Build the plain text for the given entity: its name, then one
 * "name: value" line per tag, then the content of the entity's text field.
 */
protected function entityToPlainText(Entity $entity): string
{
    $tags = $entity->tags()->get();

    // Double quotes are required here: '\n' in single quotes is a literal
    // backslash followed by "n", which would put every tag on a single line.
    $tagText = $tags->map(function (Tag $tag) {
        return $tag->name . ': ' . $tag->value;
    })->join("\n");

    return $entity->name . "\n{$tagText}\n" . $entity->{$entity->textField};
}
|
||||||
}
|
}
|
||||||
|
77
app/Search/Vectors/TextChunker.php
Normal file
77
app/Search/Vectors/TextChunker.php
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace BookStack\Search\Vectors;
|
||||||
|
|
||||||
|
use InvalidArgumentException;
|
||||||
|
|
||||||
|
/**
 * Splits a given string into smaller chunks based on specified delimiters
 * and a predefined maximum chunk size, measured in bytes. Text is split on the
 * first delimiter; any piece still larger than the chunk size is split again on
 * the next delimiter, and so on. The resulting pieces are then packed back
 * together, using the delimiter that originally separated them, for as long as
 * the combined text fits within the chunk size.
 *
 * The last delimiter is always an empty string to ensure text can always be broken down.
 * Delimiters that fall on a chunk boundary are not included in the output, and
 * empty chunks are never returned.
 */
class TextChunker
{
    /**
     * @param int      $chunkSize      Maximum size, in bytes, of any returned chunk.
     * @param string[] $delimiterOrder Delimiters to split on, in order of preference.
     *                                 An empty-string delimiter (hard split) is appended if missing.
     *
     * @throws InvalidArgumentException If the chunk size is lower than 1.
     */
    public function __construct(
        protected int $chunkSize,
        protected array $delimiterOrder,
    ) {
        if ($this->chunkSize < 1) {
            throw new InvalidArgumentException('Chunk size must be greater than 0');
        }

        // Re-index so positional lookups work even when a keyed array is given.
        $this->delimiterOrder = array_values($this->delimiterOrder);

        $lastKey = array_key_last($this->delimiterOrder);
        if ($lastKey === null || $this->delimiterOrder[$lastKey] !== '') {
            $this->delimiterOrder[] = '';
        }
    }

    /**
     * Break the given text down into chunks no larger than the chunk size.
     *
     * @return string[] Non-empty chunks, in their original order.
     */
    public function chunk(string $text): array
    {
        $chunks = [];
        $current = '';
        $pendingJoiner = '';

        foreach ($this->splitIntoPieces($text, 0, '') as [$joiner, $piece]) {
            if ($piece === '') {
                // Empty pieces come from repeated/leading/trailing delimiters.
                // Remember the delimiter so it is kept between real content,
                // but never let it start, end, or solely make up a chunk.
                $pendingJoiner .= $joiner;
                continue;
            }

            $joiner = $pendingJoiner . $joiner;
            $pendingJoiner = '';

            if ($current === '') {
                $current = $piece;
                continue;
            }

            if (strlen($current) + strlen($joiner) + strlen($piece) <= $this->chunkSize) {
                $current .= $joiner . $piece;
            } else {
                $chunks[] = $current;
                $current = $piece;
            }
        }

        if ($current !== '') {
            $chunks[] = $current;
        }

        return $chunks;
    }

    /**
     * Recursively split text into pieces that each fit within the chunk size.
     * Each piece is paired with the delimiter which preceded it in the original text,
     * so that pieces can later be re-joined without altering the content.
     *
     * @param string $text          The text to split.
     * @param int    $level         Index into the delimiter order to split with.
     * @param string $leadingJoiner The delimiter that preceded $text in its parent.
     *
     * @return array<int, array{0: string, 1: string}> List of [joiner, piece] pairs.
     */
    protected function splitIntoPieces(string $text, int $level, string $leadingJoiner): array
    {
        $delimiter = $this->delimiterOrder[$level];
        $parts = ($delimiter === '') ? $this->hardSplit($text) : explode($delimiter, $text);

        $pieces = [];
        $joiner = $leadingJoiner;

        foreach ($parts as $part) {
            if (strlen($part) > $this->chunkSize) {
                // Only reachable for non-empty delimiters, since a hard split never
                // produces oversized parts; a further level therefore always exists.
                foreach ($this->splitIntoPieces($part, $level + 1, $joiner) as $subPiece) {
                    $pieces[] = $subPiece;
                }
            } else {
                $pieces[] = [$joiner, $part];
            }

            $joiner = $delimiter;
        }

        return $pieces;
    }

    /**
     * Cut text into consecutive parts of at most the chunk size, moving a cut
     * back where needed so that it does not land inside a multi-byte UTF-8 character.
     *
     * @return string[]
     */
    protected function hardSplit(string $text): array
    {
        $parts = [];
        $length = strlen($text);
        $offset = 0;

        while ($offset < $length) {
            $limit = min($offset + $this->chunkSize, $length);
            $end = $limit;

            if ($limit < $length) {
                // A UTF-8 character has at most three continuation bytes (10xxxxxx),
                // so step back at most three bytes to reach a character boundary.
                $steps = 0;
                while ($steps < 3 && $end > $offset && $this->isContinuationByte($text[$end])) {
                    $end--;
                    $steps++;
                }

                // No usable boundary (character wider than the chunk size, or the
                // data is not valid UTF-8): fall back to a plain byte cut.
                if ($end === $offset || $this->isContinuationByte($text[$end])) {
                    $end = $limit;
                }
            }

            $parts[] = substr($text, $offset, $end - $offset);
            $offset = $end;
        }

        return $parts;
    }

    /**
     * Check if the given single byte is a UTF-8 continuation byte.
     */
    protected function isContinuationByte(string $byte): bool
    {
        return (ord($byte) & 0xC0) === 0x80;
    }
}
|
47
tests/Search/TextChunkerTest.php
Normal file
47
tests/Search/TextChunkerTest.php
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace Search;
|
||||||
|
|
||||||
|
use BookStack\Search\Vectors\TextChunker;
|
||||||
|
use Tests\TestCase;
|
||||||
|
|
||||||
|
/**
 * Covers the splitting and packing behaviour of the TextChunker.
 */
class TextChunkerTest extends TestCase
{
    public function test_it_chunks_text(): void
    {
        $chunker = new TextChunker(3, []);
        $chunks = $chunker->chunk('123456789');

        // assertSame also checks element types and list keys.
        $this->assertSame(['123', '456', '789'], $chunks);
    }

    public function test_chunk_size_must_be_greater_than_zero(): void
    {
        $this->expectException(\InvalidArgumentException::class);

        new TextChunker(-5, []);
    }

    public function test_chunk_size_of_zero_is_rejected(): void
    {
        $this->expectException(\InvalidArgumentException::class);

        new TextChunker(0, []);
    }

    public function test_it_works_through_given_delimiters(): void
    {
        $chunker = new TextChunker(5, ['-', '.', '']);
        $chunks = $chunker->chunk('12-3456.789abcdefg');

        $this->assertSame(['12', '3456', '789ab', 'cdefg'], $chunks);
    }

    public function test_it_attempts_to_pack_chunks(): void
    {
        $chunker = new TextChunker(8, [' ', '']);
        $chunks = $chunker->chunk('123 456 789 abc def');

        $this->assertSame(['123 456', '789 abc', 'def'], $chunks);
    }

    public function test_it_attempts_to_pack_using_subchunks(): void
    {
        $chunker = new TextChunker(8, [' ', '-', '']);
        $chunks = $chunker->chunk('123 456-789abc');

        $this->assertSame(['123 456', '789abc'], $chunks);
    }
}
|
Reference in New Issue
Block a user