Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vector Search & LLM Integration #5552

Open
wants to merge 3 commits into
base: development
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions app/Config/services.php
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,18 @@
// Callback URL for social authentication methods
'callback_url' => env('APP_URL', false),

// LLM Service
// Options: openai
'llm' => env('LLM_SERVICE', ''),

// OpenAI API-compatible service details
'openai' => [
'endpoint' => env('OPENAI_ENDPOINT', 'https://api.openai.com'),
'key' => env('OPENAI_KEY', ''),
'embedding_model' => env('OPENAI_EMBEDDING_MODEL', 'text-embedding-3-small'),
'query_model' => env('OPENAI_QUERY_MODEL', 'gpt-4o'),
],

'github' => [
'client_id' => env('GITHUB_APP_ID', false),
'client_secret' => env('GITHUB_APP_SECRET', false),
Expand Down
46 changes: 46 additions & 0 deletions app/Console/Commands/RegenerateVectorsCommand.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
<?php

namespace BookStack\Console\Commands;

use BookStack\Entities\EntityProvider;
use BookStack\Entities\Models\Entity;
use BookStack\Search\Vectors\SearchVector;
use BookStack\Search\Vectors\StoreEntityVectorsJob;
use Illuminate\Console\Command;

class RegenerateVectorsCommand extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'bookstack:regenerate-vectors';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Re-index vectors for all content in the system';

    /**
     * Execute the console command.
     *
     * Removes every stored search vector, then queues one StoreEntityVectorsJob
     * for each entity of every type returned by the entity provider.
     *
     * @return int Exit code: 0 when jobs were queued, 1 when no LLM service is configured.
     */
    public function handle(EntityProvider $entityProvider): int
    {
        // Without a configured service every queued job would fail, but only
        // after the existing vectors had already been wiped. Bail out first.
        if (!\BookStack\Search\Vectors\VectorQueryServiceProvider::isEnabled()) {
            $this->error('No LLM service is configured; existing vectors have been left untouched.');

            return 1;
        }

        // TODO - Add confirmation before run regarding deletion/time/effort/api-cost etc...
        SearchVector::query()->delete();

        foreach ($entityProvider->all() as $type => $typeInstance) {
            $this->info("Creating jobs to store vectors for {$type} data...");

            // Walk the table in id-ordered pages of 100 so that the full set
            // of entities is never held in memory at once.
            $typeInstance->newQuery()->chunkById(100, static function ($entities): void {
                /** @var Entity[] $entities */
                foreach ($entities as $entity) {
                    dispatch(new StoreEntityVectorsJob($entity));
                }
            });
        }

        return 0;
    }
}
16 changes: 16 additions & 0 deletions app/Search/SearchController.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
use BookStack\Entities\Queries\QueryPopular;
use BookStack\Entities\Tools\SiblingFetcher;
use BookStack\Http\Controller;
use BookStack\Search\Vectors\VectorSearchRunner;
use Illuminate\Http\Request;

class SearchController extends Controller
Expand Down Expand Up @@ -139,4 +140,19 @@ public function searchSiblings(Request $request, SiblingFetcher $siblingFetcher)

return view('entities.list-basic', ['entities' => $entities, 'style' => 'compact']);
}

/**
 * Show the query page, running the given "query" request value through
 * the vector search runner when one was provided.
 * The view receives null results when no query was given.
 */
public function searchQuery(Request $request, VectorSearchRunner $runner)
{
    $searchText = $request->get('query', '');
    $results = $searchText ? $runner->run($searchText) : null;

    return view('search.query', ['results' => $results]);
}
}
14 changes: 13 additions & 1 deletion app/Search/SearchIndex.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
use BookStack\Entities\EntityProvider;
use BookStack\Entities\Models\Entity;
use BookStack\Entities\Models\Page;
use BookStack\Search\Vectors\StoreEntityVectorsJob;
use BookStack\Search\Vectors\VectorQueryServiceProvider;
use BookStack\Util\HtmlDocument;
use DOMNode;
use Illuminate\Database\Eloquent\Builder;
Expand All @@ -25,7 +27,7 @@ class SearchIndex
public static string $softDelimiters = ".-";

/**
 * The entity provider is promoted to a protected property.
 * It is declared exactly once: repeating a promoted parameter
 * in the same signature is not valid PHP.
 */
public function __construct(
    protected EntityProvider $entityProvider,
) {
}

Expand All @@ -37,6 +39,10 @@ public function indexEntity(Entity $entity): void
$this->deleteEntityTerms($entity);
$terms = $this->entityToTermDataArray($entity);
$this->insertTerms($terms);

if (VectorQueryServiceProvider::isEnabled()) {
dispatch(new StoreEntityVectorsJob($entity));
}
}

/**
Expand All @@ -47,9 +53,15 @@ public function indexEntity(Entity $entity): void
public function indexEntities(array $entities): void
{
$terms = [];
$vectorQueryEnabled = VectorQueryServiceProvider::isEnabled();

foreach ($entities as $entity) {
$entityTerms = $this->entityToTermDataArray($entity);
array_push($terms, ...$entityTerms);

if ($vectorQueryEnabled) {
dispatch(new StoreEntityVectorsJob($entity));
}
}

$this->insertTerms($terms);
Expand Down
84 changes: 84 additions & 0 deletions app/Search/Vectors/EntityVectorGenerator.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
<?php

namespace BookStack\Search\Vectors;

use BookStack\Entities\Models\Entity;
use BookStack\Search\Vectors\Services\VectorQueryService;
use Illuminate\Support\Facades\DB;

/**
 * Turns the text of an entity into embedding vectors, via the configured
 * vector query service, and stores them as SearchVector rows.
 */
class EntityVectorGenerator
{
    /**
     * Maximum number of embedding rows written by a single INSERT statement.
     * Each row carries a full vector literal, so batches are kept small to
     * bound the size of any one query.
     */
    protected const INSERT_BATCH_SIZE = 50;

    public function __construct(
        protected VectorQueryServiceProvider $vectorQueryServiceProvider
    ) {
    }

    /**
     * Generate embeddings for the given entity and replace any embeddings
     * previously stored for it.
     * The embeddings are generated before the old rows are deleted, so a failure
     * while calling the service leaves the existing rows in place.
     */
    public function generateAndStore(Entity $entity): void
    {
        $vectorService = $this->vectorQueryServiceProvider->get();

        $text = $this->entityToPlainText($entity);
        $chunks = $this->chunkText($text);
        $embeddings = $this->chunksToEmbeddings($chunks, $vectorService);

        $this->deleteExistingEmbeddingsForEntity($entity);
        $this->storeEmbeddings($embeddings, $chunks, $entity);
    }

    /**
     * Remove all stored vectors which belong to the given entity.
     */
    protected function deleteExistingEmbeddingsForEntity(Entity $entity): void
    {
        SearchVector::query()
            ->where('entity_type', '=', $entity->getMorphClass())
            ->where('entity_id', '=', $entity->id)
            ->delete();
    }

    /**
     * Store the given embeddings, each alongside the text chunk it was built from.
     *
     * @param array<int, array<int, float|int|string>> $embeddings Keyed the same as $textChunks.
     * @param array<int, string> $textChunks
     */
    protected function storeEmbeddings(array $embeddings, array $textChunks, Entity $entity): void
    {
        $toInsert = [];

        foreach ($embeddings as $index => $embedding) {
            // The values originate from a remote service and end up inside a raw
            // SQL expression, so force each one to a float before building the literal.
            $values = implode(',', array_map(static fn ($value): float => (float) $value, $embedding));

            $toInsert[] = [
                'entity_id' => $entity->id,
                'entity_type' => $entity->getMorphClass(),
                // Single quotes keep this a string literal even where the
                // ANSI_QUOTES SQL mode makes double quotes delimit identifiers.
                'embedding' => DB::raw("VEC_FROMTEXT('[" . $values . "]')"),
                'text' => $textChunks[$index],
            ];
        }

        // An empty set produces no batches, and therefore no queries.
        foreach (array_chunk($toInsert, self::INSERT_BATCH_SIZE) as $batch) {
            SearchVector::query()->insert($batch);
        }
    }

    /**
     * Request an embedding for each chunk, keeping the keys of the input.
     *
     * @param string[] $chunks
     * @return array<int, float[]> One embedding per chunk, keyed the same as $chunks.
     */
    protected function chunksToEmbeddings(array $chunks, VectorQueryService $vectorQueryService): array
    {
        $embeddings = [];
        foreach ($chunks as $index => $chunk) {
            $embeddings[$index] = $vectorQueryService->generateEmbeddings($chunk);
        }

        return $embeddings;
    }

    /**
     * Split the text into trimmed, non-empty lines.
     * Original line positions are kept as keys, so the result may have gaps.
     *
     * @return string[]
     */
    protected function chunkText(string $text): array
    {
        // TODO - Join adjacent smaller chunks up
        $lines = array_map('trim', explode("\n", $text));

        // Compare against the empty string explicitly: a plain truthiness
        // filter would also discard a line containing only "0".
        return array_filter($lines, static fn (string $line): bool => $line !== '');
    }

    /**
     * Build the plain text to embed: the entity name, a blank line, then
     * the content of the property named by the entity's textField.
     */
    protected function entityToPlainText(Entity $entity): string
    {
        $text = $entity->name . "\n\n" . $entity->{$entity->textField};
        // TODO - Add tags
        return $text;
    }
}
16 changes: 16 additions & 0 deletions app/Search/Vectors/SearchVector.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?php

namespace BookStack\Search\Vectors;

use Illuminate\Database\Eloquent\Model;

/**
 * Eloquent model for a single stored embedding: one chunk of text
 * from an entity, together with the vector generated for it.
 * NOTE(review): no table name is set here, so this presumably relies on
 * Eloquent's default table naming; confirm against the migration.
 *
 * @property string $entity_type Morph class of the entity the chunk came from.
 * @property int    $entity_id   ID of the entity the chunk came from.
 * @property string $text        The text chunk which was embedded.
 * @property string $embedding   The vector value; written via VEC_FROMTEXT() on insert.
 */
class SearchVector extends Model
{
    // Eloquent should not maintain created_at/updated_at values for these rows.
    public $timestamps = false;
}
66 changes: 66 additions & 0 deletions app/Search/Vectors/Services/OpenAiVectorQueryService.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
<?php

namespace BookStack\Search\Vectors\Services;

use BookStack\Http\HttpRequestService;

/**
 * Vector query service backed by an OpenAI API-compatible HTTP endpoint.
 * Uses the "v1/embeddings" and "v1/chat/completions" routes of that endpoint.
 */
class OpenAiVectorQueryService implements VectorQueryService
{
    protected string $key;
    protected string $endpoint;
    protected string $embeddingModel;
    protected string $queryModel;

    /**
     * @param array $options Service options; the 'key', 'endpoint', 'embedding_model'
     *                       and 'query_model' entries are read, each defaulting to ''.
     */
    public function __construct(
        protected array $options,
        protected HttpRequestService $http,
    ) {
        // TODO - Some kind of validation of options
        $this->key = $this->options['key'] ?? '';
        $this->endpoint = $this->options['endpoint'] ?? '';
        $this->embeddingModel = $this->options['embedding_model'] ?? '';
        $this->queryModel = $this->options['query_model'] ?? '';
    }

    /**
     * Send a JSON request to the configured endpoint, authorised with
     * the configured key, and return the decoded JSON response body.
     *
     * @throws \RuntimeException If the service responds with an HTTP error status,
     *                           or with a body that is not a JSON object/array.
     */
    protected function jsonRequest(string $method, string $uri, array $data): array
    {
        $fullUrl = rtrim($this->endpoint, '/') . '/' . ltrim($uri, '/');
        $client = $this->http->buildClient(30);
        $request = $this->http->jsonRequest($method, $fullUrl, $data)
            ->withHeader('Authorization', 'Bearer ' . $this->key);

        $response = $client->sendRequest($request);
        $status = $response->getStatusCode();
        $decoded = json_decode($response->getBody()->getContents(), true);

        // Error statuses come back as ordinary responses rather than exceptions,
        // so they need to be surfaced here instead of being parsed as a result.
        if ($status >= 400) {
            $detail = is_array($decoded) ? ($decoded['error']['message'] ?? '') : '';
            $suffix = (is_string($detail) && $detail !== '') ? ": {$detail}" : '';

            throw new \RuntimeException("LLM service request to {$uri} failed with status {$status}{$suffix}");
        }

        // json_decode() gives null for invalid JSON and a scalar for a bare
        // JSON value; neither satisfies this method's array return type.
        if (!is_array($decoded)) {
            throw new \RuntimeException("LLM service request to {$uri} returned an unexpected non-JSON response");
        }

        return $decoded;
    }

    /**
     * Generate an embedding vector for the given text using the embedding model.
     *
     * @return float[]
     *
     * @throws \RuntimeException If the response does not contain an embedding.
     */
    public function generateEmbeddings(string $text): array
    {
        $response = $this->jsonRequest('POST', 'v1/embeddings', [
            'input' => $text,
            'model' => $this->embeddingModel,
        ]);

        $embedding = $response['data'][0]['embedding'] ?? null;
        if (!is_array($embedding)) {
            throw new \RuntimeException('LLM service embeddings response did not contain an embedding');
        }

        return $embedding;
    }

    /**
     * Ask the query model to respond to the input using the given context lines.
     * Returns an empty string when the response holds no message content.
     *
     * @param string[] $context
     */
    public function query(string $input, array $context): string
    {
        $formattedContext = implode("\n", $context);

        $response = $this->jsonRequest('POST', 'v1/chat/completions', [
            'model' => $this->queryModel,
            'messages' => [
                [
                    'role' => 'developer',
                    'content' => 'You are a helpful assistant providing search query responses. Be specific, factual and to-the-point in response. Don\'t try to converse or continue the conversation.'
                ],
                [
                    'role' => 'user',
                    'content' => "Provide a response to the below given QUERY using the below given CONTEXT. The CONTEXT is split into parts via lines. Ignore any nonsensical lines of CONTEXT.\nQUERY: {$input}\n\nCONTEXT: {$formattedContext}",
                ]
            ],
        ]);

        return $response['choices'][0]['message']['content'] ?? '';
    }
}
21 changes: 21 additions & 0 deletions app/Search/Vectors/Services/VectorQueryService.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<?php

namespace BookStack\Search\Vectors\Services;

/**
 * Contract for an LLM-backed service which can both create embedding
 * vectors from text and answer a query when given supporting context.
 */
interface VectorQueryService
{
    /**
     * Generate embedding vectors from the given chunk of text.
     *
     * @param string $text The chunk of text to embed.
     * @return float[] The embedding for the given text.
     */
    public function generateEmbeddings(string $text): array;

    /**
     * Query the LLM service using the given user input, and
     * relevant context text retrieved locally via a vector search.
     * Returns the response output text from the LLM.
     *
     * @param string $input The user-provided query text.
     * @param string[] $context Pieces of context text to accompany the query.
     * @return string The response text produced by the LLM.
     */
    public function query(string $input, array $context): string;
}
28 changes: 28 additions & 0 deletions app/Search/Vectors/StoreEntityVectorsJob.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<?php

namespace BookStack\Search\Vectors;

use BookStack\Entities\Models\Entity;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Queue\Queueable;

/**
 * Queued job which generates and stores the search vectors for one entity.
 */
class StoreEntityVectorsJob implements ShouldQueue
{
    use Queueable;

    /**
     * Discard this job, instead of failing it, when its entity can no longer
     * be loaded at the time the job runs (for example, where the entity was
     * deleted after the job was queued).
     * NOTE(review): this only has an effect where the queued entity is stored
     * by reference and re-fetched on run; confirm the Queueable trait in use
     * provides that model serialization.
     *
     * @var bool
     */
    public $deleteWhenMissingModels = true;

    /**
     * Create a new job instance.
     */
    public function __construct(
        protected Entity $entity
    ) {
    }

    /**
     * Execute the job.
     * Hands the entity to the generator, which builds and stores its vectors.
     */
    public function handle(EntityVectorGenerator $generator): void
    {
        $generator->generateAndStore($this->entity);
    }
}
36 changes: 36 additions & 0 deletions app/Search/Vectors/VectorQueryServiceProvider.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?php

namespace BookStack\Search\Vectors;

use BookStack\Http\HttpRequestService;
use BookStack\Search\Vectors\Services\OpenAiVectorQueryService;
use BookStack\Search\Vectors\Services\VectorQueryService;

/**
 * Resolves the vector query service selected by the 'services.llm'
 * config value, and reports whether any such service is configured.
 */
class VectorQueryServiceProvider
{
    public function __construct(
        protected HttpRequestService $http,
    ) {
    }

    /**
     * Build the configured vector query service.
     *
     * @throws \Exception If the configured service name is not a known service.
     */
    public function get(): VectorQueryService
    {
        $service = static::getServiceName();

        return match ($service) {
            'openai' => new OpenAiVectorQueryService(config('services.openai'), $this->http),
            default => throw new \Exception("No '{$service}' LLM service found"),
        };
    }

    /**
     * Get the configured service name, lower-cased and without surrounding whitespace.
     * An unset (null) config value is treated as an empty name.
     */
    protected static function getServiceName(): string
    {
        // Cast before normalising: strtolower() does not accept null without
        // a deprecation notice, and the config value may be absent entirely.
        return strtolower(trim((string) config('services.llm', '')));
    }

    /**
     * Check whether an LLM service name has been configured.
     * A blank name, or a value of "0", is treated as not enabled.
     */
    public static function isEnabled(): bool
    {
        return !empty(static::getServiceName());
    }
}
Loading
Loading