Skip to content

Commit

Permalink
Add MongoDB vector store
Browse files Browse the repository at this point in the history
  • Loading branch information
OskarStark committed Sep 26, 2024
1 parent 04b4abb commit 420559d
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/pipeline.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
name: pipeline
on: pull_request

env:
REQUIRED_PHP_EXTENSIONS: 'mongodb'

jobs:
tests:
runs-on: ubuntu-latest
Expand All @@ -16,6 +19,8 @@ jobs:
uses: shivammathur/setup-php@v2
with:
php-version: ${{ matrix.php }}
coverage: "none"
extensions: "${{ env.REQUIRED_PHP_EXTENSIONS }}"

- name: Install Composer
uses: "ramsey/composer-install@v3"
Expand All @@ -41,6 +46,8 @@ jobs:
uses: shivammathur/setup-php@v2
with:
php-version: '8.2'
coverage: "none"
extensions: "${{ env.REQUIRED_PHP_EXTENSIONS }}"

- name: Install Composer
uses: "ramsey/composer-install@v3"
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ Supported Stores

* [x] [ChromaDB](https://trychroma.com)
* [x] [Azure AI Search](https://azure.microsoft.com/en-us/products/ai-services/ai-search)
* [x] [MongoDB Atlas Search](https://mongodb.com/products/platform/atlas-vector-search)
* [ ] [Pinecone](https://pinecone.io)

Provided Tools
Expand Down
2 changes: 2 additions & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
},
"require-dev": {
"codewithkyrian/chromadb-php": "^0.2.1",
"mongodb/mongodb": "^1.19",
"php-cs-fixer/shim": "^3.64",
"phpstan/phpstan": "^1.12",
"phpunit/phpunit": "^11.3",
Expand All @@ -36,6 +37,7 @@
"symfony/var-dumper": "^6.4 || ^7.1"
},
"suggest": {
"mongodb/mongodb": "For using MongoDB Atlas as retrieval vector store.",
"codewithkyrian/chromadb-php": "For using the ChromaDB as retrieval vector store.",
"symfony/clock": "For using the clock tool.",
"symfony/css-selector": "For using the YouTube transcription tool.",
Expand Down
152 changes: 152 additions & 0 deletions src/Store/MongoDB/Store.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
<?php

declare(strict_types=1);

namespace PhpLlm\LlmChain\Store\MongoDB;

use MongoDB\BSON\Binary;
use MongoDB\Client;
use MongoDB\Collection;
use PhpLlm\LlmChain\Document\Document;
use PhpLlm\LlmChain\Document\Metadata;
use PhpLlm\LlmChain\Document\Vector;
use PhpLlm\LlmChain\Store\VectorStoreInterface;
use Psr\Log\LoggerInterface;
use Symfony\Component\Uid\Uuid;

/**
* @see https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-overview/
*
* For this store you need to create a separate MongoDB Atlas Search index.
* The index needs to be created with the following settings:
* {
* "fields": [
* {
* "numDimensions": 1536,
* "path": "vector",
* "similarity": "euclidean",
* "type": "vector"
* }
* ]
* }
*
* Note that the `path` key must match the value of $vectorFieldName.
*
* For the `similarity` key you can choose between `euclidean`, `cosine` and `dotProduct`.
* {@see https://www.mongodb.com/docs/atlas/atlas-search/field-types/knn-vector/#define-the-index-for-the-fts-field-type-type}
*
* @author Oskar Stark <[email protected]>
*/
final readonly class Store implements VectorStoreInterface
{
    /**
     * @param Client          $client          MongoDB client used to select the target collection
     * @param LoggerInterface $logger          Receives a warning for every document that is stored without a vector
     * @param string          $databaseName    The name of the database
     * @param string          $collectionName  The name of the collection
     * @param string          $indexName       The name of the Atlas Search index
     * @param string          $vectorFieldName The name of the field in the index that contains the vector
     * @param bool            $bulkWrite       Use bulk write operations
     */
    public function __construct(
        private Client $client,
        private LoggerInterface $logger,
        private string $databaseName,
        private string $collectionName,
        private string $indexName,
        private string $vectorFieldName = 'vector',
        private bool $bulkWrite = false,
    ) {
    }

    /**
     * Stores a single document; shorthand for addDocuments() with a one-element list.
     */
    public function addDocument(Document $document): void
    {
        $this->addDocuments([$document]);
    }

    /**
     * Upserts the given documents into the collection, keyed by their UUID.
     *
     * Depending on the $bulkWrite constructor flag, the documents are written either one by one via
     * replaceOne() or collected and sent as a single bulkWrite() call.
     *
     * A document without a vector is reported through the logger and stored without the vector field.
     *
     * @param Document[] $documents
     */
    public function addDocuments(array $documents): void
    {
        // Nothing to write; this also avoids calling bulkWrite() with an empty operations list,
        // which the MongoDB library rejects.
        if ([] === $documents) {
            return;
        }

        $collection = $this->getCollection();
        $operations = [];

        foreach ($documents as $document) {
            $hasVector = $document->hasVector();

            if (!$hasVector) {
                $this->logger->warning('Document {id} does not have a vector', ['id' => $document->id]);
            }

            $operation = [
                ['_id' => $this->toBinary($document->id)], // we use binary for the id, because of storage efficiency
                // array_filter() drops empty values, so a missing vector simply omits the field
                array_filter([
                    'metadata' => $document->metadata,
                    $this->vectorFieldName => $hasVector ? $document->vector->getData() : null,
                    'text' => $document->text,
                ]),
                ['upsert' => true], // insert if not exists
            ];

            if ($this->bulkWrite) {
                $operations[] = ['replaceOne' => $operation];

                continue;
            }

            $collection->replaceOne(...$operation);
        }

        if ($this->bulkWrite) {
            $collection->bulkWrite($operations);
        }
    }

    /**
     * Runs a `$vectorSearch` aggregation against the configured index and maps the hits to documents.
     *
     * The defaults (`numCandidates` 200, `limit` 5) are merged with $options, so any key passed in
     * $options overrides the corresponding default of the `$vectorSearch` stage.
     *
     * Each hit is rebuilt through Vector::create1536(); NOTE(review): this presumably ties the store
     * to 1536-dimensional embeddings, matching the `numDimensions` of the index - confirm.
     *
     * @param array{
     *     limit?: positive-int,
     *     numCandidates?: positive-int,
     *     filter?: array<mixed>
     * } $options
     *
     * @return Document[]
     */
    public function query(Vector $vector, array $options = []): array
    {
        $results = $this->getCollection()->aggregate([
            [
                '$vectorSearch' => array_merge([
                    'index' => $this->indexName,
                    'path' => $this->vectorFieldName,
                    'queryVector' => $vector->getData(),
                    'numCandidates' => 200,
                    'limit' => 5,
                ], $options),
            ],
            [
                // expose the similarity score of each hit as a regular field
                '$addFields' => [
                    'score' => ['$meta' => 'vectorSearchScore'],
                ],
            ],
        ], ['typeMap' => ['root' => 'array', 'document' => 'array', 'array' => 'array']]);

        $documents = [];

        foreach ($results as $result) {
            $documents[] = Document::fromVector(
                Vector::create1536($result[$this->vectorFieldName]),
                $this->toUuid($result['_id']),
                new Metadata($result['metadata'] ?? []),
            );
        }

        return $documents;
    }

    /**
     * Selects the configured collection on the client.
     */
    private function getCollection(): Collection
    {
        return $this->client->selectCollection($this->databaseName, $this->collectionName);
    }

    /**
     * Wraps the binary form of the UUID in a BSON binary of subtype UUID, used as the `_id`.
     */
    private function toBinary(Uuid $uuid): Binary
    {
        return new Binary($uuid->toBinary(), Binary::TYPE_UUID);
    }

    /**
     * Converts a BSON binary `_id` (as written by toBinary()) back into a Uuid.
     */
    private function toUuid(Binary $binary): Uuid
    {
        return Uuid::fromString($binary->getData());
    }
}

0 comments on commit 420559d

Please sign in to comment.