Skip to content

Commit

Permalink
refactor: split Document into TextDocument and VectorDocument
Browse files Browse the repository at this point in the history
  • Loading branch information
chr-hertel committed Oct 3, 2024
1 parent 2552463 commit 5aab0d7
Show file tree
Hide file tree
Showing 15 changed files with 85 additions and 116 deletions.
4 changes: 2 additions & 2 deletions examples/store-mongodb-similarity-search.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

use MongoDB\Client as MongoDBClient;
use PhpLlm\LlmChain\Chain;
use PhpLlm\LlmChain\Document\Document;
use PhpLlm\LlmChain\Document\Metadata;
use PhpLlm\LlmChain\Document\TextDocument;
use PhpLlm\LlmChain\DocumentEmbedder;
use PhpLlm\LlmChain\Message\Message;
use PhpLlm\LlmChain\Message\MessageBag;
Expand Down Expand Up @@ -46,7 +46,7 @@

// create embeddings and documents
foreach ($movies as $movie) {
$documents[] = new Document(
$documents[] = new TextDocument(
id: Uuid::v4(),
text: $movie['title'].' '.$movie['description'],
metadata: new Metadata($movie),
Expand Down
4 changes: 2 additions & 2 deletions examples/store-pinecone-similarity-search.php
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
<?php

use PhpLlm\LlmChain\Chain;
use PhpLlm\LlmChain\Document\Document;
use PhpLlm\LlmChain\Document\Metadata;
use PhpLlm\LlmChain\Document\TextDocument;
use PhpLlm\LlmChain\DocumentEmbedder;
use PhpLlm\LlmChain\Message\Message;
use PhpLlm\LlmChain\Message\MessageBag;
Expand Down Expand Up @@ -40,7 +40,7 @@

// create embeddings and documents
foreach ($movies as $movie) {
$documents[] = new Document(
$documents[] = new TextDocument(
id: Uuid::v4(),
text: 'Title: '.$movie['title'].PHP_EOL.'Director: '.$movie['director'].PHP_EOL.$movie['description'],
metadata: new Metadata($movie),
Expand Down
19 changes: 0 additions & 19 deletions src/Document/EmbeddedDocument.php

This file was deleted.

12 changes: 1 addition & 11 deletions src/Document/Document.php → src/Document/TextDocument.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
use Symfony\Component\Uid\Uuid;
use Webmozart\Assert\Assert;

readonly class Document
readonly class TextDocument
{
public function __construct(
public Uuid $id,
Expand All @@ -16,14 +16,4 @@ public function __construct(
) {
Assert::stringNotEmpty(trim($this->text));
}

public function withVector(Vector $vector): EmbeddedDocument
{
return new EmbeddedDocument(
$this->id,
$this->text,
$vector,
$this->metadata,
);
}
}
17 changes: 17 additions & 0 deletions src/Document/VectorDocument.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?php

declare(strict_types=1);

namespace PhpLlm\LlmChain\Document;

use Symfony\Component\Uid\Uuid;

/**
 * Immutable value object pairing a document identifier with its vector and metadata.
 *
 * Unlike a text document, this class has no text property: it only exposes
 * the id, the vector and the metadata passed to the constructor.
 */
final readonly class VectorDocument
{
/**
 * @param Uuid     $id       identifier of the document
 * @param Vector   $vector   vector associated with the document
 * @param Metadata $metadata metadata attached to the document; defaults to a Metadata built from an empty array
 */
public function __construct(
public Uuid $id,
public Vector $vector,
public Metadata $metadata = new Metadata([]),
) {
}
}
17 changes: 9 additions & 8 deletions src/DocumentEmbedder.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

namespace PhpLlm\LlmChain;

use PhpLlm\LlmChain\Document\Document;
use PhpLlm\LlmChain\Document\TextDocument;
use PhpLlm\LlmChain\Document\VectorDocument;
use PhpLlm\LlmChain\Store\StoreInterface;
use Psr\Log\LoggerInterface;
use Psr\Log\NullLogger;
Expand All @@ -25,11 +26,11 @@ public function __construct(
}

/**
* @param Document|list<Document> $documents
* @param TextDocument|TextDocument[] $documents
*/
public function embed(Document|array $documents, int $chunkSize = 0, int $sleep = 0): void
public function embed(TextDocument|array $documents, int $chunkSize = 0, int $sleep = 0): void
{
if ($documents instanceof Document) {
if ($documents instanceof TextDocument) {
$documents = [$documents];
}

Expand All @@ -42,14 +43,14 @@ public function embed(Document|array $documents, int $chunkSize = 0, int $sleep
$chunks = 0 !== $chunkSize ? array_chunk($documents, $chunkSize) : [$documents];

foreach ($chunks as $chunk) {
$vectors = $this->embeddings->multiCreate(array_map(fn (Document $document) => $document->text, $chunk));
$vectors = $this->embeddings->multiCreate(array_map(fn (TextDocument $document) => $document->text, $chunk));

$embeddedDocuments = [];
$vectorDocuments = [];
foreach ($chunk as $i => $document) {
$embeddedDocuments[] = $document->withVector($vectors[$i]);
$vectorDocuments[] = new VectorDocument($document->id, $vectors[$i], $document->metadata);
}

$this->store->addDocuments($embeddedDocuments);
$this->store->add($vectorDocuments);

if (0 !== $sleep) {
$this->clock->sleep($sleep);
Expand Down
22 changes: 8 additions & 14 deletions src/Store/Azure/SearchStore.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

namespace PhpLlm\LlmChain\Store\Azure;

use PhpLlm\LlmChain\Document\EmbeddedDocument;
use PhpLlm\LlmChain\Document\Metadata;
use PhpLlm\LlmChain\Document\Vector;
use PhpLlm\LlmChain\Document\VectorDocument;
use PhpLlm\LlmChain\Store\VectorStoreInterface;
use Symfony\Component\Uid\Uuid;
use Symfony\Contracts\HttpClient\HttpClientInterface;
Expand All @@ -26,21 +26,17 @@ public function __construct(
) {
}

public function addDocument(EmbeddedDocument $document): void
public function add(array|VectorDocument $documents): void
{
$this->addDocuments([$document]);
}
if ($documents instanceof VectorDocument) {
$documents = [$documents];
}

public function addDocuments(array $documents): void
{
$this->request('index', [
'value' => array_map([$this, 'convertDocumentToIndexableArray'], $documents),
]);
}

/**
* @return list<EmbeddedDocument>
*/
public function query(Vector $vector, array $options = []): array
{
$result = $this->request('search', [
Expand Down Expand Up @@ -73,23 +69,21 @@ private function request(string $endpoint, array $payload): array
/**
* @return array<string, mixed>
*/
private function convertDocumentToIndexableArray(EmbeddedDocument $document): array
private function convertDocumentToIndexableArray(VectorDocument $document): array
{
return array_merge([
'id' => $document->id,
$this->vectorFieldName => $document->vector->getData(),
'text' => $document->text,
], $document->metadata->getArrayCopy());
}

/**
* @param array<string, mixed> $data
*/
private function convertArrayToDocument(array $data): EmbeddedDocument
private function convertArrayToDocument(array $data): VectorDocument
{
return new EmbeddedDocument(
return new VectorDocument(
id: Uuid::fromString($data['id']),
text: $data['text'],
vector: $data[$this->vectorFieldName] ? new Vector($data[$this->vectorFieldName]) : null,
metadata: new Metadata($data),
);
Expand Down
14 changes: 6 additions & 8 deletions src/Store/ChromaDB/Store.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
namespace PhpLlm\LlmChain\Store\ChromaDB;

use Codewithkyrian\ChromaDB\Client;
use PhpLlm\LlmChain\Document\EmbeddedDocument;
use PhpLlm\LlmChain\Document\Metadata;
use PhpLlm\LlmChain\Document\Vector;
use PhpLlm\LlmChain\Document\VectorDocument;
use PhpLlm\LlmChain\Store\VectorStoreInterface;
use Symfony\Component\Uid\Uuid;

Expand All @@ -19,13 +19,12 @@ public function __construct(
) {
}

public function addDocument(EmbeddedDocument $document): void
public function add(array|VectorDocument $documents): void
{
$this->addDocuments([$document]);
}
if ($documents instanceof VectorDocument) {
$documents = [$documents];
}

public function addDocuments(array $documents): void
{
$ids = [];
$vectors = [];
$metadata = [];
Expand All @@ -49,9 +48,8 @@ public function query(Vector $vector, array $options = []): array

$documents = [];
for ($i = 0; $i < count($queryResponse->metadatas[0]); ++$i) {
$documents[] = new EmbeddedDocument(
$documents[] = new VectorDocument(
id: Uuid::fromString($queryResponse->ids[0][$i]),
text: '???',
vector: new Vector($queryResponse->embeddings[0][$i]),
metadata: new Metadata($queryResponse->metadatas[0][$i]),
);
Expand Down
17 changes: 6 additions & 11 deletions src/Store/MongoDB/Store.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
use MongoDB\Client;
use MongoDB\Collection;
use MongoDB\Driver\Exception\CommandException;
use PhpLlm\LlmChain\Document\EmbeddedDocument;
use PhpLlm\LlmChain\Document\Metadata;
use PhpLlm\LlmChain\Document\Vector;
use PhpLlm\LlmChain\Document\VectorDocument;
use PhpLlm\LlmChain\Exception\InvalidArgumentException;
use PhpLlm\LlmChain\Store\InitializableStoreInterface;
use PhpLlm\LlmChain\Store\VectorStoreInterface;
Expand Down Expand Up @@ -61,13 +61,12 @@ public function __construct(
) {
}

public function addDocument(EmbeddedDocument $document): void
public function add(array|VectorDocument $documents): void
{
$this->addDocuments([$document]);
}
if ($documents instanceof VectorDocument) {
$documents = [$documents];
}

public function addDocuments(array $documents): void
{
$operations = [];

foreach ($documents as $document) {
Expand All @@ -76,7 +75,6 @@ public function addDocuments(array $documents): void
array_filter([
'metadata' => $document->metadata->getArrayCopy(),
$this->vectorFieldName => $document->vector->getData(),
'text' => $document->text,
]),
['upsert' => true], // insert if not exists
];
Expand All @@ -100,8 +98,6 @@ public function addDocuments(array $documents): void
* numCandidates?: positive-int,
* filter?: array<mixed>
* } $options
*
* @return EmbeddedDocument[]
*/
public function query(Vector $vector, array $options = []): array
{
Expand All @@ -125,9 +121,8 @@ public function query(Vector $vector, array $options = []): array
$documents = [];

foreach ($results as $result) {
$documents[] = new EmbeddedDocument(
$documents[] = new VectorDocument(
id: $this->toUuid($result['_id']),
text: $result['text'],
vector: new Vector($result[$this->vectorFieldName]),
metadata: new Metadata($result['metadata'] ?? []),
);
Expand Down
15 changes: 6 additions & 9 deletions src/Store/Pinecone/Store.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

namespace PhpLlm\LlmChain\Store\Pinecone;

use PhpLlm\LlmChain\Document\EmbeddedDocument;
use PhpLlm\LlmChain\Document\Metadata;
use PhpLlm\LlmChain\Document\Vector;
use PhpLlm\LlmChain\Document\VectorDocument;
use PhpLlm\LlmChain\Store\VectorStoreInterface;
use Probots\Pinecone\Client;
use Probots\Pinecone\Resources\Data\VectorResource;
Expand All @@ -25,19 +25,17 @@ public function __construct(
) {
}

public function addDocument(EmbeddedDocument $document): void
public function add(array|VectorDocument $documents): void
{
$this->addDocuments([$document]);
}
if ($documents instanceof VectorDocument) {
$documents = [$documents];
}

public function addDocuments(array $documents): void
{
$vectors = [];
foreach ($documents as $document) {
$vectors[] = [
'id' => (string) $document->id,
'values' => $document->vector->getData(),
'text' => $document->text,
'metadata' => $document->metadata->getArrayCopy(),
];
}
Expand All @@ -61,9 +59,8 @@ public function query(Vector $vector, array $options = []): array

$documents = [];
foreach ($response->json()['matches'] as $match) {
$documents[] = new EmbeddedDocument(
$documents[] = new VectorDocument(
id: Uuid::fromString($match['id']),
text: $match['text'],
vector: new Vector($match['values']),
metadata: new Metadata($match['metadata']),
);
Expand Down
8 changes: 3 additions & 5 deletions src/Store/StoreInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,12 @@

namespace PhpLlm\LlmChain\Store;

use PhpLlm\LlmChain\Document\EmbeddedDocument;
use PhpLlm\LlmChain\Document\VectorDocument;

interface StoreInterface
{
public function addDocument(EmbeddedDocument $document): void;

/**
* @param EmbeddedDocument[] $documents
* @param VectorDocument|VectorDocument[] $documents
*/
public function addDocuments(array $documents): void;
public function add(VectorDocument|array $documents): void;
}
4 changes: 2 additions & 2 deletions src/Store/VectorStoreInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@

namespace PhpLlm\LlmChain\Store;

use PhpLlm\LlmChain\Document\EmbeddedDocument;
use PhpLlm\LlmChain\Document\Vector;
use PhpLlm\LlmChain\Document\VectorDocument;

interface VectorStoreInterface extends StoreInterface
{
/**
* @param array<string, mixed> $options
*
* @return EmbeddedDocument[]
* @return VectorDocument[]
*/
public function query(Vector $vector, array $options = []): array;
}
Loading

0 comments on commit 5aab0d7

Please sign in to comment.