From a2fa8c9e6e0a5ff803f920fb814b6e08225d0fef Mon Sep 17 00:00:00 2001 From: leNEKO Date: Mon, 25 May 2020 15:42:21 +0200 Subject: [PATCH] Aggregator Extractor (#25) * toIterator method * pipeline consume as a Generator * new Accumulator extractor * missingDataExxecption if strict * fun with chaining * better doc * @ArthurHoaro get rid of superfluous if statement * @ArthurHoaro fix type juggling/casting * perf, remove unecessary md5 hash * various cosmetics * Accumulator tests * optimization * documentation * svg schema update * missing link in documentation * svg better * rename to Aggregator * incomplete flag * svg tuning * fix fusion Co-authored-by: Nicolas @ remote --- README.md | 4 +- docs/Extractors/Aggregator.md | 93 ++ docs/Extractors/Collection.md | 8 +- docs/Extractors/Csv.md | 32 +- docs/Extractors/FixedWidth.md | 9 +- docs/Extractors/Json.md | 9 +- docs/Extractors/Query.md | 5 +- docs/Extractors/README.md | 2 +- docs/Extractors/Table.md | 23 +- docs/Extractors/Xml.md | 29 +- docs/README.md | 1 + docs/RunningProcesses.md | 10 +- docs/img/etl.svg | 1203 +++++++++++++++++++++ src/Etl.php | 2 +- src/Exception/IncompleteDataException.php | 15 + src/Exception/InvalidOptionException.php | 15 + src/Exception/UndefinedIndexException.php | 15 + src/Extractors/Aggregator.php | 183 ++++ src/Row.php | 25 + tests/Extractors/AggregatorTest.php | 208 ++++ tests/Usecases/ChainingTest.php | 118 ++ tests/Usecases/data/infos.csv | 4 + tests/Usecases/data/users.csv | 4 + 23 files changed, 1965 insertions(+), 52 deletions(-) create mode 100644 docs/Extractors/Aggregator.md create mode 100644 docs/img/etl.svg create mode 100644 src/Exception/IncompleteDataException.php create mode 100644 src/Exception/InvalidOptionException.php create mode 100644 src/Exception/UndefinedIndexException.php create mode 100644 src/Extractors/Aggregator.php create mode 100644 tests/Extractors/AggregatorTest.php create mode 100644 tests/Usecases/ChainingTest.php create mode 100644 tests/Usecases/data/infos.csv create mode 100644 tests/Usecases/data/users.csv diff --git a/README.md b/README.md index 0bc05c3..ae8755b 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,14 @@ Extract, Transform and Load data using PHP. +![ETL](docs/img/etl.svg) + ## Changelog See the changelog [here](changelog.MD) ## Installation In your application's folder, run: -``` +```shell composer require wizaplace/php-etl ``` diff --git a/docs/Extractors/Aggregator.md b/docs/Extractors/Aggregator.md new file mode 100644 index 0000000..0640dbd --- /dev/null +++ b/docs/Extractors/Aggregator.md @@ -0,0 +1,93 @@ +# Aggregator + +Merge rows from a list of partial data iterators with a matching index. + +```php +# user data from one CSV file +$userDataIterator = (new Etl()) + ->extract( + new Csv(), + 'user_data.csv', + ['columns' => ['id','email', 'name']] + ) + ->toIterator() +; + +# extended info from another source +$extendedInfoIterator = (new Etl()) + ->extract( + new Table(), + 'extended_info', + ['columns' => 'courriel', 'twitter'] + ) + # let's rename 'courriel' to 'email' + ->tranform( + new RenameColumns(), + [ + 'columns' => ['courriel' => 'email'] + ] + ) + ->toIterator() +; + +# merge this two data sources +$mergedData = (new Etl()) + ->extract( + new Aggregator(), + [ + $userDataIterator, + $extendedInfoIterator, + ], + [ + 'index' => ['email'], # common matching index + 'columns' => ['id','email','name','twitter'] + ] + ) + ->load( + new CsvLoader(), + 'completeUserData.csv' + ) + ->run() +; +``` + +## Options + +### Index (required) + +An array of column names common in all data sources + +| Type | Default value | +|-------|---------------| +| array | `null` | + +```php +$options = ['index' => ['email']]; +``` + +### Columns (required) + +A `Row` is yield when all specified columns have been found for the matching index. + +| Type | Default value | +|-------|---------------| +| array | `null` | + +```php +$options = ['columns' => ['id', 'name', 'email']]; +``` + +### Strict + +When all Iterators input are fully consummed, if we have any remaining incomplete rows: + +- if *true*: Throw an `IncompleteDataException` +- if *false*: yield the incomplete remaining `Row` flagged as `incomplete` + +| Type | Default value | +|---------|---------------| +| boolean | `true` | + +```php +$options = ['strict' => false]; +``` \ No newline at end of file diff --git a/docs/Extractors/Collection.md b/docs/Extractors/Collection.md index 035abd4..8b5b76b 100644 --- a/docs/Extractors/Collection.md +++ b/docs/Extractors/Collection.md @@ -9,15 +9,15 @@ $etl->extract($collection, $iterable, $options); > **Tip:** Using generators will decrease memory usage. - ## Options ### Columns + Columns from the iterable item that will be extracted. -| Type | Default value | -|----- | ------------- | -| array | `null` | +| Type | Default value | +|-------|---------------| +| array | `null` | ```php $options = ['columns' => ['id', 'name', 'email']]; diff --git a/docs/Extractors/Csv.md b/docs/Extractors/Csv.md index e590897..ee55a5e 100644 --- a/docs/Extractors/Csv.md +++ b/docs/Extractors/Csv.md @@ -7,22 +7,24 @@ Extracts data from a character-separated values file. $etl->extract($csv, 'path/to/file.csv', $options); ``` - ## Options ### Columns + Columns that will be extracted. If `null`, all columns will be extracted and the first line will be used as the columns names. -| Type | Default value | -|----- | ------------- | -| array | `null` | +| Type | Default value | +|-------|---------------| +| array | `null` | To select which columns will be extracted, use an array with the columns list: + ```php $options = ['columns' => ['id', 'name', 'email']]; ``` To rename the columns, use an associative array where the `key` is the name of the column in the file and the `value` is the name that will be used in the etl process: + ```php $options = ['columns' => [ 'id' => 'id', @@ -32,6 +34,7 @@ $options = ['columns' => [ ``` If your file does not contains the columns names, you may set the name and the index of the columns to extract starting at 1: + ```php $options = ['columns' => [ 'id' => 1, @@ -41,35 +44,38 @@ $options = ['columns' => [ ``` ### Delimiter + Field delimiter (one character only). -| Type | Default value | -|----- | ------------- | -| string | , | +| Type | Default value | +|--------|---------------| +| string | , | ```php $options = ['delimiter' => ';']; ``` ### Enclosure + Field enclosure character (one character only). -| Type | Default value | -|----- | ------------- | -| string | | +| Type | Default value | +|--------|---------------| +| string | | ```php $options = ['enclosure' => '"']; ``` ### Throw error + If the extractor need to throw an exception if it encounters any input issue during the data processing. Default value is set to false to keep backward compatibility. -| Type | Default value | -|----- | ------------- | -| boolean | false | +| Type | Default value | +|---------|---------------| +| boolean | false | ```php $options = ['throwError' => '"']; diff --git a/docs/Extractors/FixedWidth.md b/docs/Extractors/FixedWidth.md index eef7c1a..74658f1 100644 --- a/docs/Extractors/FixedWidth.md +++ b/docs/Extractors/FixedWidth.md @@ -7,17 +7,18 @@ Extracts data from a text file with fields delimited by a fixed number of charac $etl->extract($fixedWidth, 'path/to/file.txt', $options); ``` - ## Options ### Columns (required) + Columns that will be extracted. -| Type | Default value | -|----- | ------------- | -| array | `null` | +| Type | Default value | +|-------|---------------| +| array | `null` | Associative array where the `key` is the name of the column and the `value` is an array containing the start position and the length of the column; + ```php $options = ['columns' => [ 'id' => [0, 5], // Start position is 0 and length is 5. diff --git a/docs/Extractors/Json.md b/docs/Extractors/Json.md index e08caa9..31b3e2a 100644 --- a/docs/Extractors/Json.md +++ b/docs/Extractors/Json.md @@ -7,17 +7,18 @@ Extracts data from a JavaScript Object Notation file. $etl->extract($json, 'path/to/file.json', $options); ``` - ## Options ### Columns + Columns that will be extracted. If `null`, the first level key/value pairs of the object in each iteration will be used. -| Type | Default value | -|----- | ------------- | -| array | `null` | +| Type | Default value | +|-------|---------------| +| array | `null` | For more control over the columns, you may use JSON path: + ```php $options = ['columns' => [ 'id' => '$..bindings[*].id.value', diff --git a/docs/Extractors/Query.md b/docs/Extractors/Query.md index 899765d..25402f0 100644 --- a/docs/Extractors/Query.md +++ b/docs/Extractors/Query.md @@ -7,10 +7,10 @@ Extracts data from a database table using a custom SQL query. $etl->extract($query, 'select * from users', $options); ``` - ## Options ### Connection + Name of the database connection to use. | Type | Default value | @@ -22,6 +22,7 @@ $options = ['connection' => 'app']; ``` ### Bindings + Values to bind to the query statement. | Type | Default value | @@ -29,11 +30,13 @@ Values to bind to the query statement. | array | `[]` | Using prepared statement with named placeholders `select * from users where status = :status`: + ```php $options = ['bindings' => ['status' => 'active']]; ``` Using prepared statement with question mark placeholders `select * from users where status = ?`: + ```php $options = ['bindings' => ['active']]; ``` diff --git a/docs/Extractors/README.md b/docs/Extractors/README.md index a4f176b..c97da69 100644 --- a/docs/Extractors/README.md +++ b/docs/Extractors/README.md @@ -7,9 +7,9 @@ Extractors are the entry point of any process. To start a process, you must set $etl->extract($type, $source, $options); ``` - ## Available extractors types +* [Aggregator](Aggregator.md) * [Collection](Collection.md) * [CSV](Csv.md) * [Fixed Width](FixedWidth.md) diff --git a/docs/Extractors/Table.md b/docs/Extractors/Table.md index 23b85c6..ecbee18 100644 --- a/docs/Extractors/Table.md +++ b/docs/Extractors/Table.md @@ -7,38 +7,41 @@ Extracts data from a database table. $etl->extract($table, 'table_name', $options); ``` - ## Options ### Columns + Columns that will be extracted. If `null`, all columns of the table will be extracted. -| Type | Default value | -|----- | ------------- | -| array | `null` | +| Type | Default value | +|-------|---------------| +| array | `null` | To select which columns will be extracted, use an array with the columns list: + ```php $options = ['columns' => ['id', 'name', 'email']]; ``` ### Connection + Name of the database connection to use. -| Type | Default value | -|----- | ------------- | -| string | default | +| Type | Default value | +|--------|---------------| +| string | default | ```php $options = ['connection' => 'app']; ``` ### Where + Array of conditions, where `key` equals `value`. If you need more flexibility in the the query creation, you may use the [Query extractor](Query.md). -| Type | Default value | -|----- | ------------- | -| array | `[]` | +| Type | Default value | +|-------|---------------| +| array | `[]` | ```php $options = ['where' => ['status' => 'active']]; diff --git a/docs/Extractors/Xml.md b/docs/Extractors/Xml.md index f90e805..bf0756d 100644 --- a/docs/Extractors/Xml.md +++ b/docs/Extractors/Xml.md @@ -7,33 +7,38 @@ Extracts data from an XML file. $etl->extract($xml, 'path/to/file.xml', $options); ``` - ## Options ### Columns + Columns that will be extracted. If `null`, all tags and attributes within the loop path will be extracted. -| Type | Default value | -|----- | ------------- | -| array | `null` | +| Type | Default value | +|-------| ------------- | +| array | `null` | To select which columns will be extracted, use the path (without the loop path) of the value. Use `@` to select attributes: + ```php -$options = ['columns' => [ - 'id' => '/@id', - 'name' => '/profile/name', - 'email' => '/profile/email', -]]; +$options = [ + 'columns' => [ + 'id' => '/@id', + 'name' => '/profile/name', + 'email' => '/profile/email', + ] +]; ``` ### Loop + The path to loop through. -| Type | Default value | -|----- | ------------- | -| string | / | +| Type | Default value | +|--------|---------------| +| string | / | To select which columns will be extracted, use the path (without the loop path) of the value. Use `@` to select attributes: + ```php $options = ['loop' => '/users/user']; ``` diff --git a/docs/README.md b/docs/README.md index 8099941..e10aa7c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,6 +1,7 @@ * [Introduction](Introduction.md) * [Getting Started](GettingStarted.md) * [Extractors](Extractors/README.md) + * [Aggregator](Extractors/Aggregator.md) * [Collection](Extractors/Collection.md) * [CSV](Extractors/Csv.md) * [Fixed Width](Extractors/FixedWidth.md) diff --git a/docs/RunningProcesses.md b/docs/RunningProcesses.md index c62a8cc..8010096 100644 --- a/docs/RunningProcesses.md +++ b/docs/RunningProcesses.md @@ -9,6 +9,14 @@ $etl->extract(/* ... */) ->run(); ``` +To return the resulting data as an iterator (ex.: [Chaining ETL's usecase](../tests/UseCases/ChainingTest.php)), you may use the `toIterator` method: + +```php +$iterator = $etl->extract(/* ... */) + ->transform(/* ... */) + ->toIterator(); +``` + To run the process and return the resulting data as an array, you may use the `toArray` method: ```php @@ -16,4 +24,4 @@ $data = $etl->extract(/* ... */) ->transform(/* ... */) ->load(/* ... */) ->toArray(); -``` +``` \ No newline at end of file diff --git a/docs/img/etl.svg b/docs/img/etl.svg new file mode 100644 index 0000000..53524f1 --- /dev/null +++ b/docs/img/etl.svg @@ -0,0 +1,1203 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + Csv + + + + + + + + + + + + + + Extractor + + + + Pipeline + + + + + + + + + + + Loader + + + + + + + + + + + + load + + + + + Transformer + + + + transform + + + + + + + + + + + + + + + + + + + + ETL + + + extract + + + + + + + + + + + + + + + + + + + + + + run + + + + + + + + + + + + toIterator + + + + toArray + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Xml + + + + + + + + + + + + Aggregator + + + + + + + + + + + + + + ASSETS + COLORS + + + + + + + + + + + Json + + + + diff --git a/src/Etl.php b/src/Etl.php index 4a9dce5..f5c1dfc 100644 --- a/src/Etl.php +++ b/src/Etl.php @@ -113,7 +113,7 @@ public function toArray(): array } /** - * Consume the pipeline as a Generator + * Consume the pipeline as a Generator. * * @return \Generator */ diff --git a/src/Exception/IncompleteDataException.php b/src/Exception/IncompleteDataException.php new file mode 100644 index 0000000..575007d --- /dev/null +++ b/src/Exception/IncompleteDataException.php @@ -0,0 +1,15 @@ + + * @copyright Copyright (c) Wizacha + * @license MIT + */ + +namespace Wizaplace\Etl\Exception; + +class IncompleteDataException extends \Exception +{ +} diff --git a/src/Exception/InvalidOptionException.php b/src/Exception/InvalidOptionException.php new file mode 100644 index 0000000..17020a4 --- /dev/null +++ b/src/Exception/InvalidOptionException.php @@ -0,0 +1,15 @@ + + * @copyright Copyright (c) Wizacha + * @license MIT + */ + +namespace Wizaplace\Etl\Exception; + +class InvalidOptionException extends \Exception +{ +} diff --git a/src/Exception/UndefinedIndexException.php b/src/Exception/UndefinedIndexException.php new file mode 100644 index 0000000..49f0ee5 --- /dev/null +++ b/src/Exception/UndefinedIndexException.php @@ -0,0 +1,15 @@ + + * @copyright Copyright (c) Wizacha + * @license MIT + */ + +namespace Wizaplace\Etl\Exception; + +class UndefinedIndexException extends \Exception +{ +} diff --git a/src/Extractors/Aggregator.php b/src/Extractors/Aggregator.php new file mode 100644 index 0000000..a467c86 --- /dev/null +++ b/src/Extractors/Aggregator.php @@ -0,0 +1,183 @@ + + * @copyright Copyright (c) Wizacha + * @license MIT + */ + +namespace Wizaplace\Etl\Extractors; + +use Wizaplace\Etl\DirtyRow; +use Wizaplace\Etl\Exception\IncompleteDataException; +use Wizaplace\Etl\Exception\InvalidOptionException; +use Wizaplace\Etl\Exception\UndefinedIndexException; +use Wizaplace\Etl\Row; + +class Aggregator extends Extractor +{ + /** + * The matching key tuplet between iterators. + * + * @var string[] + */ + protected $index; + + /** + * Columns. + * + * @var string[] + */ + protected $columns; + + /** + * If set to true, + * will throw a MissingDataException if there is any incomplete rows remaining + * when all input iterators are fully consumed and closed. + * + * @var bool + */ + protected $strict = true; + + /** @var array[] */ + protected $data; + + /** + * Properties that can be set via the options method. + * + * @var array + */ + protected $availableOptions = [ + 'index', + 'columns', + 'strict', + ]; + + /** + * Properties that MUST be set via the options method. + * + * @var array + */ + protected $requiredOptions = [ + 'index', + 'columns', + ]; + + /** + * @return \Generator + * + * @throws IncompleteDataException + */ + public function extract(): \Generator + { + // consume input iterators + do { + foreach ($this->input as $iterator) { + if ( + ($line = $iterator->current()) + && ($row = $this->build($line)) + ) { + yield new Row($row); + } + $iterator->next(); + } + } while ( + $this->hasValidInput() + ); + + $incompletes = \count($this->data); + if ($this->strict && $incompletes) { + throw new IncompleteDataException("$incompletes rows"); + } + + // then yield the incomplete remaining rows + foreach ($this->data as $row) { + yield (new Row($row))->setIncomplete(); + } + } + + /** + * Accumulate row data and return when completed. + * + * @param mixed[] $line + * + * @return mixed[] + */ + protected function build(array $line): ?array + { + try { + $hash = $this->lineHash($line); + } catch (UndefinedIndexException $exception) { + return null; + } + + $this->data[$hash] = \array_merge( + $this->data[$hash] ?? [], + $line + ); + + if ($this->isCompleted($hash)) { + $row = $this->data[$hash]; + unset($this->data[$hash]); // free the RAM + + return $row; + } + + return null; + } + + /** + * Check if row is completed. + */ + protected function isCompleted(string $hash): bool + { + if (!\is_array($this->columns)) { + throw new InvalidOptionException('invalid columns', 2); + } + + foreach ($this->columns as $key) { + if (!\array_key_exists($key, $this->data[$hash])) { + return false; + } + } + + return true; + } + + /** + * Check if there is any opened iterators left. + */ + protected function hasValidInput(): bool + { + foreach ($this->input as $iterator) { + if ($iterator->valid()) { + return true; + } + } + + return false; + } + + /** + * calculate row hash key from specified index array. + */ + protected function lineHash(array $line): string + { + if (!\is_array($this->index)) { + throw new InvalidOptionException('Invalid index', 1); + } + + return \json_encode( + \array_map( + function (string $key) use ($line) { + if (!\array_key_exists($key, $line)) { + throw new UndefinedIndexException(); + } + + return $line[$key]; + }, + $this->index + ) + ); + } +} diff --git a/src/Row.php b/src/Row.php index 17cb5be..04c832b 100644 --- a/src/Row.php +++ b/src/Row.php @@ -27,6 +27,13 @@ class Row implements \ArrayAccess */ protected $discarded = false; + /** + * Flag the row as incomplete + * + * @var bool + */ + protected $incomplete = false; + /** * Create a new Row instance. * @@ -116,6 +123,24 @@ public function discarded(): bool return $this->discarded; } + /** + * Set the row dirty + */ + public function setIncomplete(): self + { + $this->incomplete = true; + + return $this; + } + + /** + * Check if the is dirty + */ + public function isIncomplete(): bool + { + return $this->incomplete; + } + /** * Dynamically retrieve attributes on the row. * diff --git a/tests/Extractors/AggregatorTest.php b/tests/Extractors/AggregatorTest.php new file mode 100644 index 0000000..e89b35c --- /dev/null +++ b/tests/Extractors/AggregatorTest.php @@ -0,0 +1,208 @@ + + * @copyright Copyright (c) Wizacha + * @copyright Copyright (c) Leonardo Marquine + * @license MIT + */ + +namespace Tests\Extractors; + +use Tests\TestCase; +use Wizaplace\Etl\DirtyRow; +use Wizaplace\Etl\Exception\IncompleteDataException; +use Wizaplace\Etl\Exception\InvalidOptionException; +use Wizaplace\Etl\Extractors\Aggregator; +use Wizaplace\Etl\Row; + +class AggregatorTest extends TestCase +{ + /** + * @test + * + * @dataProvider invalidOptionsProvider + **/ + public function invalid_index_options($invalidOptions, $exceptionCode) + { + $extractor = new Aggregator(); + + $extractor + ->input( + $this->iteratorsProvider()[0][0] // 👀️ + ) + ->options( + array_merge( + $invalidOptions, + ['strict' => false] + ) + ) + ; + + $this->expectException(InvalidOptionException::class); + $this->expectExceptionCode($exceptionCode); + iterator_to_array($extractor->extract()); + } + + /** + * @test + * + * @dataProvider iteratorsProvider + **/ + public function strict_index_matching($iterators) + { + $extractor = new Aggregator(); + + $extractor + ->input($iterators) + ->options( + [ + 'index' => ['email'], + 'columns' => ['name', 'twitter'], + 'strict' => true, + ] + ); + + $this->expectException(IncompleteDataException::class); + iterator_to_array($extractor->extract()); + } + + /** + * @test + * + * @dataProvider iteratorsProvider + **/ + public function unstrict_index_matching($iterators) + { + $extractor = new Aggregator(); + + $extractor + ->input($iterators) + ->options( + [ + 'index' => ['email'], + 'columns' => ['name', 'twitter'], + 'strict' => false, + ] + ); + + $actual = iterator_to_array($extractor->extract()); + $expected = [ + new Row([ + 'id' => 1, + 'name' => 'John Doe', + 'email' => 'johndoe@email.com', + 'twitter' => '@john', + ]), + new Row([ + 'id' => 2, + 'name' => 'Jane Doe', + 'email' => 'janedoe@email.com', + 'twitter' => '@jane', + ]), + ( + new Row([ + 'id' => 3, + 'name' => 'Incomplete', + 'email' => 'incomplete@dirtydata', + ]) + ) + ->setIncomplete(), + ]; + static::assertEquals($expected, $actual); + } + + /** + * @test + */ + public function big_shuffled_data_set() + { + $expected = 10 ** 4; + + $iterator = function (string $key, string $template) use ($expected): \Generator { + $ids = range(1, $expected); + shuffle($ids); + foreach ($ids as $id) { + yield [ + 'id' => $id, + "useseless_$id" => 'nevermind', + $key => sprintf($template, $id), + ]; + } + }; + + $extractor = new Aggregator(); + + $extractor + ->input( + [ + $iterator('email', 'user_%s@email.com'), + $iterator('name', 'name_%s'), + $iterator('info', 'info_%s'), + $iterator('stuff', 'stuff_%s'), + ] + ) + ->options( + [ + 'index' => ['id'], + 'columns' => ['email', 'name', 'info', 'stuff'], + ] + ); + + $actual = 0; + foreach ($extractor->extract() as $row) { + ++$actual; + } + static::assertEquals($expected, $actual); + } + + public function invalidOptionsProvider() + { + return [ + 'invalid index' => [ + [ + 'columns' => ['name', 'id'], + ], + 'error_code' => 1, + ], + 'invalid columns' => [ + [ + 'index' => ['email'], + ], + 'error_code' => 2, + ], + ]; + } + + public function iteratorsProvider(): array + { + $simpleDataSet = + [ + [ + $this->arrayToIterator([ + ['id' => 1, 'name' => 'John Doe', 'email' => 'johndoe@email.com'], + ['impossible error'], // should not happen + ['id' => 2, 'name' => 'Jane Doe', 'email' => 'janedoe@email.com'], + ['id' => 3, 'name' => 'Incomplete', 'email' => 'incomplete@dirtydata'], + ]), + $this->arrayToIterator([ + ['email' => 'janedoe@email.com', 'twitter' => '@jane'], + ['email' => 'johndoe@email.com', 'twitter' => '@john'], + ['impossible error'], // should not happen as well + ]), + ], + ] + ; + + return [$simpleDataSet]; + } + + public function arrayToIterator(array $lines): \Iterator + { + foreach ($lines as $line) { + yield $line; + } + } +} diff --git a/tests/Usecases/ChainingTest.php b/tests/Usecases/ChainingTest.php new file mode 100644 index 0000000..65c8f23 --- /dev/null +++ b/tests/Usecases/ChainingTest.php @@ -0,0 +1,118 @@ + + * @copyright Copyright (c) Wizacha + * @copyright Copyright (c) Leonardo Marquine + * @license MIT + */ + +namespace Tests\Usecases; + +use PHPUnit\Framework\TestCase; +use Wizaplace\Etl\Etl; +use Wizaplace\Etl\Extractors\Aggregator; +use Wizaplace\Etl\Extractors\Csv; +use Wizaplace\Etl\Transformers\ConvertCase; +use Wizaplace\Etl\Transformers\RenameColumns; + +class ChainingTest extends TestCase +{ + /** + * Merging data from two different source + * using a common matching column data + * + * @test + */ + public function merging_iterators_chaining() + { + // lazy get users + $usersIterator = (new Etl()) + ->extract( + new Csv(), + __DIR__ . '/data/users.csv', + ['delimiter' => ';'] + ) + ->toIterator(); + + // lazy get extended user info + $infosIterator = (new Etl()) + ->extract( + new Csv(), + __DIR__ . '/data/infos.csv', + ['delimiter' => ';'] + ) + ->transform( + new RenameColumns(), + [ + 'columns' => [ + 'courriel' => 'email' + ] + ] + ) + ->transform( + new ConvertCase(), + [ + 'columns' => ['email'], + 'mode' => 'lower', + ] + ) + ->toIterator(); + + // and finally lazy merge these iterators data + $usersInfosIterator = (new Etl()) + ->extract( + new Aggregator(), + [ + $usersIterator, + $infosIterator, + ], + [ + 'index' => ['email'], + 'columns' => [ + 'id', + 'email', + 'name', + 'age' + ], + 'strict' => false + ] + ) + ->toIterator(); + + $expected = [ + [ + 'id' => '1', + 'name' => 'John Doe', + 'email' => 'johndoe@email.com', + 'age' => '42', + ], + [ + 'id' => '2', + 'name' => 'Jane Doe', + 'email' => 'janedoe@email.com', + 'age' => '39', + ], + [ + 'id' => '3', + 'name' => 'Hello World', + 'email' => 'hello@world.com', + ], + [ + 'age' => '1000', + 'email' => 'glinglin@email.com', + ] + ]; + + $actual = iterator_to_array( + $usersInfosIterator + ); + + static::assertSame( + $expected, + $actual + ); + } +} diff --git a/tests/Usecases/data/infos.csv b/tests/Usecases/data/infos.csv new file mode 100644 index 0000000..dfdee69 --- /dev/null +++ b/tests/Usecases/data/infos.csv @@ -0,0 +1,4 @@ +"courriel";"age" +"JOHNDOE@EMAIL.COM";42 +"janedoe@email.com";39 +"glinglin@email.com";1000 \ No newline at end of file diff --git a/tests/Usecases/data/users.csv b/tests/Usecases/data/users.csv new file mode 100644 index 0000000..83a408f --- /dev/null +++ b/tests/Usecases/data/users.csv @@ -0,0 +1,4 @@ +"id";"name";"email" +1;"John Doe";"johndoe@email.com" +2;"Jane Doe";"janedoe@email.com" +3;"Hello World";"hello@world.com" \ No newline at end of file