diff --git a/src/Iterator/CSVIterator.php b/src/Iterator/CSVIterator.php index 6e6cae8..8b2dd09 100644 --- a/src/Iterator/CSVIterator.php +++ b/src/Iterator/CSVIterator.php @@ -8,6 +8,7 @@ use BenTools\ETL\Normalizer\NumericStringToNumberNormalizer; use BenTools\ETL\Normalizer\ValueNormalizerInterface; use IteratorAggregate; +use SplFileObject; use Symfony\Component\OptionsResolver\OptionsResolver; use Traversable; @@ -61,39 +62,88 @@ public function __construct( $this->options = $resolver->resolve($options); } + /** + * @param array $data + * @param list|null $columns + * + * @return array|string[] + */ + private function extract(array $data, ?array $columns): array + { + if ($this->options['normalizers']) { + array_walk($data, function (&$value) { + foreach ($this->options['normalizers'] as $normalizer) { + $value = $normalizer->normalize($value); + } + + return $value; + }); + } + + return !empty($columns) ? self::combine($columns, $data) : $data; + } + public function getIterator(): Traversable { + if ($this->text instanceof SplFileObject) { + return $this->iterateFromFile($this->text); + } + + return $this->iterateFromContent($this->text); + } + + /** + * @return Traversable + */ + private function iterateFromFile(SplFileObject $file): Traversable + { + $flags = [SplFileObject::READ_CSV, $file->getFlags()]; + $file->setFlags(array_reduce($flags, fn ($a, $b) => $a | $b, 0)); $columns = $this->options['columns']; if ('auto' === $columns) { $columns = null; } - foreach ($this->text as $r => $row) { - $fields = str_getcsv( - $row, + while (!$file->eof()) { + $fields = $file->fgetcsv( $this->options['delimiter'], $this->options['enclosure'], $this->options['escapeString'], ); - if (0 === $r && 'auto' === $this->options['columns']) { + if ([null] === $fields) { + continue; + } + if ('auto' === $this->options['columns'] && 0 === $file->key()) { $columns ??= $fields; continue; } - if ($this->options['normalizers']) { - array_walk($fields, function (&$value) { - foreach ($this->options['normalizers'] as $normalizer) { - $value = $normalizer->normalize($value); - } - - return $value; - }); - } + yield $this->extract($fields, $columns); + } + } - if (!empty($columns)) { - yield self::combine($columns, $fields); + /** + * @param Traversable $content + * + * @return Traversable + */ + private function iterateFromContent(Traversable $content): Traversable + { + $columns = $this->options['columns']; + if ('auto' === $columns) { + $columns = null; + } + foreach ($content as $r => $row) { + $fields = str_getcsv( + $row, + $this->options['delimiter'], + $this->options['enclosure'], + $this->options['escapeString'], + ); + if ('auto' === $this->options['columns'] && 0 === $r) { + $columns ??= $fields; continue; } - yield $fields; + yield $this->extract($fields, $columns); } } diff --git a/tests/Unit/Iterator/CSVIteratorTest.php b/tests/Unit/Iterator/CSVIteratorTest.php index af4c203..50c53f7 100644 --- a/tests/Unit/Iterator/CSVIteratorTest.php +++ b/tests/Unit/Iterator/CSVIteratorTest.php @@ -6,14 +6,14 @@ use BenTools\ETL\Iterator\CSVIterator; use BenTools\ETL\Iterator\StrTokIterator; +use SplFileObject; use function dirname; use function expect; use function Safe\file_get_contents; -it('iterates over CSV data', function () { - $content = file_get_contents(dirname(__DIR__, 2).'/Data/10-biggest-cities.csv'); - $rows = [...new CSVIterator(new StrTokIterator($content))]; +it('iterates over CSV data', function (CSVIterator $iterator) { + $rows = [...$iterator]; expect($rows)->toHaveCount(11) ->and($rows[0])->toBe([ @@ -30,11 +30,14 @@ 3 => 'Asia', 4 => 13929286, ]); +})->with(function () { + $filename = dirname(__DIR__, 2).'/Data/10-biggest-cities.csv'; + yield 'string content' => new CSVIterator(new StrTokIterator(file_get_contents($filename))); + yield 'file' => new CSVIterator(new SplFileObject($filename)); }); -it('can make columns automatically', function () { - $content = file_get_contents(dirname(__DIR__, 2).'/Data/10-biggest-cities.csv'); - $rows = [...new CSVIterator(new StrTokIterator($content), ['columns' => 'auto'])]; +it('can make columns automatically', function (CSVIterator $iterator) { + $rows = [...$iterator]; expect($rows)->toHaveCount(10) ->and($rows[0])->toBe([ @@ -51,21 +54,14 @@ 'continent' => 'Asia', 'population' => 13929286, ]); +})->with(function () { + $filename = dirname(__DIR__, 2).'/Data/10-biggest-cities.csv'; + yield 'string content' => new CSVIterator(new StrTokIterator(file_get_contents($filename)), ['columns' => 'auto']); + yield 'file' => new CSVIterator(new SplFileObject($filename), ['columns' => 'auto']); }); -it('can map user-defined columns', function () { - $content = file_get_contents(dirname(__DIR__, 2).'/Data/10-biggest-cities.csv'); - $rows = [ - ...new CSVIterator(new StrTokIterator($content), [ - 'columns' => [ - 'cityEnglishName', - 'cityLocalName', - 'countryIsoCode', - 'continent', - 'population', - ], - ]), - ]; +it('can map user-defined columns', function (CSVIterator $iterator) { + $rows = [...$iterator]; expect($rows[1])->toBe([ 'cityEnglishName' => 'New York', @@ -81,22 +77,21 @@ 'continent' => 'Asia', 'population' => 13929286, ]); +})->with(function () { + $columns = [ + 'cityEnglishName', + 'cityLocalName', + 'countryIsoCode', + 'continent', + 'population', + ]; + $filename = dirname(__DIR__, 2).'/Data/10-biggest-cities.csv'; + yield 'string content' => new CSVIterator(new StrTokIterator(file_get_contents($filename)), ['columns' => $columns]); + yield 'file' => new CSVIterator(new SplFileObject($filename), ['columns' => $columns]); }); -it('adds fields when the row has not enough columns', function () { - $content = file_get_contents(dirname(__DIR__, 2).'/Data/10-biggest-cities.csv'); - $rows = [ - ...new CSVIterator(new StrTokIterator($content), [ - 'columns' => [ - 'cityEnglishName', - 'cityLocalName', - 'countryIsoCode', - 'continent', - 'population', - 'misc', - ], - ]), - ]; +it('adds fields when the row has not enough columns', function (CSVIterator $iterator) { + $rows = [...$iterator]; expect($rows[1])->toBe([ 'cityEnglishName' => 'New York', @@ -114,20 +109,22 @@ 'population' => 13929286, 'misc' => null, ]); +})->with(function () { + $columns = [ + 'cityEnglishName', + 'cityLocalName', + 'countryIsoCode', + 'continent', + 'population', + 'misc', + ]; + $filename = dirname(__DIR__, 2).'/Data/10-biggest-cities.csv'; + yield 'string content' => new CSVIterator(new StrTokIterator(file_get_contents($filename)), ['columns' => $columns]); + yield 'file' => new CSVIterator(new SplFileObject($filename), ['columns' => $columns]); }); -it('removes extra data whenever there are more fields than columns', function () { - $content = file_get_contents(dirname(__DIR__, 2).'/Data/10-biggest-cities.csv'); - $rows = [ - ...new CSVIterator(new StrTokIterator($content), [ - 'columns' => [ - 'cityEnglishName', - 'cityLocalName', - 'countryIsoCode', - 'continent', - ], - ]), - ]; +it('removes extra data whenever there are more fields than columns', function (CSVIterator $iterator) { + $rows = [...$iterator]; expect($rows[1])->toBe([ 'cityEnglishName' => 'New York', @@ -141,4 +138,14 @@ 'countryIsoCode' => 'JP', 'continent' => 'Asia', ]); +})->with(function () { + $columns = [ + 'cityEnglishName', + 'cityLocalName', + 'countryIsoCode', + 'continent', + ]; + $filename = dirname(__DIR__, 2).'/Data/10-biggest-cities.csv'; + yield 'string content' => new CSVIterator(new StrTokIterator(file_get_contents($filename)), ['columns' => $columns]); + yield 'file' => new CSVIterator(new SplFileObject($filename), ['columns' => $columns]); });