From c5bac39417e9e1cbf114c0f66846c93bb27d0d5b Mon Sep 17 00:00:00 2001 From: Chris Hambridge Date: Fri, 17 Sep 2021 14:54:44 -0400 Subject: [PATCH] Create script to produce schema yaml file. * Use s3cmd to walk ceph files * Determine partitions by key structure (contains "=") * Download a parquet file from each leaf directory * Load parquet file to obtain columns * Produce table definition yaml file from extracted data --- .env.example | 7 ++ .gitignore | 3 + Pipfile | 16 ++++ Pipfile.lock | 222 ++++++++++++++++++++++++++++++++++++++++++++++ README.md | 50 +++++++++++ gen_table_defs.py | 116 ++++++++++++++++++++++++ 6 files changed, 414 insertions(+) create mode 100644 .env.example create mode 100644 Pipfile create mode 100644 Pipfile.lock create mode 100644 gen_table_defs.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..18c4434 --- /dev/null +++ b/.env.example @@ -0,0 +1,7 @@ +S3_ENDPOINT=endpoint +AWS_ACCESS_KEY=AWS_ACCESS_KEY +AWS_SECRET_KEY=AWS_SECRET_KEY +S3_BUCKET=bucket +S3_BUCKET_PREFIX=data +SCHEMA_NAME=myschema +OUTPUT_FILE=out.yaml \ No newline at end of file diff --git a/.gitignore b/.gitignore index b6e4761..3f8b489 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,6 @@ dmypy.json # Pyre type checker .pyre/ + +# output file +cost-management.yaml diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..fb50513 --- /dev/null +++ b/Pipfile @@ -0,0 +1,16 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +minio = "*" +pandas = "*" +pyarrow = "*" +s3cmd = "*" +pyyaml = "*" + +[dev-packages] + +[requires] +python_version = "3.9" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..f4981fb --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,222 @@ +{ + "_meta": { + "hash": { + "sha256": "3a9a9afd17f2e409039dbadd3cfc98cb9dc5b00388b557eb044cf0941c8f3b58" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.9" + }, + "sources": [ + { + "name": 
"pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "certifi": { + "hashes": [ + "sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee", + "sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8" + ], + "version": "==2021.5.30" + }, + "minio": { + "hashes": [ + "sha256:90b853a48422240028d0720668808d2cc7498b5f843271b533ba6fa91e3e3797", + "sha256:c78d5559b3c37b0b3a09983aade272d6ec2a437e02335a44949f30000e5e46a4" + ], + "index": "pypi", + "version": "==7.1.0" + }, + "numpy": { + "hashes": [ + "sha256:012426a41bc9ab63bb158635aecccc7610e3eff5d31d1eb43bc099debc979d94", + "sha256:06fab248a088e439402141ea04f0fffb203723148f6ee791e9c75b3e9e82f080", + "sha256:0eef32ca3132a48e43f6a0f5a82cb508f22ce5a3d6f67a8329c81c8e226d3f6e", + "sha256:1ded4fce9cfaaf24e7a0ab51b7a87be9038ea1ace7f34b841fe3b6894c721d1c", + "sha256:2e55195bc1c6b705bfd8ad6f288b38b11b1af32f3c8289d6c50d47f950c12e76", + "sha256:2ea52bd92ab9f768cc64a4c3ef8f4b2580a17af0a5436f6126b08efbd1838371", + "sha256:36674959eed6957e61f11c912f71e78857a8d0604171dfd9ce9ad5cbf41c511c", + "sha256:384ec0463d1c2671170901994aeb6dce126de0a95ccc3976c43b0038a37329c2", + "sha256:39b70c19ec771805081578cc936bbe95336798b7edf4732ed102e7a43ec5c07a", + "sha256:400580cbd3cff6ffa6293df2278c75aef2d58d8d93d3c5614cd67981dae68ceb", + "sha256:43d4c81d5ffdff6bae58d66a3cd7f54a7acd9a0e7b18d97abb255defc09e3140", + "sha256:50a4a0ad0111cc1b71fa32dedd05fa239f7fb5a43a40663269bb5dc7877cfd28", + "sha256:603aa0706be710eea8884af807b1b3bc9fb2e49b9f4da439e76000f3b3c6ff0f", + "sha256:6149a185cece5ee78d1d196938b2a8f9d09f5a5ebfbba66969302a778d5ddd1d", + "sha256:759e4095edc3c1b3ac031f34d9459fa781777a93ccc633a472a5468587a190ff", + "sha256:7fb43004bce0ca31d8f13a6eb5e943fa73371381e53f7074ed21a4cb786c32f8", + "sha256:811daee36a58dc79cf3d8bdd4a490e4277d0e4b7d103a001a4e73ddb48e7e6aa", + "sha256:8b5e972b43c8fc27d56550b4120fe6257fdc15f9301914380b27f74856299fea", + 
"sha256:99abf4f353c3d1a0c7a5f27699482c987cf663b1eac20db59b8c7b061eabd7fc", + "sha256:a0d53e51a6cb6f0d9082decb7a4cb6dfb33055308c4c44f53103c073f649af73", + "sha256:a12ff4c8ddfee61f90a1633a4c4afd3f7bcb32b11c52026c92a12e1325922d0d", + "sha256:a4646724fba402aa7504cd48b4b50e783296b5e10a524c7a6da62e4a8ac9698d", + "sha256:a76f502430dd98d7546e1ea2250a7360c065a5fdea52b2dffe8ae7180909b6f4", + "sha256:a9d17f2be3b427fbb2bce61e596cf555d6f8a56c222bd2ca148baeeb5e5c783c", + "sha256:ab83f24d5c52d60dbc8cd0528759532736b56db58adaa7b5f1f76ad551416a1e", + "sha256:aeb9ed923be74e659984e321f609b9ba54a48354bfd168d21a2b072ed1e833ea", + "sha256:c843b3f50d1ab7361ca4f0b3639bf691569493a56808a0b0c54a051d260b7dbd", + "sha256:cae865b1cae1ec2663d8ea56ef6ff185bad091a5e33ebbadd98de2cfa3fa668f", + "sha256:cc6bd4fd593cb261332568485e20a0712883cf631f6f5e8e86a52caa8b2b50ff", + "sha256:cf2402002d3d9f91c8b01e66fbb436a4ed01c6498fffed0e4c7566da1d40ee1e", + "sha256:d051ec1c64b85ecc69531e1137bb9751c6830772ee5c1c426dbcfe98ef5788d7", + "sha256:d6631f2e867676b13026e2846180e2c13c1e11289d67da08d71cacb2cd93d4aa", + "sha256:dbd18bcf4889b720ba13a27ec2f2aac1981bd41203b3a3b27ba7a33f88ae4827", + "sha256:df609c82f18c5b9f6cb97271f03315ff0dbe481a2a02e56aeb1b1a985ce38e60" + ], + "markers": "python_version >= '3.6'", + "version": "==1.19.5" + }, + "pandas": { + "hashes": [ + "sha256:0a643bae4283a37732ddfcecab3f62dd082996021b980f580903f4e8e01b3c5b", + "sha256:0de3ddb414d30798cbf56e642d82cac30a80223ad6fe484d66c0ce01a84d6f2f", + "sha256:19a2148a1d02791352e9fa637899a78e371a3516ac6da5c4edc718f60cbae648", + "sha256:21b5a2b033380adbdd36b3116faaf9a4663e375325831dac1b519a44f9e439bb", + "sha256:24c7f8d4aee71bfa6401faeba367dd654f696a77151a8a28bc2013f7ced4af98", + "sha256:26fa92d3ac743a149a31b21d6f4337b0594b6302ea5575b37af9ca9611e8981a", + "sha256:2860a97cbb25444ffc0088b457da0a79dc79f9c601238a3e0644312fcc14bf11", + "sha256:2b1c6cd28a0dfda75c7b5957363333f01d370936e4c6276b7b8e696dd500582a", + 
"sha256:2c2f7c670ea4e60318e4b7e474d56447cf0c7d83b3c2a5405a0dbb2600b9c48e", + "sha256:3be7a7a0ca71a2640e81d9276f526bca63505850add10206d0da2e8a0a325dae", + "sha256:4c62e94d5d49db116bef1bd5c2486723a292d79409fc9abd51adf9e05329101d", + "sha256:5008374ebb990dad9ed48b0f5d0038124c73748f5384cc8c46904dace27082d9", + "sha256:5447ea7af4005b0daf695a316a423b96374c9c73ffbd4533209c5ddc369e644b", + "sha256:573fba5b05bf2c69271a32e52399c8de599e4a15ab7cec47d3b9c904125ab788", + "sha256:5a780260afc88268a9d3ac3511d8f494fdcf637eece62fb9eb656a63d53eb7ca", + "sha256:70865f96bb38fec46f7ebd66d4b5cfd0aa6b842073f298d621385ae3898d28b5", + "sha256:731568be71fba1e13cae212c362f3d2ca8932e83cb1b85e3f1b4dd77d019254a", + "sha256:b61080750d19a0122469ab59b087380721d6b72a4e7d962e4d7e63e0c4504814", + "sha256:bf23a3b54d128b50f4f9d4675b3c1857a688cc6731a32f931837d72effb2698d", + "sha256:c16d59c15d946111d2716856dd5479221c9e4f2f5c7bc2d617f39d870031e086", + "sha256:c61c043aafb69329d0f961b19faa30b1dab709dd34c9388143fc55680059e55a", + "sha256:c94ff2780a1fd89f190390130d6d36173ca59fcfb3fe0ff596f9a56518191ccb", + "sha256:edda9bacc3843dfbeebaf7a701763e68e741b08fccb889c003b0a52f0ee95782", + "sha256:f10fc41ee3c75a474d3bdf68d396f10782d013d7f67db99c0efbfd0acb99701b" + ], + "index": "pypi", + "version": "==1.1.5" + }, + "pyarrow": { + "hashes": [ + "sha256:1832709281efefa4f199c639e9f429678286329860188e53beeda71750775923", + "sha256:1d9485741e497ccc516cb0a0c8f56e22be55aea815be185c3f9a681323b0e614", + "sha256:24e64ea33eed07441cc0e80c949e3a1b48211a1add8953268391d250f4d39922", + "sha256:2d26186ca9748a1fb89ae6c1fa04fb343a4279b53f118734ea8096f15d66c820", + "sha256:357605665fbefb573d40939b13a684c2490b6ed1ab4a5de8dd246db4ab02e5a4", + "sha256:4341ac0f552dc04c450751e049976940c7f4f8f2dae03685cc465ebe0a61e231", + "sha256:456a4488ae810a0569d1adf87dbc522bcc9a0e4a8d1809b934ca28c163d8edce", + "sha256:4d8adda1892ef4553c4804af7f67cce484f4d6371564e2d8374b8e2bc85293e2", + 
"sha256:53e550dec60d1ab86cba3afa1719dc179a8bc9632a0e50d9fe91499cf0a7f2bc", + "sha256:5c0d1b68e67bb334a5af0cecdf9b6a702aaa4cc259c5cbb71b25bbed40fcedaf", + "sha256:601b0aabd6fb066429e706282934d4d8d38f53bdb8d82da9576be49f07eedf5c", + "sha256:64f30aa6b28b666a925d11c239344741850eb97c29d3aa0f7187918cf82494f7", + "sha256:6e1f0e4374061116f40e541408a8a170c170d0a070b788717e18165ebfdd2a54", + "sha256:6e937ce4a40ea0cc7896faff96adecadd4485beb53fbf510b46858e29b2e75ae", + "sha256:7560332e5846f0e7830b377c14c93624e24a17f91c98f0b25dafb0ca1ea6ba02", + "sha256:7c4edd2bacee3eea6c8c28bddb02347f9d41a55ec9692c71c6de6e47c62a7f0d", + "sha256:99c8b0f7e2ce2541dd4c0c0101d9944bb8e592ae3295fe7a2f290ab99222666d", + "sha256:9e04d3621b9f2f23898eed0d044203f66c156d880f02c5534a7f9947ebb1a4af", + "sha256:b1453c2411b5062ba6bf6832dbc4df211ad625f678c623a2ee177aee158f199b", + "sha256:b3115df938b8d7a7372911a3cb3904196194bcea8bb48911b4b3eafee3ab8d90", + "sha256:b6387d2058d95fa48ccfedea810a768187affb62f4a3ef6595fa30bf9d1a65cf", + "sha256:bbe2e439bec2618c74a3bb259700c8a7353dc2ea0c5a62686b6cf04a50ab1e0d", + "sha256:c3fc856f107ca2fb3c9391d7ea33bbb33f3a1c2b4a0e2b41f7525c626214cc03", + "sha256:c5493d2414d0d690a738aac8dd6d38518d1f9b870e52e24f89d8d7eb3afd4161", + "sha256:e9ec80f4a77057498cf4c5965389e42e7f6a618b6859e6dd615e57505c9167a6", + "sha256:ed135a99975380c27077f9d0e210aea8618ed9fadcec0e71f8a3190939557afe", + "sha256:f4db312e9ba80e730cefcae0a05b63ea5befc7634c28df56682b628ad8e1c25c", + "sha256:ff21711f6ff3b0bc90abc8ca8169e676faeb2401ddc1a0bc1c7dc181708a3406" + ], + "index": "pypi", + "version": "==5.0.0" + }, + "python-dateutil": { + "hashes": [ + "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", + "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", + "version": "==2.8.2" + }, + "python-magic": { + "hashes": [ + 
"sha256:4fec8ee805fea30c07afccd1592c0f17977089895bdfaae5fec870a84e997626", + "sha256:de800df9fb50f8ec5974761054a708af6e4246b03b4bdaee993f948947b0ebcf" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==0.4.24" + }, + "pytz": { + "hashes": [ + "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da", + "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798" + ], + "version": "==2021.1" + }, + "pyyaml": { + "hashes": [ + "sha256:08682f6b72c722394747bddaf0aa62277e02557c0fd1c42cb853016a38f8dedf", + "sha256:0f5f5786c0e09baddcd8b4b45f20a7b5d61a7e7e99846e3c799b05c7c53fa696", + "sha256:129def1b7c1bf22faffd67b8f3724645203b79d8f4cc81f674654d9902cb4393", + "sha256:294db365efa064d00b8d1ef65d8ea2c3426ac366c0c4368d930bf1c5fb497f77", + "sha256:3b2b1824fe7112845700f815ff6a489360226a5609b96ec2190a45e62a9fc922", + "sha256:3bd0e463264cf257d1ffd2e40223b197271046d09dadf73a0fe82b9c1fc385a5", + "sha256:4465124ef1b18d9ace298060f4eccc64b0850899ac4ac53294547536533800c8", + "sha256:49d4cdd9065b9b6e206d0595fee27a96b5dd22618e7520c33204a4a3239d5b10", + "sha256:4e0583d24c881e14342eaf4ec5fbc97f934b999a6828693a99157fde912540cc", + "sha256:5accb17103e43963b80e6f837831f38d314a0495500067cb25afab2e8d7a4018", + "sha256:607774cbba28732bfa802b54baa7484215f530991055bb562efbed5b2f20a45e", + "sha256:6c78645d400265a062508ae399b60b8c167bf003db364ecb26dcab2bda048253", + "sha256:72a01f726a9c7851ca9bfad6fd09ca4e090a023c00945ea05ba1638c09dc3347", + "sha256:74c1485f7707cf707a7aef42ef6322b8f97921bd89be2ab6317fd782c2d53183", + "sha256:895f61ef02e8fed38159bb70f7e100e00f471eae2bc838cd0f4ebb21e28f8541", + "sha256:8c1be557ee92a20f184922c7b6424e8ab6691788e6d86137c5d93c1a6ec1b8fb", + "sha256:bb4191dfc9306777bc594117aee052446b3fa88737cd13b7188d0e7aa8162185", + "sha256:bfb51918d4ff3d77c1c856a9699f8492c612cde32fd3bcd344af9be34999bfdc", + "sha256:c20cfa2d49991c8b4147af39859b167664f2ad4561704ee74c1de03318e898db", + 
"sha256:cb333c16912324fd5f769fff6bc5de372e9e7a202247b48870bc251ed40239aa", + "sha256:d2d9808ea7b4af864f35ea216be506ecec180628aced0704e34aca0b040ffe46", + "sha256:d483ad4e639292c90170eb6f7783ad19490e7a8defb3e46f97dfe4bacae89122", + "sha256:dd5de0646207f053eb0d6c74ae45ba98c3395a571a2891858e87df7c9b9bd51b", + "sha256:e1d4970ea66be07ae37a3c2e48b5ec63f7ba6804bdddfdbd3cfd954d25a82e63", + "sha256:e4fac90784481d221a8e4b1162afa7c47ed953be40d31ab4629ae917510051df", + "sha256:fa5ae20527d8e831e8230cbffd9f8fe952815b2b7dae6ffec25318803a7528fc", + "sha256:fd7f6999a8070df521b6384004ef42833b9bd62cfee11a09bda1079b4b704247", + "sha256:fdc842473cd33f45ff6bce46aea678a54e3d21f1b61a7750ce3c498eedfe25d6", + "sha256:fe69978f3f768926cfa37b867e3843918e012cf83f680806599ddce33c2c68b0" + ], + "index": "pypi", + "version": "==5.4.1" + }, + "s3cmd": { + "hashes": [ + "sha256:49cd23d516b17974b22b611a95ce4d93fe326feaa07320bd1d234fed68cbccfa", + "sha256:966b0a494a916fc3b4324de38f089c86c70ee90e8e1cae6d59102103a4c0cc03" + ], + "index": "pypi", + "version": "==2.1.0" + }, + "six": { + "hashes": [ + "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", + "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", + "version": "==1.16.0" + }, + "urllib3": { + "hashes": [ + "sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4", + "sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", + "version": "==1.26.6" + } + }, + "develop": {} +} diff --git a/README.md b/README.md index e04a692..c48cef6 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,52 @@ # ceph_trino_schema_gen Generate Trino schema from an Ceph S3 bucket + + +# Getting Started + +Start by cloning the repository: +``` +git clone 
https://github.com/chambridge/ceph_trino_schema_gen.git +``` + +Switch to the new directory: +``` +cd ceph_trino_schema_gen +``` + +Create a Python 3.9 virtual environment: +``` +pipenv --python 3.9 +pipenv install +``` + +Copy and configure the connection to your Ceph bucket: +``` +cp .env.example .env +``` + +Enter the virtual env: +``` +pipenv shell +``` + +Execute the python script: +``` +python gen_table_defs.py +``` + +_Note:_ You may encounter the following error with Python 3.9 if the dependency has not been fixed yet: +``` +AttributeError: 'xml.etree.ElementTree.Element' object has no attribute 'getchildren' +``` + +In order to resolve the problem you need to remove the `.getchildren()` method calls in *s3cmd* locally. +To do this, find the location of `s3cmd` in your virtual environment: +``` +which s3cmd +``` +Open a terminal in the python directory listed. Change to the S3 site-package: +``` +cd lib/python3.9/site-packages/S3/ +``` +Remove all occurrences of `.getchildren()` from the code. Now the python script should run properly. 
\ No newline at end of file
diff --git a/gen_table_defs.py b/gen_table_defs.py
new file mode 100644
index 0000000..94cd2b6
--- /dev/null
+++ b/gen_table_defs.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python
+"""Generate a Trino table-definition YAML file from a Ceph S3 bucket.
+
+The bucket is walked with the s3cmd S3 client.  A prefix whose child
+"directories" contain "=" is treated as a partitioned table; one object found
+beneath it is downloaded and read with pyarrow to obtain the column names.
+The collected table definitions are written to OUTPUT_FILE as YAML.
+
+Configuration comes from environment variables (see .env.example).
+"""
+import io
+import os
+import tempfile
+import pandas as pd
+import pyarrow.parquet as pq
+import yaml
+
+from S3.S3 import S3
+from S3.S3Uri import S3Uri
+from S3.Config import Config
+from S3.Exceptions import S3Error, S3DownloadError
+from S3.Utils import formatSize, formatDateTime
+
+import base64
+# Compatibility shim: base64.encodestring was removed in Python 3.9.
+# Presumably s3cmd still calls it; confirm before dropping this alias.
+base64.encodestring = base64.encodebytes
+
+
+S3_ENDPOINT = os.getenv("S3_ENDPOINT")
+AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
+AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")
+S3_BUCKET = os.getenv("S3_BUCKET")
+S3_BUCKET_PREFIX = os.getenv("S3_BUCKET_PREFIX")
+# Fall back to the platform temp directory so downloads never target "None/".
+TMPDIR = os.getenv("TMPDIR") or tempfile.gettempdir()
+SCHEMA_NAME = os.getenv("SCHEMA_NAME")
+OUTPUT_FILE = os.getenv("OUTPUT_FILE")
+
+cfg = Config()
+cfg.access_key = AWS_ACCESS_KEY
+cfg.secret_key = AWS_SECRET_KEY
+cfg.host_base = S3_ENDPOINT
+cfg.host_bucket = f"{S3_ENDPOINT}/{S3_BUCKET}/"
+s3 = S3(cfg)
+uri = S3Uri(f"s3://{S3_BUCKET}/{S3_BUCKET_PREFIX}")
+bucket = uri.bucket()
+prefix = uri.object()
+data_dict = {}
+partitions_str = "partitions"
+schema_def = {"schema": f"{SCHEMA_NAME}", "tables": []}
+
+
+def download_file(s3, uri, destination):
+    """Download the object at *uri* into the local file *destination*.
+
+    Args:
+        s3: s3cmd ``S3`` client used for the transfer.
+        uri: ``S3Uri`` of the object to fetch.
+        destination: Local path to write; opened in "wb" mode.
+
+    Raises:
+        S3DownloadError, S3Error: If the transfer fails.  The local file is
+            deleted before the exception propagates.
+    """
+    try:
+        with io.open(destination, mode="wb") as dst_stream:
+            dst_stream.stream_name = destination
+            s3.object_get(uri, dst_stream, destination, start_position=0)
+    except (S3DownloadError, S3Error) as e:
+        print(u"Download of '%s' failed (Reason: %s)" % (destination, e))
+        print(u"object_get failed for '%s', deleting..." % (destination,))
+        os.unlink(destination)
+        # Propagate every failure: the caller reads the file next, so a
+        # swallowed error would only resurface as a missing-file crash.
+        raise
+
+
+def list_bucket(cfg, s3, bucket, prefix, data_dict, location=None):
+    """Recursively walk *prefix* and record partitioned tables in *data_dict*.
+
+    A child "directory" whose name contains "=" is treated as a partition.
+    The prefix directly above the first partition level becomes the table key
+    in *data_dict*; its value holds "partitions" (partition names in discovery
+    order) and, when an object was found, "file" (the key of that object).
+
+    Args:
+        cfg: s3cmd ``Config``; only ``cfg.limit`` is read here.
+        s3: s3cmd ``S3`` client used for listing.
+        bucket: Name of the bucket to list.
+        prefix: Key prefix to list; one trailing "*" is dropped.
+        data_dict: Mapping of table prefix to table data, updated in place.
+        location: Table key inherited from the parent call, if any.
+
+    Raises:
+        S3Error: If the listing fails.
+    """
+    if prefix.endswith("*"):
+        prefix = prefix[:-1]
+    try:
+        response = s3.bucket_list(bucket, prefix=prefix, limit=cfg.limit)
+    except S3Error as e:
+        if e.info["Code"] in S3.codes:
+            print(S3.codes[e.info["Code"]] % bucket)
+        raise
+
+    table_name = location
+    # Loop invariant: no "=" in the prefix means no partition level above us.
+    first_partition = "=" not in prefix
+    for cprefix in response["common_prefixes"]:
+        new_dir = cprefix["Prefix"].replace(prefix, "")
+
+        if "=" in new_dir:
+            if first_partition:
+                table_name = prefix
+                # setdefault keeps what sibling partition values already
+                # recorded instead of resetting the table on every sibling.
+                data_dict.setdefault(table_name, {partitions_str: []})
+
+            data_dict[table_name][partitions_str].append(new_dir.split("=")[0])
+
+        list_bucket(cfg, s3, bucket, cprefix["Prefix"], data_dict, table_name)
+
+    if table_name:
+        table = data_dict[table_name]
+        # dict.fromkeys de-duplicates while keeping discovery order; a set
+        # would scramble the partition column order between runs.
+        table[partitions_str] = list(dict.fromkeys(table[partitions_str]))
+
+        # The first listed object (if any) becomes the table's sample file.
+        for obj in response["list"]:
+            table["file"] = obj["Key"]
+            break
+
+
+def main():
+    """Walk the bucket, sample one file per table and write the YAML output."""
+    list_bucket(cfg, s3, bucket, prefix, data_dict)
+
+    for table_location, tdata in data_dict.items():
+        filepath = tdata.get("file")
+        if filepath is None:
+            # Partition directories without objects: no schema to read.
+            print(u"No data file found under '%s', skipping..." % (table_location,))
+            continue
+        if table_location.endswith("/"):
+            table_location = table_location[:-1]
+        table_name = table_location.replace(S3_BUCKET_PREFIX, "").replace("/", "_")
+        file_name = filepath.split("/")[-1]
+        destination = f"{TMPDIR}/{file_name}"
+        file_uri = S3Uri(f"s3://{S3_BUCKET}/{filepath}")
+        download_file(s3, file_uri, destination)
+        try:
+            df = pq.read_table(destination).to_pandas()
+        finally:
+            # The sample is only needed for its column names.
+            os.unlink(destination)
+        table_dict = {
+            "name": table_name,
+            "location": f"s3://{S3_BUCKET}/{table_location}",
+            "format": "parquet",
+            "columns": list(df.columns),
+            "partitions": tdata[partitions_str],
+        }
+        schema_def["tables"].append(table_dict)
+
+    with open(f"./{OUTPUT_FILE}", "w") as file:
+        yaml.dump(schema_def, file)
+
+
+if __name__ == "__main__":
+    main()