-
Notifications
You must be signed in to change notification settings - Fork 0
/
job_spark_glue.py
38 lines (33 loc) · 907 Bytes
/
job_spark_glue.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
# Bootstrap the Glue job: resolve the JOB_NAME argument and wire up the
# Spark and Glue contexts before any data movement happens.
params = getResolvedOptions(sys.argv, ['JOB_NAME'])
spark_ctx = SparkContext()
glue_ctx = GlueContext(spark_ctx)
spark = glue_ctx.spark_session
job = Job(glue_ctx)
job.init(params['JOB_NAME'], params)

# Load the ENEM 2020 microdata from the raw zone: a semicolon-delimited CSV
# with a header row; let Spark infer column types on read.
enem = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True, delimiter=";")
    .load("s3://datalake-adriano-523003372975/raw-data/data/MICRODADOS_ENEM_2020.csv")
)

# Persist the dataset to the consumer zone as Parquet, partitioned by the
# exam-state column, replacing any previous output.
enem.write \
    .mode("overwrite") \
    .format("parquet") \
    .partitionBy("SG_UF_PROVA") \
    .save("s3://datalake-adriano-523003372975/consumer-zone/")

# Signal successful completion so Glue bookmarks/state are committed.
job.commit()