# spark_streaming.py
# Forked from dogukannulu/kafka_spark_structured_streaming
import logging

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, FloatType, IntegerType, StringType
from pyspark.sql.functions import from_json, col

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s:%(funcName)s:%(levelname)s:%(message)s')
logger = logging.getLogger("spark_structured_streaming")
def create_spark_session():
    """
    Creates the Spark session with suitable configs.
    """
    spark = None
    try:
        # The session is created with the Cassandra and Kafka connector jars.
        # The versions must match the Spark/Scala build; suitable versions can
        # be found in the Maven repository.
        spark = SparkSession \
            .builder \
            .appName("SparkStructuredStreaming") \
            .config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.12:3.0.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0") \
            .config("spark.cassandra.connection.host", "cassandra") \
            .config("spark.cassandra.connection.port", "9042") \
            .config("spark.cassandra.auth.username", "cassandra") \
            .config("spark.cassandra.auth.password", "cassandra") \
            .getOrCreate()
        spark.sparkContext.setLogLevel("ERROR")
        logger.info("Spark session created successfully")
    except Exception as e:
        logger.error(f"Couldn't create the Spark session: {e}")
    return spark
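
# The connector jars can alternatively be supplied at submit time instead of
# via spark.jars.packages; an illustrative invocation (the versions are
# examples and must match your Spark/Scala build):
#
#   spark-submit \
#       --packages com.datastax.spark:spark-cassandra-connector_2.12:3.0.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0 \
#       spark_streaming.py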
def create_initial_dataframe(spark_session):
    """
    Reads the streaming data and creates the initial dataframe accordingly.
    """
    df = None
    try:
        # Subscribes to the random_names topic on the three-broker Kafka
        # cluster. (The original "delimeter" option was a typo and is not a
        # valid Kafka source option, so it has been dropped.)
        df = spark_session \
            .readStream \
            .format("kafka") \
            .option("kafka.bootstrap.servers", "kafka1:19092,kafka2:19093,kafka3:19094") \
            .option("subscribe", "random_names") \
            .option("startingOffsets", "earliest") \
            .load()
        logger.info("Initial dataframe created successfully")
    except Exception as e:
        logger.warning(f"Initial dataframe couldn't be created due to exception: {e}")
    return df
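
# The Kafka source produces a fixed schema; only the binary `value` column is
# used downstream:
#
#   key            binary
#   value          binary
#   topic          string
#   partition      int
#   offset         long
#   timestamp      timestamp
#   timestampType  int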
def create_final_dataframe(df):
    """
    Modifies the initial dataframe, and creates the final dataframe.
    """
    schema = StructType([
        StructField("full_name", StringType(), False),
        StructField("gender", StringType(), False),
        StructField("location", StringType(), False),
        StructField("city", StringType(), False),
        StructField("country", StringType(), False),
        StructField("postcode", IntegerType(), False),
        StructField("latitude", FloatType(), False),
        StructField("longitude", FloatType(), False),
        StructField("email", StringType(), False)
    ])

    # Decode the Kafka value from bytes to a JSON string, parse it against the
    # schema above, and flatten the parsed struct into top-level columns.
    df = df.selectExpr("CAST(value AS STRING)") \
        .select(from_json(col("value"), schema).alias("data")) \
        .select("data.*")
    logger.info("Final dataframe created successfully")
    return df
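
# For reference, a message on random_names is expected to look roughly like
# this (the field values below are purely illustrative):
#
#   {"full_name": "Jane Doe", "gender": "female", "location": "12 High St",
#    "city": "Leeds", "country": "United Kingdom", "postcode": 1234,
#    "latitude": 53.8, "longitude": -1.55, "email": "jane.doe@example.com"}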
def start_streaming(df):
    """
    Starts streaming to the spark_streaming.random_names table in Cassandra.
    """
    logger.info("Streaming is being started...")
    my_query = (df.writeStream
                  .format("org.apache.spark.sql.cassandra")
                  .outputMode("append")
                  .options(table="random_names", keyspace="spark_streaming")
                  # Structured Streaming requires a checkpoint directory for
                  # this sink; the path below is an example and any writable
                  # location will do.
                  .option("checkpointLocation", "/tmp/checkpoint")
                  .start())
    return my_query.awaitTermination()
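
# The target keyspace and table must exist before the stream starts. A
# matching definition would look like the CQL below (shown for reference; the
# replication settings and the choice of full_name as primary key are
# assumptions, not taken from this file):
#
#   CREATE KEYSPACE IF NOT EXISTS spark_streaming
#       WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
#   CREATE TABLE IF NOT EXISTS spark_streaming.random_names (
#       full_name TEXT PRIMARY KEY,
#       gender TEXT, location TEXT, city TEXT, country TEXT,
#       postcode INT, latitude FLOAT, longitude FLOAT, email TEXT);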
def write_streaming_data():
    spark = create_spark_session()
    df = create_initial_dataframe(spark)
    df_final = create_final_dataframe(df)
    start_streaming(df_final)


if __name__ == '__main__':
    write_streaming_data()