Changes for MP8 V2 in 2024 #3

Open · wants to merge 9 commits into base: dev
20 changes: 12 additions & 8 deletions Docker/Dockerfile
@@ -6,24 +6,28 @@ ENV PYSPARK_PYTHON=python3

RUN apt clean && apt-get update && \
apt-get install -y --no-install-recommends build-essential \
expect git vim zip unzip wget openjdk-8-jdk wget maven sudo curl
expect git vim zip unzip wget openjdk-21-jdk maven sudo curl
RUN apt-get install -y python3 python3-pip


################################################################################
#################### Spark stuff ###########################################
################################################################################

# Download and install spark
RUN curl -s "https://dlcdn.apache.org/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz" | tar -xz -C /usr/local/ \
&& ln -s /usr/local/spark-3.4.0-bin-hadoop3 /usr/local/spark \
&& chmod a+rwx -R /usr/local/spark/

RUN cd /usr/local/ &&\
wget "https://dlcdn.apache.org/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz" &&\
tar -xvzf spark-3.5.1-bin-hadoop3.tgz && \
ln -s ./spark-3.5.1-bin-hadoop3 spark && \
rm -rf /usr/local/spark-3.5.1-bin-hadoop3.tgz && \
rm -rf /usr/local/spark/external && \
chmod a+rwx -R /usr/local/spark/

RUN pip3 install --upgrade pip
RUN pip3 install Cython
RUN pip3 install numpy

RUN echo "alias spark-submit='/usr/local/spark/bin/spark-submit'" >> ~/.bashrc && source ~/.bashrc
RUN echo "alias spark-submit='/usr/local/spark/bin/spark-submit'" >> ~/.bashrc

# Ensure spark log output is redirected to stderr
RUN cp /usr/local/spark/conf/log4j2.properties.template /usr/local/spark/conf/log4j2.properties
@@ -37,10 +41,10 @@ RUN chmod a+rwx -R /usr/local/spark/
# ARM64: /usr/lib/jvm/java-8-openjdk-arm64
# X86_64: /usr/lib/jvm/java-1.8.0-openjdk-amd64

RUN if [ "$(uname -m)" = "x86_64" ]; then export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64; else export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-arm64; fi \
RUN if [ "$(uname -m)" = "x86_64" ]; then export JAVA_HOME=/usr/lib/jvm/java-1.21.0-openjdk-amd64; else export JAVA_HOME=/usr/lib/jvm/java-21-openjdk-arm64; fi \
&& echo "export JAVA_HOME=$JAVA_HOME" >> /root/.bashrc

RUN if [ "$(uname -m)" = "x86_64" ]; then update-java-alternatives --set java-1.8.0-openjdk-amd64; else update-java-alternatives --set java-1.8.0-openjdk-arm64; fi
RUN if [ "$(uname -m)" = "x86_64" ]; then update-java-alternatives --set java-1.21.0-openjdk-amd64; else update-java-alternatives --set java-1.21.0-openjdk-arm64; fi

# [Optional] Set working path to /cs498, and run the following command to start the container with code:
# WORKDIR /cs498
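For reference, the architecture switch introduced above can be exercised outside the image; the JVM paths below are assumptions based on Ubuntu's openjdk-21-jdk package layout and should be verified inside the container:

```shell
# Sketch of the Dockerfile's arch-dependent JAVA_HOME selection.
# The JVM paths are assumed from Ubuntu's openjdk-21-jdk layout, not verified here.
if [ "$(uname -m)" = "x86_64" ]; then
  JAVA_HOME=/usr/lib/jvm/java-1.21.0-openjdk-amd64
else
  JAVA_HOME=/usr/lib/jvm/java-21-openjdk-arm64
fi
echo "$JAVA_HOME"
```

Note that `export JAVA_HOME=...` inside a single `RUN` does not persist to later layers, which is why the Dockerfile also appends the export to `/root/.bashrc`; an `ENV JAVA_HOME=...` instruction would be the layer-persistent alternative.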
2 changes: 1 addition & 1 deletion Docker/README.md
@@ -4,4 +4,4 @@ This is the Docker for MP8 SparkSQL.

## Log

Last updated in April 2023, by Ruijie Wang (ruijiew2@illinois.edu).
Last updated in February 2024, by Gautam Putcha (gputcha2@illinois.edu).
4 changes: 3 additions & 1 deletion README.md
@@ -3,8 +3,10 @@
This is the Java and Python template for MP8 SparkSQL.

## Log
Updated in Feb 2024, by Gautam Putcha ([email protected]).

Updated in May 2023, by Shujing Yang ([email protected]).

Updated in Feb 2022, by Yifan Chen ([email protected]).

Updated in April 2021, by Ruiyang Chen ([email protected]).
Updated in April 2021, by Ruiyang Chen ([email protected]).
13 changes: 12 additions & 1 deletion java/src/main/java/MP8_PartA.java
@@ -29,8 +29,9 @@ public static void main(String[] args) throws Exception {
.appName("MP8")
.getOrCreate();
JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
SQLContext sqlContext = new SQLContext(sc);
/*
* 1. Setup (16 points): write a function to load it in an RDD & DataFrame
* 1. Setup: write a function to load it in an RDD & DataFrame
*/

// RDD API
@@ -45,3 +46,13 @@
sc.stop();
}
}

/* Sample Output

+--------+
|count(1)|
+--------+
| 50013|
+--------+

*/
13 changes: 12 additions & 1 deletion java/src/main/java/MP8_PartB.java
@@ -29,6 +29,7 @@ public static void main(String[] args) throws Exception {
.appName("MP8")
.getOrCreate();
JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
SQLContext sqlContext = new SQLContext(sc);
/*
* 1. Setup: write a function to load it in an RDD & DataFrame
*/
@@ -42,7 +43,7 @@


/*
* 2. Counting (16 points): How many lines does the file contains? Answer
* 2. Counting: How many lines does the file contain? Answer
* this question via both the RDD API & Spark SQL
*/

@@ -52,3 +53,13 @@
sc.stop();
}
}

/* Sample Output

+--------+
|count(1)|
+--------+
| 50013|
+--------+

*/
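As a sanity check on the counting task, the logic can be sketched in plain Python over a made-up stand-in for the input (no Spark runtime involved; `sample_lines` is invented for illustration):

```python
# Plain-Python sketch of the counting task; sample_lines is a made-up
# stand-in for the gbooks input, not the real dataset.
sample_lines = [
    "ATTRIBUTE\t1986\t10\t10",
    "approximation\t1947\t2\t2",
    "agast_ADV\t1851\t1\t1",
]

# RDD-style: count the elements of the collection (what rdd.count() does).
rdd_count = len(sample_lines)

# SQL-style: SELECT count(*) yields the same cardinality.
sql_count = sum(1 for _ in sample_lines)

print(rdd_count, sql_count)
```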
13 changes: 12 additions & 1 deletion java/src/main/java/MP8_PartC.java
@@ -29,6 +29,7 @@ public static void main(String[] args) throws Exception {
.appName("MP8")
.getOrCreate();
JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
SQLContext sqlContext = new SQLContext(sc);
/*
* 1. Setup: write a function to load it in an RDD & DataFrame
*/
@@ -40,7 +41,7 @@
// Spark SQL - DataSet API

/*
* 3. Filtering (16 points) Count the number of appearances of word 'ATTRIBUTE'
* 3. Filtering: Count the number of appearances of the word 'ATTRIBUTE'
*/
// Dataset/Spark SQL API

@@ -49,3 +50,13 @@
sc.stop();
}
}

/* Sample Output

+--------+
|count(1)|
+--------+
| 11|
+--------+

*/
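The filtering step amounts to a predicate count; a plain-Python sketch over invented rows (not the real data):

```python
# Plain-Python sketch of the ATTRIBUTE filter; the rows are invented
# (word, year) pairs, not the real gbooks data.
rows = [("ATTRIBUTE", 1986), ("approximation", 1947), ("ATTRIBUTE", 1990)]

# Equivalent of: SELECT count(*) FROM gbooks WHERE word = 'ATTRIBUTE'
attribute_count = sum(1 for word, _year in rows if word == "ATTRIBUTE")
print(attribute_count)
```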
18 changes: 17 additions & 1 deletion java/src/main/java/MP8_PartD.java
@@ -9,6 +9,7 @@
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.Dataset;
import static org.apache.spark.sql.functions.*;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
@@ -19,6 +20,7 @@
import java.util.List;
import java.util.ArrayList;
import java.util.regex.Pattern;
import java.util.Map;
//import java.util.function.Function;

public final class MP8_PartD {
@@ -29,6 +31,7 @@ public static void main(String[] args) throws Exception {
.appName("MP8")
.getOrCreate();
JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
SQLContext sqlContext = new SQLContext(sc);
/*
* 1. Setup: write a function to load it in an RDD & DataFrame
*/
@@ -41,7 +44,7 @@


/*
* 4. MapReduce (16 points): List the top three words that have appeared in the
* 4. MapReduce: List the top three words that have appeared in the
* greatest number of years.
*/

@@ -52,3 +55,16 @@
sc.stop();
}
}

/* Sample Output (may look slightly different for you due to ties with other words)

+-------------+--------+
| word|count(1)|
+-------------+--------+
| ATTRIBUTE| 11|
|approximation| 4|
| agast_ADV| 4|
+-------------+--------+
only showing top 3 rows

*/
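The MapReduce step groups rows by word and counts distinct years per word; a plain-Python sketch with invented rows:

```python
from collections import defaultdict

# Plain-Python sketch of "top three words by number of distinct years";
# the (word, year) rows are invented sample data.
rows = [("a", 1900), ("a", 1901), ("a", 1901),
        ("b", 1900),
        ("c", 1900), ("c", 1950)]

years_per_word = defaultdict(set)
for word, year in rows:
    years_per_word[word].add(year)  # sets deduplicate repeated years

# Sort by distinct-year count, descending, and keep the top three.
top3 = sorted(years_per_word, key=lambda w: -len(years_per_word[w]))[:3]
print(top3)
```

Ties (here "a" and "c" both span two years) keep insertion order because Python's sort is stable, which mirrors the "may look slightly different for you due to ties" caveat in the sample output above.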
16 changes: 13 additions & 3 deletions java/src/main/java/MP8_PartE.java
@@ -29,6 +29,7 @@ public static void main(String[] args) throws Exception {
.appName("MP8")
.getOrCreate();
JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
SQLContext sqlContext = new SQLContext(sc);
/*
* 1. Setup: write a function to load it in an RDD & DataFrame
*/
@@ -42,14 +43,17 @@


/*
* 5. Joining (16 points): The following program construct a new dataframe out of
* 5. Joining: The following program constructs a new dataframe out of
* 'df' with a much smaller size, which will allow us to perform a JOIN operation.
* Do a self-join on 'df2' on lines with the same 'count1' values and see how many
* lines this JOIN could produce. Answer this question via the DataFrame API and Spark SQL API
*/

Dataset<Row> df2 = df.select("word", "year").distinct().limit(100);
df2.createOrReplaceTempView("gbooks2");
// Uncomment and use the below lines in your implementation as specified above

// Dataset<Row> df2 = df.select("word", "year").distinct().limit(100);
// df2.createOrReplaceTempView("gbooks2");

// Spark SQL API


@@ -58,3 +62,9 @@
sc.stop();
}
}

/* Sample Output

166

*/
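The size of a self-join on an equality key follows from per-key multiplicities: a key occurring n times contributes n*n joined rows. A plain-Python sketch with an invented key column:

```python
from collections import Counter

# Plain-Python sketch of self-join cardinality; `keys` is invented data
# standing in for the join column of df2, not the real dataset.
keys = [1, 1, 2, 3, 3, 3]

multiplicity = Counter(keys)  # key -> number of rows carrying it

# Each key with n rows joins with itself n*n times.
join_size = sum(n * n for n in multiplicity.values())
print(join_size)
```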
3 changes: 2 additions & 1 deletion java/src/main/java/MP8_PartF.java
@@ -23,6 +23,7 @@ public static void main(String[] args) throws IOException {
.appName("MP8")
.getOrCreate();
JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
SQLContext sqlContext = new SQLContext(sc);
/*
* 1. Setup: write a function to load it in an RDD & DataFrame
*/
@@ -32,7 +33,7 @@


/**
* 2. Frequency Increase (16 points): analyze the frequency increase of words starting from the year 1500 to the year 2000
* 2. Frequency Increase: analyze the frequency increase of words from the year 1500 to the year 2000
*/

}
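The frequency-increase task pairs each word's frequency in 1500 with its frequency in 2000, which is a join on the word in SQL terms; a plain-Python sketch with invented (word, year, frequency) triples:

```python
# Plain-Python sketch of the 1500 -> 2000 frequency increase; the rows
# are invented (word, year, frequency) triples, not the real dataset.
rows = [("raven", 1500, 3), ("raven", 2000, 10),
        ("quill", 1500, 5), ("quill", 2000, 4)]

freq = {(word, year): f for word, year, f in rows}
words = {word for word, _, _ in rows}

# Increase only for words present in both years (an inner join on word).
increase = {w: freq[(w, 2000)] - freq[(w, 1500)]
            for w in words
            if (w, 1500) in freq and (w, 2000) in freq}
print(increase)
```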
2 changes: 1 addition & 1 deletion python/MP8_PartB.py
@@ -19,7 +19,7 @@
# Spark SQL - DataFrame API

####
# 2. Counting (16 points): How many lines does the file contains? Answer this question via both RDD api & #Spark SQL
# 2. Counting: How many lines does the file contain? Answer this question via both the RDD API & Spark SQL
####

# Spark SQL
2 changes: 1 addition & 1 deletion python/MP8_PartC.py
@@ -18,7 +18,7 @@
# Spark SQL - DataFrame API

####
# 3. Filtering (16 points) Count the number of appearances of word 'ATTRIBUTE'
# 3. Filtering: Count the number of appearances of the word 'ATTRIBUTE'
####

# Spark SQL
4 changes: 3 additions & 1 deletion python/MP8_PartD.py
@@ -20,7 +20,7 @@


####
# 4. MapReduce (16 points): List the top three words that have appeared in the greatest number of years.
# 4. MapReduce: List the top three words that have appeared in the greatest number of years.
####

# Spark SQL
@@ -33,3 +33,5 @@
# | agast_ADV| 4|
# +-------------+--------+
# only showing top 3 rows

# The above output may look slightly different for you due to ties with other words
4 changes: 2 additions & 2 deletions python/MP8_PartE.py
@@ -19,7 +19,7 @@


####
# 5. Joining (16 points): The following program construct a new dataframe out of 'df' with a much smaller size.
# 5. Joining: The following program constructs a new dataframe out of 'df' with a much smaller size.
####

df2 = df.select("word", "year").distinct().limit(100)
@@ -29,5 +29,5 @@

# Spark SQL API

# output: 162
# output: 166

2 changes: 1 addition & 1 deletion python/MP8_PartF.py
@@ -20,7 +20,7 @@


###
# 2. Frequency Increase (16 points): analyze the frequency increase of words starting from the year 1500 to the year 2000
# 2. Frequency Increase: analyze the frequency increase of words from the year 1500 to the year 2000
###
# Spark SQL - DataFrame API
