From 3140a1b5412405084bb63379fc7423483a385a63 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 10 Nov 2023 17:56:48 +0000 Subject: [PATCH] eda update 2 --- eda_2021.ipynb | 724 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 703 insertions(+), 21 deletions(-) diff --git a/eda_2021.ipynb b/eda_2021.ipynb index 8bfd069..9c9fdf0 100644 --- a/eda_2021.ipynb +++ b/eda_2021.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "201288da-86ac-4db0-a56b-4d75e26e1753", "metadata": {}, "outputs": [], @@ -16,10 +16,105 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "3443992c-4530-48f2-a133-fb1dacf4b84f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[('spark.stage.maxConsecutiveAttempts', '10'),\n", + " ('spark.dynamicAllocation.minExecutors', '1'),\n", + " ('spark.eventLog.enabled', 'true'),\n", + " ('spark.submit.pyFiles',\n", + " '/root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,/root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,/root/.ivy2/jars/com.typesafe_config-1.4.2.jar,/root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,/root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,/root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,/root/.ivy2/jars/com.navigamez_greex-1.0.jar,/root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,/root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,/root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,/root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,/root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,/root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,/root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,/root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,/root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,/root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,/root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,/root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,/root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,/root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,/root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,/root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,/root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,/root/.ivy2/jars/com.google.api_gax-2.20.1.jar,/root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,/root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,/root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,/root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,/root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,/root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,/root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,/root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,/root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,/root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,/root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,/root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,/root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,/root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,/root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,/root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,/root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,/root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,/root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,/root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,/root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,/root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", + " ('spark.dataproc.sql.joinConditionReorder.enabled', 'true'),\n", + " ('spark.kryoserializer.buffer.max', '2000M'),\n", + " ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),\n", + " ('spark.dataproc.sql.local.rank.pushdown.enabled', 'true'),\n", + " ('spark.driver.maxResultSize', '0'),\n", + " ('spark.yarn.unmanagedAM.enabled', 'true'),\n", + " ('spark.sql.autoBroadcastJoinThreshold', '43m'),\n", + " ('spark.ui.filters',\n", + " 'org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter'),\n", + " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',\n", + " 'http://hub-msca-bdp-dphub-students-abejburton-m:8088/proxy/application_1699633504496_0001'),\n", + " ('spark.metrics.namespace',\n", + " 'app_name:${spark.app.name}.app_id:${spark.app.id}'),\n", + " ('spark.executor.memory', '4g'),\n", + " ('spark.dataproc.sql.optimizer.leftsemijoin.conversion.enabled', 'true'),\n", + " ('spark.hadoop.hive.execution.engine', 'mr'),\n", + " ('spark.executor.id', 'driver'),\n", + " ('spark.app.startTime', '1699634010505'),\n", + " ('spark.driver.port', '35733'),\n", + " ('spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version', '2'),\n", + " ('spark.dynamicAllocation.maxExecutors', '10000'),\n", + " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',\n", + " 'hub-msca-bdp-dphub-students-abejburton-m'),\n", + " ('spark.yarn.dist.pyFiles',\n", + " 'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", + " ('spark.dataproc.metrics.listener.metrics.collector.hostname',\n", + " 'hub-msca-bdp-dphub-students-abejburton-m'),\n", + " ('spark.yarn.am.attemptFailuresValidityInterval', '1h'),\n", + " ('spark.app.name', 'Spark Updated Conf'),\n", + " ('spark.sql.catalogImplementation', 'hive'),\n", + " ('spark.executorEnv.OPENBLAS_NUM_THREADS', '1'),\n", + " ('spark.yarn.secondary.jars',\n", + " 'com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,com.typesafe_config-1.4.2.jar,org.rocksdb_rocksdbjni-6.29.5.jar,com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,com.github.universal-automata_liblevenshtein-3.0.0.jar,com.google.cloud_google-cloud-storage-2.16.0.jar,com.navigamez_greex-1.0.jar,com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,it.unimi.dsi_fastutil-7.0.12.jar,org.projectlombok_lombok-1.16.8.jar,com.google.guava_guava-31.1-jre.jar,com.google.guava_failureaccess-1.0.1.jar,com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,com.google.errorprone_error_prone_annotations-2.16.jar,com.google.j2objc_j2objc-annotations-1.3.jar,com.google.http-client_google-http-client-1.42.3.jar,io.opencensus_opencensus-contrib-http-util-0.31.1.jar,com.google.http-client_google-http-client-jackson2-1.42.3.jar,com.google.http-client_google-http-client-gson-1.42.3.jar,com.google.api-client_google-api-client-2.1.1.jar,commons-codec_commons-codec-1.15.jar,com.google.oauth-client_google-oauth-client-1.34.1.jar,com.google.http-client_google-http-client-apache-v2-1.42.3.jar,com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,com.google.code.gson_gson-2.10.jar,com.google.cloud_google-cloud-core-2.9.0.jar,com.google.auto.value_auto-value-annotations-1.10.1.jar,com.google.cloud_google-cloud-core-http-2.9.0.jar,com.google.http-client_google-http-client-appengine-1.42.3.jar,com.google.api_gax-httpjson-0.105.1.jar,com.google.cloud_google-cloud-core-grpc-2.9.0.jar,io.grpc_grpc-core-1.51.0.jar,com.google.api_gax-2.20.1.jar,com.google.api_gax-grpc-2.20.1.jar,io.grpc_grpc-alts-1.51.0.jar,io.grpc_grpc-grpclb-1.51.0.jar,org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,io.grpc_grpc-protobuf-1.51.0.jar,com.google.auth_google-auth-library-credentials-1.13.0.jar,com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,com.google.api_api-common-2.2.2.jar,javax.annotation_javax.annotation-api-1.3.2.jar,io.opencensus_opencensus-api-0.31.1.jar,io.grpc_grpc-context-1.51.0.jar,com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,com.google.protobuf_protobuf-java-3.21.10.jar,com.google.protobuf_protobuf-java-util-3.21.10.jar,com.google.api.grpc_proto-google-common-protos-2.11.0.jar,org.threeten_threetenbp-1.6.4.jar,com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,com.fasterxml.jackson.core_jackson-core-2.14.1.jar,com.google.code.findbugs_jsr305-3.0.2.jar,io.grpc_grpc-api-1.51.0.jar,io.grpc_grpc-auth-1.51.0.jar,io.grpc_grpc-stub-1.51.0.jar,org.checkerframework_checker-qual-3.28.0.jar,com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,io.grpc_grpc-protobuf-lite-1.51.0.jar,com.google.android_annotations-4.1.1.4.jar,org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,io.grpc_grpc-netty-shaded-1.51.0.jar,io.perfmark_perfmark-api-0.26.0.jar,io.grpc_grpc-googleapis-1.51.0.jar,io.grpc_grpc-xds-1.51.0.jar,io.opencensus_opencensus-proto-0.2.0.jar,io.grpc_grpc-services-1.51.0.jar,com.google.re2j_re2j-1.6.jar,dk.brics.automaton_automaton-1.11-8.jar,org.slf4j_slf4j-api-1.7.16.jar'),\n", + " ('spark.repl.local.jars',\n", + " 'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", + " ('spark.driver.appUIAddress',\n", + " 'http://hub-msca-bdp-dphub-students-abejburton-m.c.msca-bdp-student-ap.internal:37817'),\n", + " ('spark.driver.host',\n", + " 'hub-msca-bdp-dphub-students-abejburton-m.c.msca-bdp-student-ap.internal'),\n", + " ('spark.sql.cbo.enabled', 'true'),\n", + " ('spark.yarn.historyServer.address',\n", + " 'hub-msca-bdp-dphub-students-abejburton-m:18080'),\n", + " ('spark.executorEnv.PYTHONPATH',\n", + " '/usr/lib/spark/python/lib/py4j-0.10.9-src.zip:/usr/lib/spark/python/:{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip{{PWD}}/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar{{PWD}}/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar{{PWD}}/com.typesafe_config-1.4.2.jar{{PWD}}/org.rocksdb_rocksdbjni-6.29.5.jar{{PWD}}/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar{{PWD}}/com.github.universal-automata_liblevenshtein-3.0.0.jar{{PWD}}/com.google.cloud_google-cloud-storage-2.16.0.jar{{PWD}}/com.navigamez_greex-1.0.jar{{PWD}}/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar{{PWD}}/it.unimi.dsi_fastutil-7.0.12.jar{{PWD}}/org.projectlombok_lombok-1.16.8.jar{{PWD}}/com.google.guava_guava-31.1-jre.jar{{PWD}}/com.google.guava_failureaccess-1.0.1.jar{{PWD}}/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar{{PWD}}/com.google.errorprone_error_prone_annotations-2.16.jar{{PWD}}/com.google.j2objc_j2objc-annotations-1.3.jar{{PWD}}/com.google.http-client_google-http-client-1.42.3.jar{{PWD}}/io.opencensus_opencensus-contrib-http-util-0.31.1.jar{{PWD}}/com.google.http-client_google-http-client-jackson2-1.42.3.jar{{PWD}}/com.google.http-client_google-http-client-gson-1.42.3.jar{{PWD}}/com.google.api-client_google-api-client-2.1.1.jar{{PWD}}/commons-codec_commons-codec-1.15.jar{{PWD}}/com.google.oauth-client_google-oauth-client-1.34.1.jar{{PWD}}/com.google.http-client_google-http-client-apache-v2-1.42.3.jar{{PWD}}/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar{{PWD}}/com.google.code.gson_gson-2.10.jar{{PWD}}/com.google.cloud_google-cloud-core-2.9.0.jar{{PWD}}/com.google.auto.value_auto-value-annotations-1.10.1.jar{{PWD}}/com.google.cloud_google-cloud-core-http-2.9.0.jar{{PWD}}/com.google.http-client_google-http-client-appengine-1.42.3.jar{{PWD}}/com.google.api_gax-httpjson-0.105.1.jar{{PWD}}/com.google.cloud_google-cloud-core-grpc-2.9.0.jar{{PWD}}/io.grpc_grpc-core-1.51.0.jar{{PWD}}/com.google.api_gax-2.20.1.jar{{PWD}}/com.google.api_gax-grpc-2.20.1.jar{{PWD}}/io.grpc_grpc-alts-1.51.0.jar{{PWD}}/io.grpc_grpc-grpclb-1.51.0.jar{{PWD}}/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar{{PWD}}/io.grpc_grpc-protobuf-1.51.0.jar{{PWD}}/com.google.auth_google-auth-library-credentials-1.13.0.jar{{PWD}}/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar{{PWD}}/com.google.api_api-common-2.2.2.jar{{PWD}}/javax.annotation_javax.annotation-api-1.3.2.jar{{PWD}}/io.opencensus_opencensus-api-0.31.1.jar{{PWD}}/io.grpc_grpc-context-1.51.0.jar{{PWD}}/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar{{PWD}}/com.google.protobuf_protobuf-java-3.21.10.jar{{PWD}}/com.google.protobuf_protobuf-java-util-3.21.10.jar{{PWD}}/com.google.api.grpc_proto-google-common-protos-2.11.0.jar{{PWD}}/org.threeten_threetenbp-1.6.4.jar{{PWD}}/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.fasterxml.jackson.core_jackson-core-2.14.1.jar{{PWD}}/com.google.code.findbugs_jsr305-3.0.2.jar{{PWD}}/io.grpc_grpc-api-1.51.0.jar{{PWD}}/io.grpc_grpc-auth-1.51.0.jar{{PWD}}/io.grpc_grpc-stub-1.51.0.jar{{PWD}}/org.checkerframework_checker-qual-3.28.0.jar{{PWD}}/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar{{PWD}}/io.grpc_grpc-protobuf-lite-1.51.0.jar{{PWD}}/com.google.android_annotations-4.1.1.4.jar{{PWD}}/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar{{PWD}}/io.grpc_grpc-netty-shaded-1.51.0.jar{{PWD}}/io.perfmark_perfmark-api-0.26.0.jar{{PWD}}/io.grpc_grpc-googleapis-1.51.0.jar{{PWD}}/io.grpc_grpc-xds-1.51.0.jar{{PWD}}/io.opencensus_opencensus-proto-0.2.0.jar{{PWD}}/io.grpc_grpc-services-1.51.0.jar{{PWD}}/com.google.re2j_re2j-1.6.jar{{PWD}}/dk.brics.automaton_automaton-1.11-8.jar{{PWD}}/org.slf4j_slf4j-api-1.7.16.jar'),\n", + " ('spark.yarn.dist.jars',\n", + " 'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", + " ('spark.dataproc.sql.parquet.enableFooterCache', 'true'),\n", + " ('spark.driver.memory', '4g'),\n", + " ('spark.sql.warehouse.dir', 'file:/spark-warehouse'),\n", + " ('spark.yarn.executor.failuresValidityInterval', '1h'),\n", + " ('spark.yarn.am.memory', '640m'),\n", + " ('spark.cores.max', '4'),\n", + " ('spark.executor.cores', '4'),\n", + " ('spark.jars.packages',\n", + " 'com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.0,graphframes:graphframes:0.8.2-spark3.1-s_2.12'),\n", + " ('spark.executor.instances', '2'),\n", + " ('spark.dataproc.listeners',\n", + " 'com.google.cloud.spark.performance.DataprocMetricsListener'),\n", + " ('spark.serializer.objectStreamReset', '100'),\n", + " ('spark.submit.deployMode', 'client'),\n", + " ('spark.ui.proxyBase', '/proxy/application_1699633504496_0001'),\n", + " ('spark.sql.cbo.joinReorder.enabled', 'true'),\n", + " ('spark.shuffle.service.enabled', 'true'),\n", + " ('spark.scheduler.mode', 'FAIR'),\n", + " ('spark.history.fs.logDirectory',\n", + " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/64cc95a2-795c-4a54-be08-11a36626a92f/spark-job-history'),\n", + " ('spark.sql.adaptive.enabled', 'true'),\n", + " ('spark.yarn.jars', 'local:/usr/lib/spark/jars/*'),\n", + " ('spark.scheduler.minRegisteredResourcesRatio', '0.0'),\n", + " ('spark.app.id', 'application_1699633504496_0001'),\n", + " ('spark.master', 'yarn'),\n", + " ('spark.ui.port', '0'),\n", + " ('spark.rpc.message.maxSize', '512'),\n", + " ('spark.rdd.compress', 'True'),\n", + " ('spark.task.maxFailures', '10'),\n", + " ('spark.yarn.isPython', 'true'),\n", + " ('spark.dynamicAllocation.enabled', 'true'),\n", + " ('spark.eventLog.dir',\n", + " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/64cc95a2-795c-4a54-be08-11a36626a92f/spark-job-history'),\n", + " ('spark.ui.showConsoleProgress', 'true')]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "spark = SparkSession.builder.appName('2021EDA').getOrCreate()\n", "\n", @@ -32,10 +127,89 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "id": "a10a9fef-7517-4947-a7a5-b17db05dbb79", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 180:> (0 + 1) / 1]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- Trip ID: string (nullable = true)\n", + " |-- Trip Start Timestamp: string (nullable = true)\n", + " |-- Trip End Timestamp: string (nullable = true)\n", + " |-- Trip Seconds: integer (nullable = true)\n", + " |-- Trip Miles: double (nullable = true)\n", + " |-- Pickup Census Tract: long (nullable = true)\n", + " |-- Dropoff Census Tract: long (nullable = true)\n", + " |-- Pickup Community Area: integer (nullable = true)\n", + " |-- Dropoff Community Area: integer (nullable = true)\n", + " |-- Fare: double (nullable = true)\n", + " |-- Tip: integer (nullable = true)\n", + " |-- Additional Charges: string (nullable = true)\n", + " |-- Trip Total: double (nullable = true)\n", + " |-- Shared Trip Authorized: boolean (nullable = true)\n", + " |-- Trips Pooled: integer (nullable = true)\n", + " |-- Pickup Centroid Latitude: double (nullable = true)\n", + " |-- Pickup Centroid Longitude: double (nullable = true)\n", + " |-- Pickup Centroid Location: string (nullable = true)\n", + " |-- Dropoff Centroid Latitude: double (nullable = true)\n", + " |-- Dropoff Centroid Longitude: double (nullable = true)\n", + " |-- Dropoff Centroid Location: string (nullable = true)\n", + "\n", + "root\n", + " |-- name: string (nullable = true)\n", + " |-- datetime: string (nullable = true)\n", + " |-- tempmax: double (nullable = true)\n", + " |-- tempmin: double (nullable = true)\n", + " |-- temp: double (nullable = true)\n", + " |-- feelslikemax: double (nullable = true)\n", + " |-- feelslikemin: double (nullable = true)\n", + " |-- feelslike: double (nullable = true)\n", + " |-- dew: double (nullable = true)\n", + " |-- humidity: double (nullable = true)\n", + " |-- precip: double (nullable = true)\n", + " |-- precipprob: integer (nullable = true)\n", + " |-- precipcover: double (nullable = true)\n", + " |-- preciptype: string (nullable = true)\n", + " |-- snow: double (nullable = true)\n", + " |-- snowdepth: double (nullable = true)\n", + " |-- windgust: double (nullable = true)\n", + " |-- windspeed: double (nullable = true)\n", + " |-- winddir: double (nullable = true)\n", + " |-- sealevelpressure: double (nullable = true)\n", + " |-- cloudcover: double (nullable = true)\n", + " |-- visibility: double (nullable = true)\n", + " |-- solarradiation: double (nullable = true)\n", + " |-- solarenergy: double (nullable = true)\n", + " |-- uvindex: integer (nullable = true)\n", + " |-- severerisk: integer (nullable = true)\n", + " |-- sunrise: string (nullable = true)\n", + " |-- sunset: string (nullable = true)\n", + " |-- moonphase: double (nullable = true)\n", + " |-- conditions: string (nullable = true)\n", + " |-- description: string (nullable = true)\n", + " |-- icon: string (nullable = true)\n", + " |-- stations: string (nullable = true)\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], "source": [ "df_2021 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/2021\", inferSchema=True, header=True)\n", "# figure out how to read in shp file msca-bdp-student-gcs/bdp-rideshare-project/neighborhoods/shp files\n", @@ -46,10 +220,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "8138c57a-26d6-44c4-b765-c7b137277044", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "6" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "#display number of records by partition\n", "def displaypartitions(df):\n", @@ -67,17 +252,56 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "e70c86dd-041c-4967-b726-c058e32a76b7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Partitions: 6\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 4:=================================================> (5 + 1) / 6]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------+------+\n", + "|partitionId| count|\n", + "+-----------+------+\n", + "| 5| 69847|\n", + "| 4|523726|\n", + "| 3|527064|\n", + "| 1|527581|\n", + "| 2|528719|\n", + "| 0|531719|\n", + "+-----------+------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], "source": [ "displaypartitions(df_2021)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "fe004162-5b22-4a11-9fad-665fa5cdecc0", "metadata": {}, "outputs": [], @@ -87,30 +311,138 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "f34f9ec5-1a72-42ed-8bbe-3b54683a8bf4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 7:=================================================> (5 + 1) / 6]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Partitions: 10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 13:=====================================================>(199 + 1) / 200]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------+------+\n", + "|partitionId| count|\n", + "+-----------+------+\n", + "| 5|270864|\n", + "| 0|270865|\n", + "| 6|270865|\n", + "| 7|270865|\n", + "| 1|270866|\n", + "| 4|270866|\n", + "| 8|270866|\n", + "| 9|270866|\n", + "| 2|270866|\n", + "| 3|270867|\n", + "+-----------+------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], "source": [ "displaypartitions(df_2021)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "22a6039e-9848-4717-98b6-bc915540357b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 16:===================================================> (9 + 1) / 10]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+----------------+--------------------+--------------------+-----------------+-----------------+--------------------+--------------------+---------------------+----------------------+------------------+------------------+------------------+------------------+--------------------+------------------------+-------------------------+------------------------+-------------------------+--------------------------+-------------------------+\n", + "|summary| Trip ID|Trip Start Timestamp| Trip End Timestamp| Trip Seconds| Trip Miles| Pickup Census Tract|Dropoff Census Tract|Pickup Community Area|Dropoff Community Area| Fare| Tip|Additional Charges| Trip Total| Trips Pooled|Pickup Centroid Latitude|Pickup Centroid Longitude|Pickup Centroid Location|Dropoff Centroid Latitude|Dropoff Centroid Longitude|Dropoff Centroid Location|\n", + "+-------+----------------+--------------------+--------------------+-----------------+-----------------+--------------------+--------------------+---------------------+----------------------+------------------+------------------+------------------+------------------+--------------------+------------------------+-------------------------+------------------------+-------------------------+--------------------------+-------------------------+\n", + "| count| 2708656| 2708652| 2708652| 2708652| 2708649| 983470| 975128| 2479944| 2447038| 2708100| 2708100| 2708099| 2708098| 2708651| 2482320| 2482320| 2482320| 2449465| 2449465| 2449465|\n", + "| mean| null| null| null|942.2844891111889|6.688875228942609|1.703138547767500...|1.703138094929255...| 29.80978683389625| 29.28494816999164|13.057426793692995|0.4912218898858979|3.3111284709689235|16.859785314286004| 1.0000036918746638| 41.87740670463228| -87.67185372619028| null| 41.87714315855053| -87.6694694706963| null|\n", + "| stddev| null| null| null|630.3003334457269|7.608257555036324| 339740.56288590573| 335430.99940793717| 22.278028203042062| 21.90521219841838| 9.839634058047901|1.6231638297086177| 2.236087560001195|11.125341127829076|0.001921421972618...| 0.07804151182131637| 0.06527591183128632| null| 0.07698901524173858| 0.06276134276291706| null|\n", + "| min| \"error\" : true|01/01/2021 01:00:...|01/01/2021 01:00:...| 5| 0.0| 17031010100| 17031010100| 1| 1| 0.0| 0| 0| 0.0| 1| 41.6502216756| -87.913624596| POINT (-87.530712...| 41.6502216756| -87.913624596| POINT (-87.529950...|\n", + "| max| }|01/26/2021 12:45:...|01/26/2021 12:45:...| 64707| 661.2| 17031980100| 17031980100| 77| 77| 840.0| 100| {| 845.85| 2| 42.0212235931| -87.5307124836| POINT (-87.913624...| 42.0212235931| -87.529950466| POINT (-87.913624...|\n", + "+-------+----------------+--------------------+--------------------+-----------------+-----------------+--------------------+--------------------+---------------------+----------------------+------------------+------------------+------------------+------------------+--------------------+------------------------+-------------------------+------------------------+-------------------------+--------------------------+-------------------------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], "source": [ "df_2021.describe().show()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "c78e4618-8383-4df2-862b-4cb9dbeb20ab", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 22:==================================> (6 + 4) / 10]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+--------------------+------------------+------------+----------+-------------------+--------------------+---------------------+----------------------+----+---+------------------+----------+----------------------+------------+------------------------+-------------------------+------------------------+-------------------------+--------------------------+-------------------------+\n", + "|Trip ID|Trip Start Timestamp|Trip End Timestamp|Trip Seconds|Trip Miles|Pickup Census Tract|Dropoff Census Tract|Pickup Community Area|Dropoff Community Area|Fare|Tip|Additional Charges|Trip Total|Shared Trip Authorized|Trips Pooled|Pickup Centroid Latitude|Pickup Centroid Longitude|Pickup Centroid Location|Dropoff Centroid Latitude|Dropoff Centroid Longitude|Dropoff Centroid Location|\n", + "+-------+--------------------+------------------+------------+----------+-------------------+--------------------+---------------------+----------------------+----+---+------------------+----------+----------------------+------------+------------------------+-------------------------+------------------------+-------------------------+--------------------------+-------------------------+\n", + "| 0| 4| 4| 4| 7| 1725186| 1733528| 228712| 261618| 556|556| 557| 558| 5| 5| 226336| 226336| 226336| 259191| 259191| 259191|\n", + "+-------+--------------------+------------------+------------+----------+-------------------+--------------------+---------------------+----------------------+----+---+------------------+----------+----------------------+------------+------------------------+-------------------------+------------------------+-------------------------+--------------------------+-------------------------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], "source": [ "#Find the number of missing values for each column\n", "from pyspark.sql.functions import isnan, when, count, col\n", @@ -119,26 +451,376 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "2dd6ea75-5417-4d27-92bb-4d9a24808545", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "921081" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# number of observations with all the data in each column\n", - "df_2021.dropna().count()" + "df_2021.dropna(how='any').count()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "46e2e9e5-3581-444c-b149-827a5cbc62f5", "metadata": {}, "outputs": [], "source": [ "# Working with just data that contains full information and check for dupes\n", - "df_2021 = df_2021.dropna()\n", + "df_2021 = df_2021.dropna(how='any')\n", "df_2021 = df_2021.dropDuplicates()" ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "6a52d857-bc07-47a7-8cc9-71478bf10e08", + "metadata": {}, + "outputs": [], + "source": [ + "# Drop columns unlikely to be useful for analysis for speed of computation and rename columns to remove spacing for ease of code writing\n", + "spark.conf.set(\"spark.sql.legacy.timeParserPolicy\", \"LEGACY\")\n", + "\n", + "df_2021 = df_2021.drop('Trip Seconds','Trips Pooled','Additional Charges','Shared Trip Authorized')\n", + "df_2021 = df_2021.withColumnRenamed(\"Trip ID\",\"ID\").withColumnRenamed(\"Trip Start Timestamp\",\"start_timestamp\").withColumnRenamed(\"Trip End Timestamp\",\"end_timestamp\").withColumnRenamed(\"Trip Miles\",\\\n", + " \"miles\").withColumnRenamed(\"Pickup Census Tract\",\"pickup_tract\").withColumnRenamed(\"Dropoff Census Tract\",\"dropoff_tract\").withColumnRenamed(\"Pickup Community Area\",\"pickup_area\"\\\n", + " ).withColumnRenamed(\"Dropoff Community Area\",\"dropoff_area\").withColumnRenamed(\"Trip Total\",\"total\").withColumnRenamed(\"Pickup Centroid Latitude\",\"pickup_lat\").withColumnRenamed(\\\n", + " \"Pickup Centroid Longitude\",\"pickup_lon\").withColumnRenamed(\"Pickup Centroid Location\",\"pickup_location\").withColumnRenamed(\"Dropoff Centroid Latitude\",\"dropoff_lat\").withColumnRenamed(\\\n", + " \"Dropoff Centroid Longitude\",\"dropoff_lon\").withColumnRenamed(\"Dropoff Centroid Location\",\"dropoff_location\")\n", + "# fix datatypes\n", + "df_2021 = df_2021.withColumn('start_timestamp', F.to_timestamp(df_2021['start_timestamp'], 'MM/DD/YYYY HH:mm:ss AM/PM')).withColumn('end_timestamp', F.to_timestamp(df_2021['end_timestamp'], 'MM/DD/YYYY HH:mm:ss AM/PM'))\n", + "df_weather = df_weather.withColumn('datetime',F.to_date(df_weather['datetime'], \"MM/dd/yyyy\"))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "17ebe569-82bb-49b0-aaaf-ac5062bede35", + "metadata": {}, + "outputs": [], + "source": [ + "df_2021 = df_2021.withColumn('month', F.month(df_2021.start_timestamp))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "88ed9106-1b80-472e-8638-ece1d3d5d25e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 46:=============================================> (8 + 2) / 10]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-------------------+-------------------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+--------------------+-------------+--------------+--------------------+\n", + "| ID| start_timestamp| end_timestamp|miles|pickup_tract|dropoff_tract|pickup_area|dropoff_area|Fare|Tip|total| pickup_lat| pickup_lon| pickup_location| dropoff_lat| dropoff_lon| dropoff_location|\n", + "+--------------------+-------------------+-------------------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+--------------------+-------------+--------------+--------------------+\n", + "|960742b2998c7908e...|2021-01-04 01:45:00|2021-01-04 02:00:00| 5.9| 17031063000| 17031830600| 6| 1|15.0| 0|16.23|41.9363101308|-87.6515625922|POINT (-87.651562...|42.0016981937|-87.6735740325|POINT (-87.673574...|\n", + "|97d44c73c92c79cc4...|2021-01-04 04:30:00|2021-01-04 05:00:00| 7.3| 17031081600| 17031835900| 8| 38|22.5| 0|25.48|41.8920726347|-87.6288741572|POINT (-87.628874...|41.8201666062| -87.621499206|POINT (-87.621499...|\n", + "|d460b58d94cbf8f53...|2021-01-04 01:00:00|2021-01-04 01:15:00| 7.6| 17031980000| 17031110200| 76| 11|17.5| 0|23.73|41.9790708201|-87.9030396611|POINT (-87.903039...|41.9800778511|-87.7734709274|POINT (-87.773470...|\n", + "|ac48c66511eecd2b1...|2021-01-04 04:45:00|2021-01-04 05:00:00| 6.3| 17031330100| 17031070700| 33| 7|20.0| 0|24.85| 41.859349715|-87.6173580061|POINT (-87.617358...|41.9292725315|-87.6738072384|POINT (-87.673807...|\n", + "|21c328431f2e2928c...|2021-01-01 10:15:00|2021-01-01 10:30:00| 1.3| 17031081700| 17031081401| 8| 8|10.0| 1| 14.1|41.8920421365|-87.6318639497|POINT (-87.631863...|41.8950334495|-87.6197106717|POINT (-87.619710...|\n", + "+--------------------+-------------------+-------------------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+--------------------+-------------+--------------+--------------------+\n", + "only showing top 5 rows\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "df_2021.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "be4d107a-96c5-4a2b-96eb-4e408f6a8f42", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+----------+-------+-------+----+------------+------------+---------+----+--------+------+----------+-----------+----------+----+---------+--------+---------+-------+----------------+----------+----------+--------------+-----------+-------+----------+-------------------+-------------------+---------+--------------------+--------------------+-----------------+--------------------+\n", + "| name| datetime|tempmax|tempmin|temp|feelslikemax|feelslikemin|feelslike| dew|humidity|precip|precipprob|precipcover|preciptype|snow|snowdepth|windgust|windspeed|winddir|sealevelpressure|cloudcover|visibility|solarradiation|solarenergy|uvindex|severerisk| sunrise| sunset|moonphase| conditions| description| icon| stations|\n", + "+-------+----------+-------+-------+----+------------+------------+---------+----+--------+------+----------+-----------+----------+----+---------+--------+---------+-------+----------------+----------+----------+--------------+-----------+-------+----------+-------------------+-------------------+---------+--------------------+--------------------+-----------------+--------------------+\n", + "|chicago|2020-01-01| 43.0| 22.2|32.9| 35.2| 12.8| 24.5|24.1| 71.1| 0.0| 0| 0.0| null| 0.0| 0.4| 31.8| 19.6| 207.9| 1007.3| 32.0| 9.9| 98.7| 8.5| 4| null|2020-01-01T07:18:18|2020-01-01T16:29:47| 0.21| Partially cloudy|Partly cloudy thr...|partly-cloudy-day|72534014819,KORD,...|\n", + "|chicago|2020-01-02| 48.0| 37.3|42.8| 43.2| 29.0| 36.3|32.9| 68.0| 0.0| 0| 0.0| null| 0.0| 0.1| 30.5| 18.4| 214.1| 1002.4| 43.2| 9.9| 85.5| 7.4| 4| null|2020-01-02T07:18:24|2020-01-02T16:30:39| 0.25| Partially cloudy|Partly cloudy thr...|partly-cloudy-day|72534014819,KORD,...|\n", + "|chicago|2020-01-03| 41.5| 34.5|37.3| 38.1| 28.4| 32.7|30.4| 76.0| 0.0| 0| 0.0| null| 0.0| 0.0| null| 8.7| 335.7| 1009.8| 91.4| 9.7| 27.3| 2.4| 1| null|2020-01-03T07:18:26|2020-01-03T16:31:33| 0.27| Overcast|Cloudy skies thro...| cloudy|72534014819,KORD,...|\n", + "|chicago|2020-01-04| 34.5| 28.0|31.6| 28.3| 18.6| 23.5|24.0| 73.4| 0.015| 100| 12.5| rain,snow| 0.1| 0.0| 20.8| 13.8| 313.1| 1016.5| 89.8| 8.4| 20.5| 1.8| 2| null|2020-01-04T07:18:27|2020-01-04T16:32:28| 0.31|Snow, Rain, Parti...|Partly cloudy thr...| snow|72534014819,KORD,...|\n", + "|chicago|2020-01-05| 42.9| 25.0|33.5| 35.5| 13.9| 24.1|25.3| 72.4| 0.0| 0| 0.0| null| 0.0| 0.0| 36.9| 21.2| 238.8| 1016.2| 73.0| 9.8| 62.1| 5.5| 4| null|2020-01-05T07:18:25|2020-01-05T16:33:24| 0.34| Partially cloudy|Partly cloudy thr...|partly-cloudy-day|72534014819,KORD,...|\n", + "+-------+----------+-------+-------+----+------------+------------+---------+----+--------+------+----------+-----------+----------+----+---------+--------+---------+-------+----------------+----------+----------+--------------+-----------+-------+----------+-------------------+-------------------+---------+--------------------+--------------------+-----------------+--------------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "df_weather = df_weather.withColumn('datetime',F.to_date(df_weather['datetime'], \"yyyy-mm-dd\"))\n", + "df_weather.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "2ae6d6b2-9001-4ded-ba3a-225861cbe451", + "metadata": {}, + "outputs": [], + "source": [ + "hp_census_tracts_2010_2020 = [17031411000,17031410900,17031410100,17031411100,17031410800,17031410200,17031410700,17031411200,17031836200,17031410600,17031836300,17031410500,\n", + " 17031410300,17031410400,17031410600,17031410800,17031411300,17031411400]\n", + "df_hp = df_2021.filter((df_2021.pickup_tract.isin(hp_census_tracts_2010_2020)) & (df_2021.dropoff_tract.isin(hp_census_tracts_2010_2020)))" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "67a9c9c1-dd4e-41b6-9d29-b475a1189268", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 61:================================================> (5 + 1) / 6]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-------------------+-------------------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+--------------------+-------------+--------------+--------------------+-----+\n", + "| ID| start_timestamp| end_timestamp|miles|pickup_tract|dropoff_tract|pickup_area|dropoff_area|Fare|Tip|total| pickup_lat| pickup_lon| pickup_location| dropoff_lat| dropoff_lon| dropoff_location|month|\n", + "+--------------------+-------------------+-------------------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+--------------------+-------------+--------------+--------------------+-----+\n", + "|edb44ac5f0ca61fcd...|2021-01-17 04:15:00|2021-01-17 04:15:00| 1.4| 17031410800| 17031836200| 41| 41| 5.0| 0| 8.1|41.7979652088|-87.5896070309|POINT (-87.589607...|41.7904693995|-87.6012851221|POINT (-87.601285...| 1|\n", + "|a6ff5154d2a4e8b45...|2021-01-22 03:00:00|2021-01-22 03:00:00| 0.9| 17031836200| 17031410600| 41| 41| 5.0| 0| 8.1|41.7904693995|-87.6012851221|POINT (-87.601285...|41.7979711911|-87.5989445134|POINT (-87.598944...| 1|\n", + "|b0a0ccd90ca248d40...|2021-01-21 02:45:00|2021-01-21 03:00:00| 1.5| 17031836200| 17031410100| 41| 41| 7.5| 0| 10.6|41.7904693995|-87.6012851221|POINT (-87.601285...|41.8012268363|-87.5853031602|POINT (-87.585303...| 1|\n", + "|75d9b7289cd03afa9...|2021-01-04 01:30:00|2021-01-04 01:30:00| 0.9| 17031410700| 17031836200| 41| 41| 2.5| 0| 6.38|41.7980417164|-87.5941966274|POINT (-87.594196...|41.7904693995|-87.6012851221|POINT (-87.601285...| 1|\n", + "|24aa15d3c6be5ad06...|2021-01-22 07:15:00|2021-01-22 07:15:00| 0.9| 17031836200| 17031411000| 41| 41| 7.5| 1| 11.6|41.7904693995|-87.6012851221|POINT (-87.601285...|41.7905062613|-87.5831437169|POINT (-87.583143...| 1|\n", + "+--------------------+-------------------+-------------------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+--------------------+-------------+--------------+--------------------+-----+\n", + "only showing top 5 rows\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "df_hp.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "2c54ef92-e61e-4827-ad6c-bb8b9405e701", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
monthcount(ID)
011825
\n", + "
" + ], + "text/plain": [ + " month count(ID)\n", + "0 1 1825" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_hp.groupby(\"month\").agg({'ID':'count'}).orderBy(F.col('month').asc()).toPandas() #.plot(x=\"month\",y=\"count(ID)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "b1ed924a-e358-4ead-852e-6026d23fb8ad", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "921081" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_2021.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "024ff08a-42d2-4a8f-8189-c35d6af0186a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "1825" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_hp.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "2fda94cd-990c-436b-93b0-979f7e3c8ad3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+\n", + "|month|\n", + "+-----+\n", + "| 1|\n", + "+-----+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+\n", + "|month|\n", + "+-----+\n", + "| 1|\n", + "+-----+\n", + "\n" + ] + } + ], + "source": [ + "df_2021.select('month').distinct().show()\n", + "df_hp.select('month').distinct().show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "765edf2e-10ad-4fda-870d-8e9a488cc7ff", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {