From 65fcfa068bf477d9fae8eadf7048863af5d64cfc Mon Sep 17 00:00:00 2001 From: root Date: Wed, 22 Nov 2023 21:13:30 +0000 Subject: [PATCH] Continued step 1, 2 and weather --- supervised_ml.ipynb | 1755 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 1707 insertions(+), 48 deletions(-) diff --git a/supervised_ml.ipynb b/supervised_ml.ipynb index 5353dfe..78a1715 100644 --- a/supervised_ml.ipynb +++ b/supervised_ml.ipynb @@ -28,58 +28,55 @@ " '/root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,/root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,/root/.ivy2/jars/com.typesafe_config-1.4.2.jar,/root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,/root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,/root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,/root/.ivy2/jars/com.navigamez_greex-1.0.jar,/root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,/root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,/root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,/root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,/root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,/root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,/root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,/root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,/root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,/root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,/root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,/root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2
-1.42.3.jar,/root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,/root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,/root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,/root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,/root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,/root/.ivy2/jars/com.google.api_gax-2.20.1.jar,/root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,/root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,/root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,/root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,/root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,/root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,/root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,/root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,/root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,/root/.ivy2/
jars/io.grpc_grpc-api-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,/root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,/root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,/root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,/root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,/root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,/root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,/root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,/root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,/root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,/root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,/root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,/root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", " ('spark.dataproc.sql.joinConditionReorder.enabled', 'true'),\n", " ('spark.kryoserializer.buffer.max', '2000M'),\n", - " ('spark.history.fs.logDirectory',\n", - " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/bbc53cbc-8fb2-40b7-b9b3-62fefb76a670/spark-job-history'),\n", + " ('spark.driver.port', '42361'),\n", " ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),\n", " ('spark.dataproc.sql.local.rank.pushdown.enabled', 'true'),\n", " ('spark.driver.maxResultSize', '0'),\n", " ('spark.yarn.unmanagedAM.enabled', 'true'),\n", - " ('spark.sql.autoBroadcastJoinThreshold', '43m'),\n", + " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',\n", + " 'http://hub-msca-bdp-dphub-students-harshpachisia-m:8088/proxy/application_1700673289776_0003'),\n", " ('spark.ui.filters',\n", " 'org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter'),\n", " ('spark.metrics.namespace',\n", " 'app_name:${spark.app.name}.app_id:${spark.app.id}'),\n", " ('spark.executor.memory', '4g'),\n", " 
('spark.dataproc.sql.optimizer.leftsemijoin.conversion.enabled', 'true'),\n", - " ('spark.driver.port', '34671'),\n", " ('spark.hadoop.hive.execution.engine', 'mr'),\n", " ('spark.executor.id', 'driver'),\n", " ('spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version', '2'),\n", " ('spark.dynamicAllocation.maxExecutors', '10000'),\n", - " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',\n", - " 'hub-msca-bdp-dphub-students-abejburton-m'),\n", - " ('spark.ui.proxyBase', '/proxy/application_1700503299623_0001'),\n", - " ('spark.app.id', 'application_1700503299623_0001'),\n", + " ('spark.yarn.historyServer.address',\n", + " 'hub-msca-bdp-dphub-students-harshpachisia-m:18080'),\n", " ('spark.yarn.dist.pyFiles',\n", " 'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contri
b-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/
.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", - " ('spark.dataproc.metrics.listener.metrics.collector.hostname',\n", - " 'hub-msca-bdp-dphub-students-abejburton-m'),\n", " ('spark.yarn.am.attemptFailuresValidityInterval', '1h'),\n", " ('spark.app.name', 'Spark Updated Conf'),\n", " ('spark.sql.catalogImplementation', 
'hive'),\n", + " ('spark.eventLog.dir',\n", + " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/82d9ba70-b4ec-4813-be2a-b9d68f92ad04/spark-job-history'),\n", + " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',\n", + " 'hub-msca-bdp-dphub-students-harshpachisia-m'),\n", " ('spark.executorEnv.OPENBLAS_NUM_THREADS', '1'),\n", " ('spark.yarn.secondary.jars',\n", " 'com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,com.typesafe_config-1.4.2.jar,org.rocksdb_rocksdbjni-6.29.5.jar,com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,com.github.universal-automata_liblevenshtein-3.0.0.jar,com.google.cloud_google-cloud-storage-2.16.0.jar,com.navigamez_greex-1.0.jar,com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,it.unimi.dsi_fastutil-7.0.12.jar,org.projectlombok_lombok-1.16.8.jar,com.google.guava_guava-31.1-jre.jar,com.google.guava_failureaccess-1.0.1.jar,com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,com.google.errorprone_error_prone_annotations-2.16.jar,com.google.j2objc_j2objc-annotations-1.3.jar,com.google.http-client_google-http-client-1.42.3.jar,io.opencensus_opencensus-contrib-http-util-0.31.1.jar,com.google.http-client_google-http-client-jackson2-1.42.3.jar,com.google.http-client_google-http-client-gson-1.42.3.jar,com.google.api-client_google-api-client-2.1.1.jar,commons-codec_commons-codec-1.15.jar,com.google.oauth-client_google-oauth-client-1.34.1.jar,com.google.http-client_google-http-client-apache-v2-1.42.3.jar,com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,com.google.code.gson_gson-2.10.jar,com.google.cloud_google-cloud-core-2.9.0.jar,com.google.auto.value_auto-value-annotations-1.10.1.jar,com.google.cloud_google-cloud-core-http-2.9.0.jar,com.google.http-client_google-http-client-appengine-1.42.3.jar,com.google.api_gax-httpjson-0.105.1.jar,com.google.cloud_google-cloud-core-grpc-2.9.0.jar,io.grpc_grpc-core-1.51.0.jar,com.go
ogle.api_gax-2.20.1.jar,com.google.api_gax-grpc-2.20.1.jar,io.grpc_grpc-alts-1.51.0.jar,io.grpc_grpc-grpclb-1.51.0.jar,org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,io.grpc_grpc-protobuf-1.51.0.jar,com.google.auth_google-auth-library-credentials-1.13.0.jar,com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,com.google.api_api-common-2.2.2.jar,javax.annotation_javax.annotation-api-1.3.2.jar,io.opencensus_opencensus-api-0.31.1.jar,io.grpc_grpc-context-1.51.0.jar,com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,com.google.protobuf_protobuf-java-3.21.10.jar,com.google.protobuf_protobuf-java-util-3.21.10.jar,com.google.api.grpc_proto-google-common-protos-2.11.0.jar,org.threeten_threetenbp-1.6.4.jar,com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,com.fasterxml.jackson.core_jackson-core-2.14.1.jar,com.google.code.findbugs_jsr305-3.0.2.jar,io.grpc_grpc-api-1.51.0.jar,io.grpc_grpc-auth-1.51.0.jar,io.grpc_grpc-stub-1.51.0.jar,org.checkerframework_checker-qual-3.28.0.jar,com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,io.grpc_grpc-protobuf-lite-1.51.0.jar,com.google.android_annotations-4.1.1.4.jar,org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,io.grpc_grpc-netty-shaded-1.51.0.jar,io.perfmark_perfmark-api-0.26.0.jar,io.grpc_grpc-googleapis-1.51.0.jar,io.grpc_grpc-xds-1.51.0.jar,io.opencensus_opencensus-proto-0.2.0.jar,io.grpc_grpc-services-1.51.0.jar,com.google.re2j_re2j-1.6.jar,dk.brics.automaton_automaton-1.11-8.jar,org.slf4j_slf4j-api-1.7.16.jar'),\n", + " ('spark.driver.appUIAddress',\n", + " 'http://hub-msca-bdp-dphub-students-harshpachisia-m.c.msca-bdp-student-ap.internal:33471'),\n", " ('spark.repl.local.jars',\n", " 
'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-val
ue-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grp
c_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", - " ('spark.driver.host',\n", - " 'hub-msca-bdp-dphub-students-abejburton-m.c.msca-bdp-student-ap.internal'),\n", + " ('spark.app.startTime', '1700678618909'),\n", " ('spark.sql.cbo.enabled', 'true'),\n", - " ('spark.yarn.historyServer.address',\n", - " 'hub-msca-bdp-dphub-students-abejburton-m:18080'),\n", - " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',\n", - " 'http://hub-msca-bdp-dphub-students-abejburton-m:8088/proxy/application_1700503299623_0001'),\n", " ('spark.executorEnv.PYTHONPATH',\n", " 
'/usr/lib/spark/python/lib/py4j-0.10.9-src.zip:/usr/lib/spark/python/:{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip{{PWD}}/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar{{PWD}}/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar{{PWD}}/com.typesafe_config-1.4.2.jar{{PWD}}/org.rocksdb_rocksdbjni-6.29.5.jar{{PWD}}/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar{{PWD}}/com.github.universal-automata_liblevenshtein-3.0.0.jar{{PWD}}/com.google.cloud_google-cloud-storage-2.16.0.jar{{PWD}}/com.navigamez_greex-1.0.jar{{PWD}}/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar{{PWD}}/it.unimi.dsi_fastutil-7.0.12.jar{{PWD}}/org.projectlombok_lombok-1.16.8.jar{{PWD}}/com.google.guava_guava-31.1-jre.jar{{PWD}}/com.google.guava_failureaccess-1.0.1.jar{{PWD}}/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar{{PWD}}/com.google.errorprone_error_prone_annotations-2.16.jar{{PWD}}/com.google.j2objc_j2objc-annotations-1.3.jar{{PWD}}/com.google.http-client_google-http-client-1.42.3.jar{{PWD}}/io.opencensus_opencensus-contrib-http-util-0.31.1.jar{{PWD}}/com.google.http-client_google-http-client-jackson2-1.42.3.jar{{PWD}}/com.google.http-client_google-http-client-gson-1.42.3.jar{{PWD}}/com.google.api-client_google-api-client-2.1.1.jar{{PWD}}/commons-codec_commons-codec-1.15.jar{{PWD}}/com.google.oauth-client_google-oauth-client-1.34.1.jar{{PWD}}/com.google.http-client_google-http-client-apache-v2-1.42.3.jar{{PWD}}/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar{{PWD}}/com.google.code.gson_gson-2.10.jar{{PWD}}/com.google.cloud_google-cloud-core-2.9.0.jar{{PWD}}/com.google.auto.value_auto-value-annotations-1.10.1.jar{{PWD}}/com.google.cloud_google-cloud-core-http-2.9.0.jar{{PWD}}/com.google.http-client_google-http-client-appengine-1.42.3.jar{{PWD}}/com.google.api_gax-httpjson-0.105.1.jar{{PWD}}/com.google.cloud_google-cloud-core-grpc-2.9.0.jar{{PWD}}/io.grpc_grpc-core-1.51.0.jar{{PWD}}/com.google.api_gax-2.20.1.jar{{PWD}}/com.google.api_gax-grpc
-2.20.1.jar{{PWD}}/io.grpc_grpc-alts-1.51.0.jar{{PWD}}/io.grpc_grpc-grpclb-1.51.0.jar{{PWD}}/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar{{PWD}}/io.grpc_grpc-protobuf-1.51.0.jar{{PWD}}/com.google.auth_google-auth-library-credentials-1.13.0.jar{{PWD}}/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar{{PWD}}/com.google.api_api-common-2.2.2.jar{{PWD}}/javax.annotation_javax.annotation-api-1.3.2.jar{{PWD}}/io.opencensus_opencensus-api-0.31.1.jar{{PWD}}/io.grpc_grpc-context-1.51.0.jar{{PWD}}/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar{{PWD}}/com.google.protobuf_protobuf-java-3.21.10.jar{{PWD}}/com.google.protobuf_protobuf-java-util-3.21.10.jar{{PWD}}/com.google.api.grpc_proto-google-common-protos-2.11.0.jar{{PWD}}/org.threeten_threetenbp-1.6.4.jar{{PWD}}/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.fasterxml.jackson.core_jackson-core-2.14.1.jar{{PWD}}/com.google.code.findbugs_jsr305-3.0.2.jar{{PWD}}/io.grpc_grpc-api-1.51.0.jar{{PWD}}/io.grpc_grpc-auth-1.51.0.jar{{PWD}}/io.grpc_grpc-stub-1.51.0.jar{{PWD}}/org.checkerframework_checker-qual-3.28.0.jar{{PWD}}/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar{{PWD}}/io.grpc_grpc-protobuf-lite-1.51.0.jar{{PWD}}/com.google.android_annotations-4.1.1.4.jar{{PWD}}/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar{{PWD}}/io.grpc_grpc-netty-shaded-1.51.0.jar{{PWD}}/io.perfmark_perfmark-api-0.26.0.jar{{PWD}}/io.grpc_grpc-googleapis-1.51.0.jar{{PWD}}/io.grpc_grpc-xds-1.51.0.jar{{PWD}}/io.opencensus_opencensus-proto-0.2.0.jar{{PWD}}/io.grpc_grpc-services-1.51.0.jar{{PWD}}/com.google.re2j_re2j-1.6.jar{{PWD}}/dk.brics.automaton_automaton-1.11-8.jar{{PWD}}/org.slf4j_slf4j-api-1.7.16.jar'),\n", + " ('spark.driver.host',\n", + " 'hub-msca-bdp-dphub-students-harshpachisia-m.c.msca-bdp-student-ap.internal'),\n", " ('spark.yarn.dist.jars',\n", 
" 'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-v
alue-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.g
rpc_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", " ('spark.dataproc.sql.parquet.enableFooterCache', 'true'),\n", " ('spark.driver.memory', '4g'),\n", - " ('spark.eventLog.dir',\n", - " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/bbc53cbc-8fb2-40b7-b9b3-62fefb76a670/spark-job-history'),\n", " ('spark.sql.warehouse.dir', 'file:/spark-warehouse'),\n", " ('spark.yarn.executor.failuresValidityInterval', '1h'),\n", " ('spark.yarn.am.memory', '640m'),\n", + " ('spark.app.id', 'application_1700673289776_0003'),\n", " ('spark.cores.max', '4'),\n", " ('spark.executor.cores', '4'),\n", " ('spark.jars.packages',\n", @@ -87,21 +84,24 @@ " ('spark.executor.instances', '2'),\n", " ('spark.dataproc.listeners',\n", " 'com.google.cloud.spark.performance.DataprocMetricsListener'),\n", - " ('spark.driver.appUIAddress',\n", - " 'http://hub-msca-bdp-dphub-students-abejburton-m.c.msca-bdp-student-ap.internal:38579'),\n", + " ('spark.sql.autoBroadcastJoinThreshold', '90m'),\n", " ('spark.serializer.objectStreamReset', '100'),\n", " ('spark.submit.deployMode', 'client'),\n", " 
('spark.sql.cbo.joinReorder.enabled', 'true'),\n", " ('spark.shuffle.service.enabled', 'true'),\n", - " ('spark.app.startTime', '1700503719000'),\n", " ('spark.scheduler.mode', 'FAIR'),\n", " ('spark.sql.adaptive.enabled', 'true'),\n", " ('spark.yarn.jars', 'local:/usr/lib/spark/jars/*'),\n", " ('spark.scheduler.minRegisteredResourcesRatio', '0.0'),\n", + " ('spark.history.fs.logDirectory',\n", + " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/82d9ba70-b4ec-4813-be2a-b9d68f92ad04/spark-job-history'),\n", " ('spark.master', 'yarn'),\n", " ('spark.ui.port', '0'),\n", " ('spark.rpc.message.maxSize', '512'),\n", " ('spark.rdd.compress', 'True'),\n", + " ('spark.dataproc.metrics.listener.metrics.collector.hostname',\n", + " 'hub-msca-bdp-dphub-students-harshpachisia-m'),\n", + " ('spark.ui.proxyBase', '/proxy/application_1700673289776_0003'),\n", " ('spark.task.maxFailures', '10'),\n", " ('spark.yarn.isPython', 'true'),\n", " ('spark.dynamicAllocation.enabled', 'true'),\n", @@ -130,6 +130,14 @@ "spark.sparkContext.getConf().getAll()" ] }, + { + "cell_type": "markdown", + "id": "17ac5c49-7dbc-4aac-a702-1a5ac3ee0097", + "metadata": {}, + "source": [ + "### Reading in cleaned data, partitioning" + ] + }, { "cell_type": "code", "execution_count": 3, @@ -164,33 +172,1318 @@ "source": [ "# read in rideshare data for all years, concatenate, create appropriate partitioning\n", "# we are dropping 2020 because covid will affect the performance of our model\n", + "\n", "df_2018 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2018.csv\", inferSchema=True, header=True)\n", "df_2019 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2019.csv\", inferSchema=True, header=True)\n", "df_2021 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2021.csv\", inferSchema=True, header=True)\n", "df_2022 = 
spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2022.csv\", inferSchema=True, header=True)\n", - "df_all = df_2018.union(df_2019).union(df_2021).union(df_2022)\n", + "df_2023 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2023.csv\", inferSchema=True, header=True)\n", + "\n", + "# dropping new columns in 2023\n", + "df_2023 = df_2023.drop('Shared Trip Match','Percent Time Chicago','Percent Distance Chicago')\n", + "\n", + "df_all = df_2018.union(df_2019).union(df_2021).union(df_2022).union(df_2023)\n", "df_all.show(5)" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, + "id": "18e30586-4bdd-4217-b55d-e41522df062b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Partitions: 534\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 31:=====================================================>(532 + 2) / 534]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------+------+\n", + "|partitionId| count|\n", + "+-----------+------+\n", + "| 33|152646|\n", + "| 233|328837|\n", + "| 232|328975|\n", + "| 231|329131|\n", + "| 230|329163|\n", + "| 229|329209|\n", + "| 227|329245|\n", + "| 228|329263|\n", + "| 225|329263|\n", + "| 224|329311|\n", + "| 226|329315|\n", + "| 222|329332|\n", + "| 223|329344|\n", + "| 221|329373|\n", + "| 218|329389|\n", + "| 219|329390|\n", + "| 217|329399|\n", + "| 216|329410|\n", + "| 215|329410|\n", + "| 214|329418|\n", + "| 220|329427|\n", + "| 213|329428|\n", + "| 210|329461|\n", + "| 212|329481|\n", + "| 211|329505|\n", + "| 207|329507|\n", + "| 208|329513|\n", + "| 209|329519|\n", + "| 206|329523|\n", + "| 204|329533|\n", + "| 203|329555|\n", + "| 205|329574|\n", + "| 201|329587|\n", + "| 202|329591|\n", + "| 198|329607|\n", + "| 200|329623|\n", + "| 196|329624|\n", + "| 199|329630|\n", + "| 197|329633|\n", 
+ "| 195|329646|\n", + "| 192|329654|\n", + "| 194|329673|\n", + "| 193|329678|\n", + "| 184|329704|\n", + "| 191|329708|\n", + "| 190|329712|\n", + "| 181|329717|\n", + "| 179|329728|\n", + "| 178|329730|\n", + "| 189|329732|\n", + "| 183|329732|\n", + "| 188|329739|\n", + "| 180|329746|\n", + "| 185|329748|\n", + "| 187|329786|\n", + "| 186|329786|\n", + "| 176|329786|\n", + "| 177|329787|\n", + "| 182|329793|\n", + "| 173|329804|\n", + "| 174|329805|\n", + "| 169|329807|\n", + "| 172|329814|\n", + "| 171|329819|\n", + "| 175|329843|\n", + "| 168|329849|\n", + "| 167|329861|\n", + "| 170|329865|\n", + "| 162|329866|\n", + "| 163|329871|\n", + "| 161|329879|\n", + "| 165|329889|\n", + "| 164|329892|\n", + "| 166|329900|\n", + "| 160|329937|\n", + "| 158|329956|\n", + "| 152|329961|\n", + "| 157|329962|\n", + "| 154|329965|\n", + "| 159|329974|\n", + "| 155|329980|\n", + "| 153|329985|\n", + "| 149|329993|\n", + "| 156|330000|\n", + "| 148|330005|\n", + "| 151|330014|\n", + "| 147|330021|\n", + "| 150|330027|\n", + "| 146|330059|\n", + "| 144|330066|\n", + "| 137|330070|\n", + "| 143|330083|\n", + "| 140|330084|\n", + "| 145|330084|\n", + "| 136|330089|\n", + "| 142|330090|\n", + "| 139|330099|\n", + "| 141|330101|\n", + "| 138|330103|\n", + "| 134|330116|\n", + "| 135|330129|\n", + "| 130|330133|\n", + "| 133|330135|\n", + "| 131|330139|\n", + "| 132|330148|\n", + "| 129|330159|\n", + "| 125|330178|\n", + "| 127|330181|\n", + "| 122|330186|\n", + "| 128|330189|\n", + "| 126|330205|\n", + "| 119|330208|\n", + "| 123|330219|\n", + "| 115|330220|\n", + "| 118|330227|\n", + "| 124|330230|\n", + "| 117|330236|\n", + "| 120|330243|\n", + "| 114|330245|\n", + "| 107|330249|\n", + "| 121|330252|\n", + "| 111|330255|\n", + "| 112|330258|\n", + "| 116|330268|\n", + "| 113|330270|\n", + "| 108|330275|\n", + "| 105|330282|\n", + "| 103|330282|\n", + "| 110|330296|\n", + "| 102|330297|\n", + "| 109|330306|\n", + "| 104|330308|\n", + "| 94|330338|\n", + "| 106|330339|\n", + "| 
99|330355|\n", + "| 95|330363|\n", + "| 101|330366|\n", + "| 100|330372|\n", + "| 98|330383|\n", + "| 97|330385|\n", + "| 96|330402|\n", + "| 86|330413|\n", + "| 92|330415|\n", + "| 90|330418|\n", + "| 88|330418|\n", + "| 91|330419|\n", + "| 84|330425|\n", + "| 87|330426|\n", + "| 89|330430|\n", + "| 93|330430|\n", + "| 85|330436|\n", + "| 80|330456|\n", + "| 82|330470|\n", + "| 78|330471|\n", + "| 81|330474|\n", + "| 83|330477|\n", + "| 77|330494|\n", + "| 76|330501|\n", + "| 79|330512|\n", + "| 74|330524|\n", + "| 72|330550|\n", + "| 70|330570|\n", + "| 75|330570|\n", + "| 71|330579|\n", + "| 73|330585|\n", + "| 68|330622|\n", + "| 69|330625|\n", + "| 66|330625|\n", + "| 65|330642|\n", + "| 67|330646|\n", + "| 61|330651|\n", + "| 64|330653|\n", + "| 63|330667|\n", + "| 60|330690|\n", + "| 55|330704|\n", + "| 62|330709|\n", + "| 57|330721|\n", + "| 56|330724|\n", + "| 59|330737|\n", + "| 52|330758|\n", + "| 54|330762|\n", + "| 53|330762|\n", + "| 58|330766|\n", + "| 50|330782|\n", + "| 49|330784|\n", + "| 46|330801|\n", + "| 47|330805|\n", + "| 51|330807|\n", + "| 48|330837|\n", + "| 43|330868|\n", + "| 45|330869|\n", + "| 44|330885|\n", + "| 42|330918|\n", + "| 40|330944|\n", + "| 41|330963|\n", + "| 39|331028|\n", + "| 38|331034|\n", + "| 37|331050|\n", + "| 36|331114|\n", + "| 35|331284|\n", + "| 34|331416|\n", + "| 533|364094|\n", + "| 532|364374|\n", + "| 531|364493|\n", + "| 527|364581|\n", + "| 528|364599|\n", + "| 529|364616|\n", + "| 530|364617|\n", + "| 526|364654|\n", + "| 524|364709|\n", + "| 525|364756|\n", + "| 522|364784|\n", + "| 523|364810|\n", + "| 519|364899|\n", + "| 520|364903|\n", + "| 521|364944|\n", + "| 518|364957|\n", + "| 517|364961|\n", + "| 514|364971|\n", + "| 515|364988|\n", + "| 516|365006|\n", + "| 512|365011|\n", + "| 513|365051|\n", + "| 511|365057|\n", + "| 510|365079|\n", + "| 508|365083|\n", + "| 507|365090|\n", + "| 509|365097|\n", + "| 506|365122|\n", + "| 504|365165|\n", + "| 505|365179|\n", + "| 503|365224|\n", + "| 
499|365252|\n", + "| 496|365253|\n", + "| 501|365255|\n", + "| 498|365272|\n", + "| 500|365277|\n", + "| 502|365278|\n", + "| 497|365302|\n", + "| 495|365347|\n", + "| 492|365377|\n", + "| 493|365394|\n", + "| 494|365395|\n", + "| 491|365409|\n", + "| 490|365431|\n", + "| 488|365447|\n", + "| 489|365454|\n", + "| 487|365519|\n", + "| 486|365528|\n", + "| 485|365536|\n", + "| 482|365541|\n", + "| 479|365547|\n", + "| 478|365552|\n", + "| 477|365554|\n", + "| 480|365569|\n", + "| 483|365574|\n", + "| 474|365576|\n", + "| 484|365595|\n", + "| 475|365602|\n", + "| 481|365622|\n", + "| 476|365622|\n", + "| 473|365650|\n", + "| 472|365684|\n", + "| 471|365705|\n", + "| 469|365750|\n", + "| 468|365773|\n", + "| 467|365793|\n", + "| 470|365801|\n", + "| 464|365806|\n", + "| 465|365806|\n", + "| 463|365828|\n", + "| 466|365846|\n", + "| 462|365909|\n", + "| 461|365965|\n", + "| 460|365975|\n", + "| 459|366026|\n", + "| 456|366051|\n", + "| 457|366057|\n", + "| 458|366080|\n", + "| 454|366105|\n", + "| 455|366117|\n", + "| 452|366150|\n", + "| 453|366160|\n", + "| 448|366193|\n", + "| 451|366200|\n", + "| 450|366214|\n", + "| 449|366217|\n", + "| 446|366297|\n", + "| 447|366320|\n", + "| 445|366371|\n", + "| 444|366383|\n", + "| 443|366422|\n", + "| 442|366461|\n", + "| 441|366589|\n", + "| 440|366617|\n", + "| 439|366758|\n", + "| 438|366799|\n", + "| 437|366883|\n", + "| 436|366901|\n", + "| 435|366940|\n", + "| 434|367122|\n", + "| 21|380513|\n", + "| 20|380565|\n", + "| 19|380749|\n", + "| 18|381028|\n", + "| 17|381069|\n", + "| 16|381243|\n", + "| 15|381263|\n", + "| 14|381438|\n", + "| 13|381470|\n", + "| 12|381544|\n", + "| 11|381646|\n", + "| 10|381711|\n", + "| 8|381721|\n", + "| 9|381753|\n", + "| 7|381759|\n", + "| 6|381763|\n", + "| 5|381783|\n", + "| 4|381827|\n", + "| 3|381971|\n", + "| 1|382022|\n", + "| 2|382029|\n", + "| 0|382095|\n", + "| 332|420259|\n", + "| 333|420346|\n", + "| 331|420485|\n", + "| 330|420525|\n", + "| 329|420707|\n", + "| 326|421031|\n", 
+ "| 327|421040|\n", + "| 328|421052|\n", + "| 324|421107|\n", + "| 325|421142|\n", + "| 323|421374|\n", + "| 320|421440|\n", + "| 322|421479|\n", + "| 321|421531|\n", + "| 317|421574|\n", + "| 318|421603|\n", + "| 319|421610|\n", + "| 316|421612|\n", + "| 312|421670|\n", + "| 310|421675|\n", + "| 315|421679|\n", + "| 314|421681|\n", + "| 313|421687|\n", + "| 309|421687|\n", + "| 311|421699|\n", + "| 308|421751|\n", + "| 305|421832|\n", + "| 300|421867|\n", + "| 306|421897|\n", + "| 302|421903|\n", + "| 307|421911|\n", + "| 304|421918|\n", + "| 303|421920|\n", + "| 301|421950|\n", + "| 299|421972|\n", + "| 297|421988|\n", + "| 298|422019|\n", + "| 295|422072|\n", + "| 293|422083|\n", + "| 296|422091|\n", + "| 294|422095|\n", + "| 292|422097|\n", + "| 288|422103|\n", + "| 290|422114|\n", + "| 291|422116|\n", + "| 285|422134|\n", + "| 289|422155|\n", + "| 286|422155|\n", + "| 280|422185|\n", + "| 287|422193|\n", + "| 284|422194|\n", + "| 282|422207|\n", + "| 281|422218|\n", + "| 283|422236|\n", + "| 278|422238|\n", + "| 276|422255|\n", + "| 279|422265|\n", + "| 277|422266|\n", + "| 275|422305|\n", + "| 273|422307|\n", + "| 274|422346|\n", + "| 272|422350|\n", + "| 271|422354|\n", + "| 270|422372|\n", + "| 269|422415|\n", + "| 268|422498|\n", + "| 267|422501|\n", + "| 266|422508|\n", + "| 265|422549|\n", + "| 264|422557|\n", + "| 263|422591|\n", + "| 262|422625|\n", + "| 260|422634|\n", + "| 259|422671|\n", + "| 258|422673|\n", + "| 261|422692|\n", + "| 257|422694|\n", + "| 255|422761|\n", + "| 252|422777|\n", + "| 250|422788|\n", + "| 253|422795|\n", + "| 256|422803|\n", + "| 254|422807|\n", + "| 248|422838|\n", + "| 249|422839|\n", + "| 251|422841|\n", + "| 247|422852|\n", + "| 246|422891|\n", + "| 242|422904|\n", + "| 245|422925|\n", + "| 244|422986|\n", + "| 243|423003|\n", + "| 240|423197|\n", + "| 241|423202|\n", + "| 238|423231|\n", + "| 239|423262|\n", + "| 236|423376|\n", + "| 237|423402|\n", + "| 235|423403|\n", + "| 234|423762|\n", + "| 433|569570|\n", + "| 
432|570154|\n", + "| 431|570301|\n", + "| 430|570372|\n", + "| 429|570572|\n", + "| 428|570655|\n", + "| 426|570763|\n", + "| 427|570781|\n", + "| 424|570870|\n", + "| 425|570872|\n", + "| 423|570953|\n", + "| 422|570979|\n", + "| 421|571069|\n", + "| 419|571096|\n", + "| 420|571097|\n", + "| 418|571127|\n", + "| 417|571153|\n", + "| 416|571185|\n", + "| 415|571201|\n", + "| 414|571286|\n", + "| 413|571425|\n", + "| 412|571449|\n", + "| 407|571506|\n", + "| 410|571528|\n", + "| 411|571532|\n", + "| 409|571553|\n", + "| 408|571585|\n", + "| 406|571595|\n", + "| 404|571645|\n", + "| 405|571657|\n", + "| 403|571742|\n", + "| 402|571766|\n", + "| 401|571796|\n", + "| 399|571842|\n", + "| 400|571847|\n", + "| 397|571874|\n", + "| 398|571913|\n", + "| 396|571925|\n", + "| 395|571966|\n", + "| 394|571983|\n", + "| 392|571993|\n", + "| 393|572020|\n", + "| 391|572123|\n", + "| 387|572181|\n", + "| 390|572182|\n", + "| 389|572183|\n", + "| 388|572189|\n", + "| 386|572212|\n", + "| 385|572244|\n", + "| 383|572249|\n", + "| 384|572276|\n", + "| 382|572302|\n", + "| 381|572344|\n", + "| 380|572361|\n", + "| 379|572382|\n", + "| 378|572394|\n", + "| 377|572428|\n", + "| 376|572438|\n", + "| 375|572493|\n", + "| 373|572545|\n", + "| 374|572565|\n", + "| 372|572569|\n", + "| 371|572600|\n", + "| 370|572604|\n", + "| 369|572634|\n", + "| 368|572647|\n", + "| 366|572742|\n", + "| 365|572742|\n", + "| 367|572755|\n", + "| 364|572798|\n", + "| 362|572800|\n", + "| 363|572816|\n", + "| 361|572868|\n", + "| 360|572895|\n", + "| 359|572907|\n", + "| 358|572924|\n", + "| 357|572957|\n", + "| 356|573022|\n", + "| 354|573102|\n", + "| 355|573104|\n", + "| 352|573132|\n", + "| 353|573145|\n", + "| 351|573173|\n", + "| 350|573187|\n", + "| 348|573262|\n", + "| 349|573270|\n", + "| 347|573334|\n", + "| 346|573372|\n", + "| 345|573425|\n", + "| 344|573556|\n", + "| 343|573584|\n", + "| 342|573658|\n", + "| 341|573676|\n", + "| 340|573781|\n", + "| 339|573977|\n", + "| 337|574013|\n", + "| 
338|574040|\n", + "| 336|574185|\n", + "| 335|574318|\n", + "| 334|574727|\n", + "| 32|610310|\n", + "| 31|610736|\n", + "| 30|610951|\n", + "| 29|611294|\n", + "| 28|611761|\n", + "| 27|611933|\n", + "| 26|612048|\n", + "| 25|612117|\n", + "| 24|612529|\n", + "| 23|613100|\n", + "| 22|613957|\n", + "+-----------+------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "#display number of records by partition\n", + "def displaypartitions(df):\n", + " #number of records by partition\n", + " num = df.rdd.getNumPartitions()\n", + " print(\"Partitions:\", num)\n", + " df.withColumn(\"partitionId\", F.spark_partition_id())\\\n", + " .groupBy(\"partitionId\")\\\n", + " .count()\\\n", + " .orderBy(F.asc(\"count\"))\\\n", + " .show(num)\n", + "\n", + "df_all.rdd.getNumPartitions()\n", + "displaypartitions(df_all)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "abf8091a-9662-4378-8fe5-b2ece46a6a14", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 34:=====================================================>(533 + 1) / 534]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Partitions: 600\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 37:====================================================> (584 + 8) / 600]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------+------+\n", + "|partitionId| count|\n", + "+-----------+------+\n", + "| 263|362150|\n", + "| 258|362151|\n", + "| 265|362151|\n", + "| 256|362151|\n", + "| 259|362151|\n", + "| 255|362152|\n", + "| 267|362152|\n", + "| 266|362152|\n", + "| 257|362152|\n", + "| 262|362152|\n", + "| 260|362152|\n", + "| 264|362152|\n", + "| 254|362153|\n", + "| 261|362153|\n", + "| 181|362154|\n", + "| 268|362154|\n", + "| 179|362154|\n", + "| 188|362154|\n", + "| 180|362154|\n", + "| 
250|362155|\n", + "| 183|362155|\n", + "| 269|362155|\n", + "| 272|362155|\n", + "| 189|362155|\n", + "| 251|362155|\n", + "| 186|362155|\n", + "| 253|362155|\n", + "| 252|362155|\n", + "| 172|362155|\n", + "| 173|362155|\n", + "| 178|362155|\n", + "| 182|362155|\n", + "| 273|362155|\n", + "| 184|362155|\n", + "| 187|362155|\n", + "| 277|362156|\n", + "| 285|362156|\n", + "| 232|362156|\n", + "| 245|362156|\n", + "| 190|362156|\n", + "| 219|362156|\n", + "| 554|362156|\n", + "| 287|362156|\n", + "| 177|362156|\n", + "| 286|362156|\n", + "| 270|362156|\n", + "| 276|362156|\n", + "| 175|362156|\n", + "| 557|362156|\n", + "| 274|362156|\n", + "| 275|362156|\n", + "| 174|362156|\n", + "| 241|362156|\n", + "| 243|362156|\n", + "| 278|362156|\n", + "| 223|362156|\n", + "| 290|362156|\n", + "| 238|362156|\n", + "| 185|362156|\n", + "| 291|362156|\n", + "| 239|362157|\n", + "| 222|362157|\n", + "| 559|362157|\n", + "| 249|362157|\n", + "| 288|362157|\n", + "| 233|362157|\n", + "| 237|362157|\n", + "| 240|362157|\n", + "| 289|362157|\n", + "| 246|362157|\n", + "| 558|362157|\n", + "| 224|362157|\n", + "| 221|362157|\n", + "| 556|362157|\n", + "| 11|362157|\n", + "| 271|362157|\n", + "| 231|362157|\n", + "| 248|362157|\n", + "| 280|362157|\n", + "| 555|362157|\n", + "| 247|362157|\n", + "| 162|362157|\n", + "| 292|362157|\n", + "| 176|362157|\n", + "| 565|362157|\n", + "| 279|362158|\n", + "| 163|362158|\n", + "| 532|362158|\n", + "| 166|362158|\n", + "| 283|362158|\n", + "| 293|362158|\n", + "| 225|362158|\n", + "| 167|362158|\n", + "| 191|362158|\n", + "| 560|362158|\n", + "| 553|362158|\n", + "| 599|362158|\n", + "| 226|362158|\n", + "| 165|362158|\n", + "| 534|362158|\n", + "| 564|362158|\n", + "| 561|362158|\n", + "| 242|362158|\n", + "| 531|362158|\n", + "| 566|362158|\n", + "| 218|362158|\n", + "| 281|362158|\n", + "| 236|362158|\n", + "| 282|362158|\n", + "| 0|362158|\n", + "| 168|362158|\n", + "| 220|362158|\n", + "| 161|362158|\n", + "| 244|362158|\n", + "| 
234|362158|\n", + "| 164|362158|\n", + "| 535|362158|\n", + "| 550|362158|\n", + "| 171|362158|\n", + "| 230|362158|\n", + "| 563|362158|\n", + "| 533|362158|\n", + "| 157|362159|\n", + "| 597|362159|\n", + "| 598|362159|\n", + "| 1|362159|\n", + "| 551|362159|\n", + "| 549|362159|\n", + "| 209|362159|\n", + "| 210|362159|\n", + "| 569|362159|\n", + "| 216|362159|\n", + "| 235|362159|\n", + "| 5|362159|\n", + "| 160|362159|\n", + "| 490|362159|\n", + "| 227|362159|\n", + "| 488|362159|\n", + "| 228|362159|\n", + "| 489|362159|\n", + "| 536|362159|\n", + "| 204|362159|\n", + "| 192|362159|\n", + "| 159|362159|\n", + "| 567|362159|\n", + "| 552|362159|\n", + "| 284|362159|\n", + "| 2|362159|\n", + "| 13|362159|\n", + "| 294|362159|\n", + "| 211|362159|\n", + "| 215|362159|\n", + "| 562|362159|\n", + "| 6|362159|\n", + "| 12|362159|\n", + "| 530|362159|\n", + "| 568|362159|\n", + "| 7|362159|\n", + "| 3|362159|\n", + "| 229|362159|\n", + "| 537|362159|\n", + "| 169|362159|\n", + "| 217|362159|\n", + "| 208|362159|\n", + "| 205|362160|\n", + "| 538|362160|\n", + "| 158|362160|\n", + "| 197|362160|\n", + "| 8|362160|\n", + "| 194|362160|\n", + "| 10|362160|\n", + "| 491|362160|\n", + "| 591|362160|\n", + "| 156|362160|\n", + "| 526|362160|\n", + "| 539|362160|\n", + "| 214|362160|\n", + "| 492|362160|\n", + "| 207|362160|\n", + "| 523|362160|\n", + "| 170|362160|\n", + "| 206|362160|\n", + "| 525|362160|\n", + "| 212|362160|\n", + "| 571|362160|\n", + "| 196|362160|\n", + "| 193|362160|\n", + "| 572|362160|\n", + "| 4|362160|\n", + "| 296|362160|\n", + "| 547|362160|\n", + "| 295|362160|\n", + "| 570|362160|\n", + "| 486|362161|\n", + "| 155|362161|\n", + "| 589|362161|\n", + "| 403|362161|\n", + "| 404|362161|\n", + "| 510|362161|\n", + "| 545|362161|\n", + "| 25|362161|\n", + "| 203|362161|\n", + "| 515|362161|\n", + "| 590|362161|\n", + "| 300|362161|\n", + "| 213|362161|\n", + "| 512|362161|\n", + "| 387|362161|\n", + "| 405|362161|\n", + "| 529|362161|\n", + "| 
514|362161|\n", + "| 307|362161|\n", + "| 493|362161|\n", + "| 543|362161|\n", + "| 509|362161|\n", + "| 527|362161|\n", + "| 544|362161|\n", + "| 596|362161|\n", + "| 511|362161|\n", + "| 15|362161|\n", + "| 593|362161|\n", + "| 595|362161|\n", + "| 151|362161|\n", + "| 195|362161|\n", + "| 592|362161|\n", + "| 487|362161|\n", + "| 383|362161|\n", + "| 152|362161|\n", + "| 298|362161|\n", + "| 105|362161|\n", + "| 508|362161|\n", + "| 202|362161|\n", + "| 297|362161|\n", + "| 9|362161|\n", + "| 522|362161|\n", + "| 199|362161|\n", + "| 540|362161|\n", + "| 506|362161|\n", + "| 594|362161|\n", + "| 573|362161|\n", + "| 153|362161|\n", + "| 407|362161|\n", + "| 517|362161|\n", + "| 513|362161|\n", + "| 441|362161|\n", + "| 198|362161|\n", + "| 14|362161|\n", + "| 548|362161|\n", + "| 101|362161|\n", + "| 306|362161|\n", + "| 516|362162|\n", + "| 312|362162|\n", + "| 582|362162|\n", + "| 576|362162|\n", + "| 497|362162|\n", + "| 299|362162|\n", + "| 390|362162|\n", + "| 26|362162|\n", + "| 580|362162|\n", + "| 581|362162|\n", + "| 435|362162|\n", + "| 541|362162|\n", + "| 110|362162|\n", + "| 106|362162|\n", + "| 100|362162|\n", + "| 301|362162|\n", + "| 500|362162|\n", + "| 442|362162|\n", + "| 308|362162|\n", + "| 546|362162|\n", + "| 389|362162|\n", + "| 431|362162|\n", + "| 432|362162|\n", + "| 438|362162|\n", + "| 129|362162|\n", + "| 507|362162|\n", + "| 408|362162|\n", + "| 528|362162|\n", + "| 112|362162|\n", + "| 111|362162|\n", + "| 519|362162|\n", + "| 27|362162|\n", + "| 433|362162|\n", + "| 64|362162|\n", + "| 496|362162|\n", + "| 382|362162|\n", + "| 200|362162|\n", + "| 109|362162|\n", + "| 378|362162|\n", + "| 574|362162|\n", + "| 406|362162|\n", + "| 107|362162|\n", + "| 17|362162|\n", + "| 385|362162|\n", + "| 201|362162|\n", + "| 440|362162|\n", + "| 583|362162|\n", + "| 16|362162|\n", + "| 495|362162|\n", + "| 114|362162|\n", + "| 104|362162|\n", + "| 384|362162|\n", + "| 388|362162|\n", + "| 439|362162|\n", + "| 434|362162|\n", + "| 
309|362162|\n", + "| 313|362162|\n", + "| 302|362162|\n", + "| 524|362162|\n", + "| 108|362162|\n", + "| 386|362162|\n", + "| 394|362163|\n", + "| 24|362163|\n", + "| 398|362163|\n", + "| 85|362163|\n", + "| 314|362163|\n", + "| 400|362163|\n", + "| 28|362163|\n", + "| 102|362163|\n", + "| 436|362163|\n", + "| 372|362163|\n", + "| 113|362163|\n", + "| 542|362163|\n", + "| 504|362163|\n", + "| 412|362163|\n", + "| 65|362163|\n", + "| 505|362163|\n", + "| 373|362163|\n", + "| 499|362163|\n", + "| 503|362163|\n", + "| 131|362163|\n", + "| 381|362163|\n", + "| 498|362163|\n", + "| 86|362163|\n", + "| 399|362163|\n", + "| 494|362163|\n", + "| 18|362163|\n", + "| 521|362163|\n", + "| 401|362163|\n", + "| 89|362163|\n", + "| 375|362163|\n", + "| 311|362163|\n", + "| 586|362163|\n", + "| 585|362163|\n", + "| 154|362163|\n", + "| 402|362163|\n", + "| 94|362163|\n", + "| 128|362163|\n", + "| 395|362163|\n", + "| 518|362163|\n", + "| 70|362163|\n", + "| 579|362163|\n", + "| 501|362163|\n", + "| 304|362163|\n", + "| 575|362163|\n", + "| 502|362163|\n", + "| 127|362163|\n", + "| 71|362163|\n", + "| 379|362163|\n", + "| 587|362163|\n", + "| 103|362163|\n", + "| 437|362163|\n", + "| 584|362163|\n", + "| 130|362163|\n", + "| 305|362163|\n", + "| 115|362163|\n", + "| 588|362163|\n", + "| 520|362163|\n", + "| 409|362163|\n", + "| 377|362163|\n", + "| 96|362164|\n", + "| 391|362164|\n", + "| 359|362164|\n", + "| 397|362164|\n", + "| 145|362164|\n", + "| 149|362164|\n", + "| 410|362164|\n", + "| 310|362164|\n", + "| 87|362164|\n", + "| 133|362164|\n", + "| 22|362164|\n", + "| 77|362164|\n", + "| 19|362164|\n", + "| 360|362164|\n", + "| 471|362164|\n", + "| 411|362164|\n", + "| 29|362164|\n", + "| 371|362164|\n", + "| 90|362164|\n", + "| 392|362164|\n", + "| 376|362164|\n", + "| 444|362164|\n", + "| 132|362164|\n", + "| 374|362164|\n", + "| 83|362164|\n", + "| 72|362164|\n", + "| 88|362164|\n", + "| 363|362164|\n", + "| 466|362164|\n", + "| 66|362164|\n", + "| 67|362164|\n", + "| 
449|362164|\n", + "| 144|362164|\n", + "| 20|362164|\n", + "| 353|362164|\n", + "| 443|362164|\n", + "| 315|362164|\n", + "| 393|362164|\n", + "| 362|362164|\n", + "| 578|362164|\n", + "| 69|362164|\n", + "| 467|362164|\n", + "| 63|362164|\n", + "| 396|362164|\n", + "| 143|362164|\n", + "| 577|362164|\n", + "| 303|362164|\n", + "| 116|362164|\n", + "| 23|362164|\n", + "| 380|362164|\n", + "| 93|362165|\n", + "| 413|362165|\n", + "| 30|362165|\n", + "| 448|362165|\n", + "| 361|362165|\n", + "| 364|362165|\n", + "| 464|362165|\n", + "| 455|362165|\n", + "| 473|362165|\n", + "| 316|362165|\n", + "| 451|362165|\n", + "| 146|362165|\n", + "| 150|362165|\n", + "| 68|362165|\n", + "| 454|362165|\n", + "| 123|362165|\n", + "| 348|362165|\n", + "| 124|362165|\n", + "| 99|362165|\n", + "| 134|362165|\n", + "| 126|362165|\n", + "| 429|362165|\n", + "| 84|362165|\n", + "| 91|362165|\n", + "| 97|362165|\n", + "| 82|362165|\n", + "| 76|362165|\n", + "| 92|362165|\n", + "| 21|362165|\n", + "| 140|362165|\n", + "| 75|362165|\n", + "| 61|362165|\n", + "| 354|362165|\n", + "| 370|362165|\n", + "| 147|362165|\n", + "| 79|362165|\n", + "| 118|362165|\n", + "| 475|362165|\n", + "| 468|362165|\n", + "| 142|362165|\n", + "| 136|362165|\n", + "| 120|362165|\n", + "| 430|362165|\n", + "| 125|362165|\n", + "| 95|362165|\n", + "| 352|362165|\n", + "| 135|362165|\n", + "| 445|362165|\n", + "| 78|362165|\n", + "| 73|362166|\n", + "| 416|362166|\n", + "| 81|362166|\n", + "| 369|362166|\n", + "| 456|362166|\n", + "| 55|362166|\n", + "| 350|362166|\n", + "| 470|362166|\n", + "| 428|362166|\n", + "| 137|362166|\n", + "| 427|362166|\n", + "| 80|362166|\n", + "| 148|362166|\n", + "| 32|362166|\n", + "| 117|362166|\n", + "| 141|362166|\n", + "| 414|362166|\n", + "| 469|362166|\n", + "| 347|362166|\n", + "| 56|362166|\n", + "| 366|362166|\n", + "| 358|362166|\n", + "| 368|362166|\n", + "| 365|362166|\n", + "| 450|362166|\n", + "| 474|362166|\n", + "| 138|362166|\n", + "| 74|362166|\n", + "| 
59|362166|\n", + "| 317|362166|\n", + "| 446|362166|\n", + "| 355|362166|\n", + "| 465|362166|\n", + "| 459|362166|\n", + "| 62|362166|\n", + "| 351|362166|\n", + "| 477|362166|\n", + "| 476|362166|\n", + "| 457|362166|\n", + "| 447|362166|\n", + "| 426|362166|\n", + "| 461|362166|\n", + "| 460|362166|\n", + "| 472|362166|\n", + "| 462|362166|\n", + "| 31|362166|\n", + "| 122|362166|\n", + "| 453|362166|\n", + "| 57|362166|\n", + "| 121|362166|\n", + "| 98|362166|\n", + "| 33|362167|\n", + "| 483|362167|\n", + "| 54|362167|\n", + "| 479|362167|\n", + "| 452|362167|\n", + "| 356|362167|\n", + "| 463|362167|\n", + "| 53|362167|\n", + "| 60|362167|\n", + "| 484|362167|\n", + "| 478|362167|\n", + "| 58|362167|\n", + "| 52|362167|\n", + "| 119|362167|\n", + "| 357|362167|\n", + "| 422|362167|\n", + "| 420|362167|\n", + "| 318|362167|\n", + "| 139|362167|\n", + "| 419|362167|\n", + "| 367|362167|\n", + "| 415|362167|\n", + "| 458|362167|\n", + "| 425|362167|\n", + "| 319|362167|\n", + "| 349|362167|\n", + "| 418|362167|\n", + "| 328|362167|\n", + "| 485|362168|\n", + "| 480|362168|\n", + "| 421|362168|\n", + "| 329|362168|\n", + "| 423|362168|\n", + "| 326|362168|\n", + "| 481|362168|\n", + "| 424|362168|\n", + "| 417|362168|\n", + "| 51|362168|\n", + "| 331|362168|\n", + "| 482|362168|\n", + "| 322|362168|\n", + "| 320|362168|\n", + "| 324|362168|\n", + "| 321|362168|\n", + "| 323|362168|\n", + "| 37|362169|\n", + "| 330|362169|\n", + "| 50|362169|\n", + "| 346|362169|\n", + "| 38|362169|\n", + "| 325|362169|\n", + "| 35|362169|\n", + "| 34|362169|\n", + "| 327|362169|\n", + "| 332|362170|\n", + "| 336|362170|\n", + "| 345|362170|\n", + "| 334|362170|\n", + "| 40|362170|\n", + "| 48|362170|\n", + "| 39|362170|\n", + "| 333|362170|\n", + "| 47|362171|\n", + "| 44|362171|\n", + "| 49|362171|\n", + "| 43|362171|\n", + "| 338|362171|\n", + "| 337|362171|\n", + "| 41|362171|\n", + "| 335|362171|\n", + "| 42|362171|\n", + "| 46|362171|\n", + "| 36|362171|\n", + "| 
343|362172|\n", + "| 339|362172|\n", + "| 45|362172|\n", + "| 340|362173|\n", + "| 341|362173|\n", + "| 342|362173|\n", + "| 344|362173|\n", + "+-----------+------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "# repartitioning to 600 partitions, seems to be balanced now. \n", + "df_all = df_all.repartition(600)\n", + "displaypartitions(df_all)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "id": "9c7c7fa9-7a39-46eb-93fd-c7006d01c03e", "metadata": {}, "outputs": [], "source": [ - "# TODO: REPARTITION\n", - "\n", "# we will need a year column in this model\n", "df_all = df_all.withColumn('year', F.year(df_all.start_timestamp))" ] }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fae5317c-df84-47ae-a003-440c01c25d07", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- ID: string (nullable = true)\n", + " |-- start_timestamp: timestamp (nullable = true)\n", + " |-- end_timestamp: timestamp (nullable = true)\n", + " |-- seconds: integer (nullable = true)\n", + " |-- miles: double (nullable = true)\n", + " |-- pickup_tract: long (nullable = true)\n", + " |-- dropoff_tract: long (nullable = true)\n", + " |-- pickup_area: integer (nullable = true)\n", + " |-- dropoff_area: integer (nullable = true)\n", + " |-- Fare: double (nullable = true)\n", + " |-- Tip: integer (nullable = true)\n", + " |-- total: double (nullable = true)\n", + " |-- pickup_lat: double (nullable = true)\n", + " |-- pickup_lon: double (nullable = true)\n", + " |-- dropoff_lat: double (nullable = true)\n", + " |-- dropoff_lon: string (nullable = true)\n", + " |-- month: integer (nullable = true)\n", + " |-- day_of_month: integer (nullable = true)\n", + " |-- hour: integer (nullable = true)\n", + " |-- day: integer (nullable = true)\n", + " |-- year: integer (nullable = true)\n", + "\n" + ] + } + ], + "source": [ + "df_all.printSchema()" + 
] + }, { "cell_type": "markdown", "id": "59c15480-30c0-48c8-b9a5-6aa122ff1325", "metadata": {}, "source": [ - "## Notes for Harsh:\n", + "## Next steps\n", "\n", "I'm assuming we are predicting using the full dataset and not restricting ourselves to being within the program hours.\n", "\n", @@ -210,62 +1503,357 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 20, "id": "540c7bff-6eac-40c2-a9fe-9b6843f7d546", "metadata": {}, + "outputs": [], + "source": [ + "# take a sample to test these operations out on first\n", + "sample_df = df_all.sample(fraction=1/1000000)\n", + "\n", + "# get only the columns needed for the model\n", + "selected_columns = [\"pickup_area\",\"dropoff_area\",\"day\",\"month\",\"year\",\"ID\"]\n", + "sample_selected = sample_df.select(selected_columns)\n", + "\n", + "\n", + "# group the rideshare data by day and community area and create counts\n", + "#sample_df = sample_df.groupby('day',\"month\",\"year\",'pickup_area','dropoff_area').agg({'ID':'count'})\n", + "#sample_df.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "3ec0caae-bb26-42a5-86ae-93710741c4e8", + "metadata": {}, + "outputs": [], + "source": [ + "# grouping by community area getting daily counts - one where the community is the pickup area\n", + "pickup_counts = sample_df.groupby('day', 'month', 'year', 'pickup_area').agg({'ID':'count'})" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "855586ae-d82a-4cc9-97f2-2d1b0a3d5c0d", + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[Stage 26:=====================================================>(433 + 1) / 434]\r" + " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "+---+-----+----+-----------+------------+---------+\n", - "|day|month|year|pickup_area|dropoff_area|count(ID)|\n", - "+---+-----+----+-----------+------------+---------+\n", - "| 4| 12|2018| 7| 24| 1|\n", - "| 5| 1|2019| 24| 8| 1|\n", - "| 3| 
8|2019| 28| 8| 1|\n", - "| 2| 3|2021| 28| 8| 1|\n", - "| 6| 4|2021| 28| 28| 1|\n", - "+---+-----+----+-----------+------------+---------+\n", - "only showing top 5 rows\n", + "+---------+\n", + "|count(ID)|\n", + "+---------+\n", + "| 1|\n", + "| 2|\n", + "+---------+\n", "\n" ] + } + ], + "source": [ + "pickup_counts.select('count(ID)').distinct().show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "40183d3d-13da-4f01-aa6b-d6b47426b5e8", + "metadata": {}, + "outputs": [], + "source": [ + "dropoff_counts = sample_df.groupby('day', 'month', 'year', 'dropoff_area').agg({'ID':'count'})" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "20a6062f-a22b-461a-8e07-f83ff42f4a19", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------+\n", + "|count(ID)|\n", + "+---------+\n", + "| 1|\n", + "| 2|\n", + "+---------+\n", "\n" ] + } + ], + "source": [ + "dropoff_counts.select('count(ID)').distinct().show(5)" + ] + }, + { + "cell_type": "markdown", + "id": "bdccfb9c-6f47-4d6a-a42b-ad7a1f707f4a", + "metadata": {}, + "source": [ + "**Daily counts for each community area**" + ] + }, + { + "cell_type": "markdown", + "id": "d23b5405-6491-43ed-90fb-cf7aa2056553", + "metadata": {}, + "source": [ + "We had to group by pickup area and dropoff area separately - daily counts of the number of trips to that particular community area when it was either a pickup or dropoff area" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "14b0b19a-36ad-4e17-bf30-1e9fcdaea452", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate daily counts for pickup areas\n", + "pickup_counts = sample_df.groupby('day', 'month', 'year', 'pickup_area').count().withColumnRenamed('count', 'pickup_count')\n", + "pickup_counts = pickup_counts.withColumnRenamed('pickup_area', 'area')" + ] + }, + { + "cell_type":
"code", + "execution_count": 34, + "id": "388446e2-2cd4-4bb8-bfa0-f204b1359427", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- day: integer (nullable = true)\n", + " |-- month: integer (nullable = true)\n", + " |-- year: integer (nullable = true)\n", + " |-- area: integer (nullable = true)\n", + " |-- dropoff_count: long (nullable = false)\n", + "\n" + ] + } + ], + "source": [ + "# Calculate daily counts for dropoff areas\n", + "dropoff_counts = sample_df.groupby('day', 'month', 'year', 'dropoff_area').count().withColumnRenamed('count', 'dropoff_count')\n", + "dropoff_counts = dropoff_counts.withColumnRenamed('dropoff_area', 'area')\n", + "#dropoff_counts.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "3e028095-2ff1-4e5f-aa03-7dafe417168e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+-----+----+----+-----------------+------------------+\n", + "|day|month|year|area|sum(pickup_count)|sum(dropoff_count)|\n", + "+---+-----+----+----+-----------------+------------------+\n", + "| 6| 4|2019| 76| 0| 1|\n", + "| 2| 6|2022| 49| 1| 0|\n", + "| 1| 11|2018| 7| 1| 0|\n", + "| 6| 4|2019| 22| 0| 1|\n", + "| 7| 3|2021| 7| 0| 1|\n", + "| 5| 9|2022| 32| 0| 1|\n", + "| 6| 5|2022| 6| 1| 0|\n", + "| 3| 9|2021| 33| 0| 1|\n", + "| 7| 2|2021| 77| 0| 1|\n", + "| 2| 7|2019| 29| 1| 0|\n", + "+---+-----+----+----+-----------------+------------------+\n", + "only showing top 10 rows\n", + "\n" + ] + } + ], + "source": [ + "pickup_counts = pickup_counts.withColumn('dropoff_count', F.lit(0))\n", + "dropoff_counts = dropoff_counts.withColumn('pickup_count', F.lit(0))\n", + "\n", + "# ensuring same column order\n", + "pickup_counts = pickup_counts.select('day', 'month', 'year', 'area', 'pickup_count', 'dropoff_count')\n", + "dropoff_counts = 
dropoff_counts.select('day', 'month', 'year', 'area', 'pickup_count', 'dropoff_count')\n", + "\n", + "# Union the pickup and dropoff dataframes\n", + "combined_df = pickup_counts.union(dropoff_counts)\n", + "\n", + "# Group by day, month, year, and area, summing up the counts\n", + "daily_counts_by_area = combined_df.groupby('day', 'month', 'year', 'area').sum('pickup_count', 'dropoff_count')" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "e987e9e8-0f79-46b4-b0e5-002bc38cf242", + "metadata": {}, + "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " \r" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+-----+----+----+-----------------+------------------+------------+\n", + "|day|month|year|area|sum(pickup_count)|sum(dropoff_count)|total_counts|\n", + "+---+-----+----+----+-----------------+------------------+------------+\n", + "| 2| 5|2023| 29| 1| 0| 1|\n", + "| 7| 2|2019| 28| 1| 0| 1|\n", + "| 3| 9|2021| 32| 1| 0| 1|\n", + "| 5| 1|2022| 38| 0| 1| 1|\n", + "| 2| 6|2019| 28| 1| 0| 1|\n", + "| 3| 4|2021| 32| 1| 0| 1|\n", + "| 3| 1|2023| 4| 1| 0| 1|\n", + "| 3| 11|2018| 15| 0| 1| 1|\n", + "| 3| 3|2021| 6| 1| 0| 1|\n", + "| 7| 7|2021| 43| 1| 0| 1|\n", + "+---+-----+----+----+-----------------+------------------+------------+\n", + "only showing top 10 rows\n", + "\n" + ] } ], "source": [ - "# take a sample to test these operations out on first\n", - "sample_df = df_all.sample(fraction=1/1000000)\n", + "daily_counts_by_area = daily_counts_by_area.withColumn('total_counts', F.col('sum(pickup_count)') + F.col('sum(dropoff_count)'))\n", + "# the relatively smaller numbers are mostly a result of the sample size, should be fine when we \n", + "# make it to the entire dataframe\n", + "daily_counts_by_area.show(10)" + ] + }, + { + "cell_type": "markdown", + "id": "6b68b7b0-9db1-47c2-a896-f1875023ed66", + "metadata": {}, + "source": [ + "Pivoting the dataset for community areas" + ] + }, + { + "cell_type": 
"code", + "execution_count": 45, + "id": "d57f2088-9ca6-4cc8-a8fe-0cca835fecf9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+-----+----+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+--------------
---+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+\n", + "|day|month|year|1_sum(Hyde_Park)|1_sum(Kenwood)|1_sum(Woodlawn)|2_sum(Hyde_Park)|2_sum(Kenwood)|2_sum(Woodlawn)|3_sum(Hyde_Park)|3_sum(Kenwood)|3_sum(Woodlawn)|4_sum(Hyde_Park)|4_sum(Kenwood)|4_sum(Woodlawn)|5_sum(Hyde_Park)|5_sum(Kenwood)|5_sum(Woodlawn)|6_sum(Hyde_Park)|6_sum(Kenwood)|6_sum(Woodlawn)|7_sum(Hyde_Park)|7_sum(Kenwood)|7_sum(Woodlawn)|8_sum(Hyde_Park)|8_sum(Kenwood)|8_sum(Woodlawn)|10_sum(Hyde_Park)|10_sum(Kenwood)|10_sum(Woodlawn)|12_sum(Hyde_Park)|12_sum(Kenwood)|12_sum(Woodlawn)|15_sum(Hyde_Park)|15_sum(Kenwood)|15_sum(Woodlawn)|16_sum(Hyde_Park)|16_sum(Kenwood)|16_sum(Woodlawn)|17_sum(Hyde_Park)|17_sum(Kenwood)|17_sum(Woodlawn)|19_sum(Hyde_Park)|19_sum(Kenwood)|19_sum(Woodlawn)|21_sum(Hyde_Park)|21_sum(Kenwood)|21_sum(Woodlawn)|22_sum(Hyde_Park)|22_sum(Kenwood)|22_sum(Woodlawn)|23_sum(Hyde_Park)|23_sum(Kenwood)|23_sum(Woodlawn)|24_sum(Hyde_Park)|24_sum(Kenwood)|24_sum(Woodlawn)|25_sum(Hyde_Park)|25_sum(Kenwood)|25_sum(Woodlawn)|27_sum(Hyde_Park)|27_sum(Kenwood)|27_sum(Woodlawn)|28_sum(Hyde_Park)|28_sum(Kenwood)|28_sum(Woodlawn)|29_sum(Hyde_Park)|29_sum(Kenwood)|29_sum(Woodlawn)|30_sum(Hyde_Park)|30_sum(Kenwood)|30_sum(Woodlawn)|31_sum(Hyde_Park)|31_sum(Kenwood)|31_sum(Woodlawn)|32_sum(Hyde_Park)|32_sum(Kenwood)|32_sum(Woodlawn)|33_sum(Hyde_Park)|33_sum(
Kenwood)|33_sum(Woodlawn)|34_sum(Hyde_Park)|34_sum(Kenwood)|34_sum(Woodlawn)|36_sum(Hyde_Park)|36_sum(Kenwood)|36_sum(Woodlawn)|38_sum(Hyde_Park)|38_sum(Kenwood)|38_sum(Woodlawn)|39_sum(Hyde_Park)|39_sum(Kenwood)|39_sum(Woodlawn)|40_sum(Hyde_Park)|40_sum(Kenwood)|40_sum(Woodlawn)|41_sum(Hyde_Park)|41_sum(Kenwood)|41_sum(Woodlawn)|42_sum(Hyde_Park)|42_sum(Kenwood)|42_sum(Woodlawn)|43_sum(Hyde_Park)|43_sum(Kenwood)|43_sum(Woodlawn)|44_sum(Hyde_Park)|44_sum(Kenwood)|44_sum(Woodlawn)|46_sum(Hyde_Park)|46_sum(Kenwood)|46_sum(Woodlawn)|50_sum(Hyde_Park)|50_sum(Kenwood)|50_sum(Woodlawn)|54_sum(Hyde_Park)|54_sum(Kenwood)|54_sum(Woodlawn)|58_sum(Hyde_Park)|58_sum(Kenwood)|58_sum(Woodlawn)|61_sum(Hyde_Park)|61_sum(Kenwood)|61_sum(Woodlawn)|63_sum(Hyde_Park)|63_sum(Kenwood)|63_sum(Woodlawn)|66_sum(Hyde_Park)|66_sum(Kenwood)|66_sum(Woodlawn)|69_sum(Hyde_Park)|69_sum(Kenwood)|69_sum(Woodlawn)|71_sum(Hyde_Park)|71_sum(Kenwood)|71_sum(Woodlawn)|74_sum(Hyde_Park)|74_sum(Kenwood)|74_sum(Woodlawn)|75_sum(Hyde_Park)|75_sum(Kenwood)|75_sum(Woodlawn)|76_sum(Hyde_Park)|76_sum(Kenwood)|76_sum(Woodlawn)|77_sum(Hyde_Park)|77_sum(Kenwood)|77_sum(Woodlawn)|\n", + 
"+---+-----+----+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+
---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+\n", + "| 2| 7|2021| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "| 6| 1|2022| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 
null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null|\n", + "| 7| 1|2022| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "| 3| 5|2023| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 1| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "| 1| 8|2022| null| null| null| null| null| null| null| null| null| 
null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null|\n", + "| 5| 5|2021| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "| 6| 6|2021| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| 
null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "| 5| 7|2019| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "| 7| 8|2021| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 
null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "| 4| 2|2023| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "| 3| 11|2022| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 
null| null| null| null| null| null| null| null| null| null|\n", + "| 2| 2|2021| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "| 7| 5|2022| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "| 5| 1|2023| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| 
null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "| 6| 4|2019| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "| 6| 2|2019| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 
null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "| 1| 10|2022| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "| 3| 1|2022| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 
null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "| 2| 6|2021| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + "| 7| 9|2022| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", + 
"+---+-----+----+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+----------------+--------------+---------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+
---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-----------------+---------------+----------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "## I'm doing something wrong here- double check. \n", + "# Area identifiers\n", + "hyde_park_id = 41\n", + "kenwood_id = 39\n", + "woodlawn_id = 42\n", "\n", - "# get only the columns needed for the model\n", - "selected_columns = [\"pickup_area\",\"dropoff_area\",\"day\",\"month\",\"year\",\"ID\"]\n", - "sample_selected = sample_df.select(selected_columns)\n", + "# Adding binary columns for each area\n", + "sample_df = sample_df.withColumn('Hyde_Park', F.when((F.col('pickup_area') == hyde_park_id) | (F.col('dropoff_area') == hyde_park_id), 1).otherwise(0))\n", + "sample_df = sample_df.withColumn('Kenwood', F.when((F.col('pickup_area') == kenwood_id) | (F.col('dropoff_area') == kenwood_id), 1).otherwise(0))\n", + "sample_df = sample_df.withColumn('Woodlawn', F.when((F.col('pickup_area') == woodlawn_id) | (F.col('dropoff_area') == woodlawn_id), 1).otherwise(0))\n", "\n", + "# Pivot the DataFrame\n", + "pivoted_df = sample_df.groupBy(\"day\", \"month\", \"year\").pivot(\"pickup_area\").sum(\"Hyde_Park\", \"Kenwood\", \"Woodlawn\")\n", "\n", - "# group the rideshare data by day and community area and create counts\n", - "sample_df = sample_df.groupby('day',\"month\",\"year\",'pickup_area','dropoff_area').agg({'ID':'count'})\n", - "sample_df.show(5)" + "# Show the results\n", + "pivoted_df.show()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "id": "3a65e426-7aef-4891-942d-538025cd845e", "metadata": {}, - "outputs": [], + 
"outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "ename": "AnalysisException", + "evalue": "Cannot resolve column name \"count\" among (ID, start_timestamp, end_timestamp, seconds, miles, pickup_tract, dropoff_tract, pickup_area, dropoff_area, Fare, Tip, total, pickup_lat, pickup_lon, dropoff_lat, dropoff_lon, month, day_of_month, hour, day, year, Hyde_Park, Kenwood, Woodlawn)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAnalysisException\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[44], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# the output of the sample df above looks off. investigate\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# pivot so that each area is a column\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# should probably create a new variable that denotes in program rides, and figure out what combination of pickup or dropoff area we want to u\u001b[39;00m\n\u001b[0;32m----> 5\u001b[0m pivoted_df \u001b[38;5;241m=\u001b[39m 
\u001b[43msample_df\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroupBy\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mday\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmonth\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43myear\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpivot\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdropoff_area\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcount\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfirst\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/lib/spark/python/pyspark/sql/group.py:114\u001b[0m, in \u001b[0;36mGroupedData.agg\u001b[0;34m(self, *exprs)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m exprs, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mexprs should not be empty\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(exprs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(exprs[\u001b[38;5;241m0\u001b[39m], \u001b[38;5;28mdict\u001b[39m):\n\u001b[0;32m--> 114\u001b[0m jdf \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jgd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexprs\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 116\u001b[0m \u001b[38;5;66;03m# Columns\u001b[39;00m\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mall\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(c, Column) \u001b[38;5;28;01mfor\u001b[39;00m c \u001b[38;5;129;01min\u001b[39;00m exprs), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mall exprs should be Column\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m/usr/lib/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py:1304\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1298\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1299\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1300\u001b[0m args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1301\u001b[0m proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[1;32m 1303\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client\u001b[38;5;241m.\u001b[39msend_command(command)\n\u001b[0;32m-> 1304\u001b[0m return_value \u001b[38;5;241m=\u001b[39m \u001b[43mget_return_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1305\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgateway_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1307\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n\u001b[1;32m 1308\u001b[0m temp_arg\u001b[38;5;241m.\u001b[39m_detach()\n", + "File \u001b[0;32m/usr/lib/spark/python/pyspark/sql/utils.py:117\u001b[0m, in \u001b[0;36mcapture_sql_exception..deco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 113\u001b[0m converted \u001b[38;5;241m=\u001b[39m convert_exception(e\u001b[38;5;241m.\u001b[39mjava_exception)\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(converted, UnknownException):\n\u001b[1;32m 115\u001b[0m \u001b[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;66;03m# JVM exception message.\u001b[39;00m\n\u001b[0;32m--> 117\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m converted \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 119\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n", + "\u001b[0;31mAnalysisException\u001b[0m: Cannot resolve column name \"count\" among (ID, start_timestamp, end_timestamp, seconds, miles, pickup_tract, dropoff_tract, pickup_area, dropoff_area, Fare, Tip, total, pickup_lat, pickup_lon, dropoff_lat, dropoff_lon, month, day_of_month, hour, day, year, Hyde_Park, Kenwood, Woodlawn)" + ] + } + ], "source": [ "# the output of the sample df above looks off. 
investigate\n", "\n", @@ -283,6 +1871,77 @@ "source": [ "# read in weather data, merge with rideshare data" ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "b501c963-15c8-4341-b9c6-7d2f07cc5015", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- name: string (nullable = true)\n", + " |-- datetime: string (nullable = true)\n", + " |-- tempmax: double (nullable = true)\n", + " |-- tempmin: double (nullable = true)\n", + " |-- temp: double (nullable = true)\n", + " |-- feelslikemax: double (nullable = true)\n", + " |-- feelslikemin: double (nullable = true)\n", + " |-- feelslike: double (nullable = true)\n", + " |-- dew: double (nullable = true)\n", + " |-- humidity: double (nullable = true)\n", + " |-- precip: double (nullable = true)\n", + " |-- precipprob: integer (nullable = true)\n", + " |-- precipcover: double (nullable = true)\n", + " |-- preciptype: string (nullable = true)\n", + " |-- snow: double (nullable = true)\n", + " |-- snowdepth: double (nullable = true)\n", + " |-- windgust: double (nullable = true)\n", + " |-- windspeed: double (nullable = true)\n", + " |-- winddir: double (nullable = true)\n", + " |-- sealevelpressure: double (nullable = true)\n", + " |-- cloudcover: double (nullable = true)\n", + " |-- visibility: double (nullable = true)\n", + " |-- solarradiation: double (nullable = true)\n", + " |-- solarenergy: double (nullable = true)\n", + " |-- uvindex: integer (nullable = true)\n", + " |-- severerisk: string (nullable = true)\n", + " |-- sunrise: timestamp (nullable = true)\n", + " |-- sunset: timestamp (nullable = true)\n", + " |-- moonphase: double (nullable = true)\n", + " |-- conditions: string (nullable = true)\n", + " |-- description: string (nullable = true)\n", + " |-- icon: string (nullable = true)\n", + " |-- stations: string (nullable = true)\n", + "\n" + ] + } + ], 
+ "source": [ + "df_weather_1 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2018-01-01 to 2020-01-01.csv\", inferSchema=True, header=True)\n", + "df_weather_2 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2020-01-01 to 2022-08-31.csv\", inferSchema=True, header=True)\n", + "df_weather_3 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2022-09-01 to 2022-12-31.csv\", inferSchema=True, header=True)\n", + "df_weather = df_weather_1.union(df_weather_2).union(df_weather_3)\n", + "df_weather.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2d5f67e-1122-4208-b82a-b7fc8ef7f04d", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {