diff --git a/supervised_ml.ipynb b/supervised_ml.ipynb index 78a1715..3366a36 100644 --- a/supervised_ml.ipynb +++ b/supervised_ml.ipynb @@ -26,15 +26,13 @@ " ('spark.eventLog.enabled', 'true'),\n", " ('spark.submit.pyFiles',\n", " '/root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,/root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,/root/.ivy2/jars/com.typesafe_config-1.4.2.jar,/root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,/root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,/root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,/root/.ivy2/jars/com.navigamez_greex-1.0.jar,/root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,/root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,/root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,/root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,/root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,/root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,/root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,/root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,/root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,/root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,/root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,/root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,/root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,/root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,/root/.ivy2
/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,/root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,/root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,/root/.ivy2/jars/com.google.api_gax-2.20.1.jar,/root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,/root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,/root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,/root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,/root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,/root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,/root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,/root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,/root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,/root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,/root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,/root/.ivy2/jars/com.google.api.grpc
_grpc-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,/root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,/root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,/root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,/root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,/root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,/root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,/root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,/root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,/root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,/root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", + " ('spark.driver.port', '41723'),\n", " ('spark.dataproc.sql.joinConditionReorder.enabled', 'true'),\n", " ('spark.kryoserializer.buffer.max', '2000M'),\n", - " ('spark.driver.port', '42361'),\n", " ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),\n", " ('spark.dataproc.sql.local.rank.pushdown.enabled', 'true'),\n", " ('spark.driver.maxResultSize', '0'),\n", " ('spark.yarn.unmanagedAM.enabled', 'true'),\n", - " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',\n", - " 'http://hub-msca-bdp-dphub-students-harshpachisia-m:8088/proxy/application_1700673289776_0003'),\n", " ('spark.ui.filters',\n", " 'org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter'),\n", " ('spark.metrics.namespace',\n", @@ -52,19 +50,19 @@ " ('spark.yarn.am.attemptFailuresValidityInterval', '1h'),\n", " ('spark.app.name', 'Spark Updated Conf'),\n", " ('spark.sql.catalogImplementation', 'hive'),\n", - " ('spark.eventLog.dir',\n", - " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/82d9ba70-b4ec-4813-be2a-b9d68f92ad04/spark-job-history'),\n", " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',\n", " 'hub-msca-bdp-dphub-students-harshpachisia-m'),\n", " ('spark.executorEnv.OPENBLAS_NUM_THREADS', '1'),\n", " 
('spark.yarn.secondary.jars',\n", " 'com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,com.typesafe_config-1.4.2.jar,org.rocksdb_rocksdbjni-6.29.5.jar,com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,com.github.universal-automata_liblevenshtein-3.0.0.jar,com.google.cloud_google-cloud-storage-2.16.0.jar,com.navigamez_greex-1.0.jar,com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,it.unimi.dsi_fastutil-7.0.12.jar,org.projectlombok_lombok-1.16.8.jar,com.google.guava_guava-31.1-jre.jar,com.google.guava_failureaccess-1.0.1.jar,com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,com.google.errorprone_error_prone_annotations-2.16.jar,com.google.j2objc_j2objc-annotations-1.3.jar,com.google.http-client_google-http-client-1.42.3.jar,io.opencensus_opencensus-contrib-http-util-0.31.1.jar,com.google.http-client_google-http-client-jackson2-1.42.3.jar,com.google.http-client_google-http-client-gson-1.42.3.jar,com.google.api-client_google-api-client-2.1.1.jar,commons-codec_commons-codec-1.15.jar,com.google.oauth-client_google-oauth-client-1.34.1.jar,com.google.http-client_google-http-client-apache-v2-1.42.3.jar,com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,com.google.code.gson_gson-2.10.jar,com.google.cloud_google-cloud-core-2.9.0.jar,com.google.auto.value_auto-value-annotations-1.10.1.jar,com.google.cloud_google-cloud-core-http-2.9.0.jar,com.google.http-client_google-http-client-appengine-1.42.3.jar,com.google.api_gax-httpjson-0.105.1.jar,com.google.cloud_google-cloud-core-grpc-2.9.0.jar,io.grpc_grpc-core-1.51.0.jar,com.google.api_gax-2.20.1.jar,com.google.api_gax-grpc-2.20.1.jar,io.grpc_grpc-alts-1.51.0.jar,io.grpc_grpc-grpclb-1.51.0.jar,org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,io.grpc_grpc-protobuf-1.51.0.jar,com.google.auth_google-auth-library-credentials-1.13.0.jar,com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,com.google.api_api-common-2.2.2.jar,javax.annotation_ja
vax.annotation-api-1.3.2.jar,io.opencensus_opencensus-api-0.31.1.jar,io.grpc_grpc-context-1.51.0.jar,com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,com.google.protobuf_protobuf-java-3.21.10.jar,com.google.protobuf_protobuf-java-util-3.21.10.jar,com.google.api.grpc_proto-google-common-protos-2.11.0.jar,org.threeten_threetenbp-1.6.4.jar,com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,com.fasterxml.jackson.core_jackson-core-2.14.1.jar,com.google.code.findbugs_jsr305-3.0.2.jar,io.grpc_grpc-api-1.51.0.jar,io.grpc_grpc-auth-1.51.0.jar,io.grpc_grpc-stub-1.51.0.jar,org.checkerframework_checker-qual-3.28.0.jar,com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,io.grpc_grpc-protobuf-lite-1.51.0.jar,com.google.android_annotations-4.1.1.4.jar,org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,io.grpc_grpc-netty-shaded-1.51.0.jar,io.perfmark_perfmark-api-0.26.0.jar,io.grpc_grpc-googleapis-1.51.0.jar,io.grpc_grpc-xds-1.51.0.jar,io.opencensus_opencensus-proto-0.2.0.jar,io.grpc_grpc-services-1.51.0.jar,com.google.re2j_re2j-1.6.jar,dk.brics.automaton_automaton-1.11-8.jar,org.slf4j_slf4j-api-1.7.16.jar'),\n", - " ('spark.driver.appUIAddress',\n", - " 'http://hub-msca-bdp-dphub-students-harshpachisia-m.c.msca-bdp-student-ap.internal:33471'),\n", " ('spark.repl.local.jars',\n", " 
'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-val
ue-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grp
c_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", - " ('spark.app.startTime', '1700678618909'),\n", " ('spark.sql.cbo.enabled', 'true'),\n", + " ('spark.app.startTime', '1700688666130'),\n", + " ('spark.history.fs.logDirectory',\n", + " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/746d6c22-b8c9-4995-8216-9ead3917af24/spark-job-history'),\n", + " ('spark.driver.appUIAddress',\n", + " 'http://hub-msca-bdp-dphub-students-harshpachisia-m.c.msca-bdp-student-ap.internal:40715'),\n", " ('spark.executorEnv.PYTHONPATH',\n", " 
'/usr/lib/spark/python/lib/py4j-0.10.9-src.zip:/usr/lib/spark/python/:{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip{{PWD}}/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar{{PWD}}/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar{{PWD}}/com.typesafe_config-1.4.2.jar{{PWD}}/org.rocksdb_rocksdbjni-6.29.5.jar{{PWD}}/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar{{PWD}}/com.github.universal-automata_liblevenshtein-3.0.0.jar{{PWD}}/com.google.cloud_google-cloud-storage-2.16.0.jar{{PWD}}/com.navigamez_greex-1.0.jar{{PWD}}/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar{{PWD}}/it.unimi.dsi_fastutil-7.0.12.jar{{PWD}}/org.projectlombok_lombok-1.16.8.jar{{PWD}}/com.google.guava_guava-31.1-jre.jar{{PWD}}/com.google.guava_failureaccess-1.0.1.jar{{PWD}}/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar{{PWD}}/com.google.errorprone_error_prone_annotations-2.16.jar{{PWD}}/com.google.j2objc_j2objc-annotations-1.3.jar{{PWD}}/com.google.http-client_google-http-client-1.42.3.jar{{PWD}}/io.opencensus_opencensus-contrib-http-util-0.31.1.jar{{PWD}}/com.google.http-client_google-http-client-jackson2-1.42.3.jar{{PWD}}/com.google.http-client_google-http-client-gson-1.42.3.jar{{PWD}}/com.google.api-client_google-api-client-2.1.1.jar{{PWD}}/commons-codec_commons-codec-1.15.jar{{PWD}}/com.google.oauth-client_google-oauth-client-1.34.1.jar{{PWD}}/com.google.http-client_google-http-client-apache-v2-1.42.3.jar{{PWD}}/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar{{PWD}}/com.google.code.gson_gson-2.10.jar{{PWD}}/com.google.cloud_google-cloud-core-2.9.0.jar{{PWD}}/com.google.auto.value_auto-value-annotations-1.10.1.jar{{PWD}}/com.google.cloud_google-cloud-core-http-2.9.0.jar{{PWD}}/com.google.http-client_google-http-client-appengine-1.42.3.jar{{PWD}}/com.google.api_gax-httpjson-0.105.1.jar{{PWD}}/com.google.cloud_google-cloud-core-grpc-2.9.0.jar{{PWD}}/io.grpc_grpc-core-1.51.0.jar{{PWD}}/com.google.api_gax-2.20.1.jar{{PWD}}/com.google.api_gax-grpc
-2.20.1.jar{{PWD}}/io.grpc_grpc-alts-1.51.0.jar{{PWD}}/io.grpc_grpc-grpclb-1.51.0.jar{{PWD}}/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar{{PWD}}/io.grpc_grpc-protobuf-1.51.0.jar{{PWD}}/com.google.auth_google-auth-library-credentials-1.13.0.jar{{PWD}}/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar{{PWD}}/com.google.api_api-common-2.2.2.jar{{PWD}}/javax.annotation_javax.annotation-api-1.3.2.jar{{PWD}}/io.opencensus_opencensus-api-0.31.1.jar{{PWD}}/io.grpc_grpc-context-1.51.0.jar{{PWD}}/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar{{PWD}}/com.google.protobuf_protobuf-java-3.21.10.jar{{PWD}}/com.google.protobuf_protobuf-java-util-3.21.10.jar{{PWD}}/com.google.api.grpc_proto-google-common-protos-2.11.0.jar{{PWD}}/org.threeten_threetenbp-1.6.4.jar{{PWD}}/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.fasterxml.jackson.core_jackson-core-2.14.1.jar{{PWD}}/com.google.code.findbugs_jsr305-3.0.2.jar{{PWD}}/io.grpc_grpc-api-1.51.0.jar{{PWD}}/io.grpc_grpc-auth-1.51.0.jar{{PWD}}/io.grpc_grpc-stub-1.51.0.jar{{PWD}}/org.checkerframework_checker-qual-3.28.0.jar{{PWD}}/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar{{PWD}}/io.grpc_grpc-protobuf-lite-1.51.0.jar{{PWD}}/com.google.android_annotations-4.1.1.4.jar{{PWD}}/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar{{PWD}}/io.grpc_grpc-netty-shaded-1.51.0.jar{{PWD}}/io.perfmark_perfmark-api-0.26.0.jar{{PWD}}/io.grpc_grpc-googleapis-1.51.0.jar{{PWD}}/io.grpc_grpc-xds-1.51.0.jar{{PWD}}/io.opencensus_opencensus-proto-0.2.0.jar{{PWD}}/io.grpc_grpc-services-1.51.0.jar{{PWD}}/com.google.re2j_re2j-1.6.jar{{PWD}}/dk.brics.automaton_automaton-1.11-8.jar{{PWD}}/org.slf4j_slf4j-api-1.7.16.jar'),\n", " ('spark.driver.host',\n", @@ -76,7 +74,6 @@ " ('spark.sql.warehouse.dir', 'file:/spark-warehouse'),\n", " 
('spark.yarn.executor.failuresValidityInterval', '1h'),\n", " ('spark.yarn.am.memory', '640m'),\n", - " ('spark.app.id', 'application_1700673289776_0003'),\n", " ('spark.cores.max', '4'),\n", " ('spark.executor.cores', '4'),\n", " ('spark.jars.packages',\n", @@ -89,20 +86,23 @@ " ('spark.submit.deployMode', 'client'),\n", " ('spark.sql.cbo.joinReorder.enabled', 'true'),\n", " ('spark.shuffle.service.enabled', 'true'),\n", + " ('spark.app.id', 'application_1700688540052_0001'),\n", " ('spark.scheduler.mode', 'FAIR'),\n", + " ('spark.ui.proxyBase', '/proxy/application_1700688540052_0001'),\n", " ('spark.sql.adaptive.enabled', 'true'),\n", " ('spark.yarn.jars', 'local:/usr/lib/spark/jars/*'),\n", " ('spark.scheduler.minRegisteredResourcesRatio', '0.0'),\n", - " ('spark.history.fs.logDirectory',\n", - " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/82d9ba70-b4ec-4813-be2a-b9d68f92ad04/spark-job-history'),\n", " ('spark.master', 'yarn'),\n", " ('spark.ui.port', '0'),\n", + " ('spark.eventLog.dir',\n", + " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/746d6c22-b8c9-4995-8216-9ead3917af24/spark-job-history'),\n", " ('spark.rpc.message.maxSize', '512'),\n", " ('spark.rdd.compress', 'True'),\n", " ('spark.dataproc.metrics.listener.metrics.collector.hostname',\n", " 'hub-msca-bdp-dphub-students-harshpachisia-m'),\n", - " ('spark.ui.proxyBase', '/proxy/application_1700673289776_0003'),\n", " ('spark.task.maxFailures', '10'),\n", + " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',\n", + " 'http://hub-msca-bdp-dphub-students-harshpachisia-m:8088/proxy/application_1700688540052_0001'),\n", " ('spark.yarn.isPython', 'true'),\n", " ('spark.dynamicAllocation.enabled', 'true'),\n", " ('spark.ui.showConsoleProgress', 'true')]" @@ -140,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "8bfe115e-abb4-4a36-8508-1bd17ce2c55c", "metadata": {}, "outputs": [ @@ -1429,7 +1429,7 @@ }, { 
"cell_type": "code", - "execution_count": 7, + "execution_count": 4, "id": "9c7c7fa9-7a39-46eb-93fd-c7006d01c03e", "metadata": {}, "outputs": [], @@ -1495,6 +1495,7 @@ "3. merge with daily weather data\n", "4. separate out y (counts for every day in program area) and X (column of counts for each community area outside of the program area)\n", "5. filter for pre-program rides.\n", + "- Research which model works best and which one is the most parallelizable\n", "6. create supervised model on all that data\n", "7. predict the next month or so of counts after sept 29 2021\n", "8. Graph predictions versus reality\n", @@ -1503,7 +1504,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 5, "id": "540c7bff-6eac-40c2-a9fe-9b6843f7d546", "metadata": {}, "outputs": [], @@ -1622,7 +1623,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 6, "id": "14b0b19a-36ad-4e17-bf30-1e9fcdaea452", "metadata": {}, "outputs": [], @@ -1634,24 +1635,10 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 7, "id": "388446e2-2cd4-4bb8-bfa0-f204b1359427", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "root\n", - " |-- day: integer (nullable = true)\n", - " |-- month: integer (nullable = true)\n", - " |-- year: integer (nullable = true)\n", - " |-- area: integer (nullable = true)\n", - " |-- dropoff_count: long (nullable = false)\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# Calculate daily counts for dropoff areas\n", "dropoff_counts = sample_df.groupby('day', 'month', 'year', 'dropoff_area').count().withColumnRenamed('count', 'dropoff_count')\n", @@ -1661,38 +1648,19 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 8, "id": "3e028095-2ff1-4e5f-aa03-7dafe417168e", "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - 
"+---+-----+----+----+-----------------+------------------+\n", - "|day|month|year|area|sum(pickup_count)|sum(dropoff_count)|\n", - "+---+-----+----+----+-----------------+------------------+\n", - "| 6| 4|2019| 76| 0| 1|\n", - "| 2| 6|2022| 49| 1| 0|\n", - "| 1| 11|2018| 7| 1| 0|\n", - "| 6| 4|2019| 22| 0| 1|\n", - "| 7| 3|2021| 7| 0| 1|\n", - "| 5| 9|2022| 32| 0| 1|\n", - "| 6| 5|2022| 6| 1| 0|\n", - "| 3| 9|2021| 33| 0| 1|\n", - "| 7| 2|2021| 77| 0| 1|\n", - "| 2| 7|2019| 29| 1| 0|\n", - "+---+-----+----+----+-----------------+------------------+\n", - "only showing top 10 rows\n", - "\n" - ] + "data": { + "text/plain": [ + "DataFrame[day: int, month: int, year: int, area: int, total_counts: bigint]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -1707,7 +1675,13 @@ "combined_df = pickup_counts.union(dropoff_counts)\n", "\n", "# Group by day, month, year, and area, summing up the counts\n", - "daily_counts_by_area = combined_df.groupby('day', 'month', 'year', 'area').sum('pickup_count', 'dropoff_count')" + "daily_counts_by_area = combined_df.groupby('day', 'month', 'year', 'area').sum('pickup_count', 'dropoff_count')\n", + "\n", + "# the relatively smaller numbers are mostly a result of the sample size, should be fine when we \n", + "# make it to the entire dataframe\n", + "daily_counts_by_area = daily_counts_by_area.withColumn('total_counts', F.col('sum(pickup_count)') + F.col('sum(dropoff_count)'))\n", + "daily_counts_by_area.drop('sum(pickup_count)','sum(dropoff_count)')\n", + "#daily_counts_by_area.show(10)" ] }, { @@ -1746,12 +1720,7 @@ ] } ], - "source": [ - "daily_counts_by_area = daily_counts_by_area.withColumn('total_counts', F.col('sum(pickup_count)') + F.col('sum(dropoff_count)'))\n", - "# the relatively smaller numbers are mostly a result of the sample size, should be fine when we \n", - "# make it to the entire dataframe\n", - "daily_counts_by_area.show(10)" - ] + "source": [] }, { 
"cell_type": "markdown", @@ -1763,7 +1732,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 9, "id": "d57f2088-9ca6-4cc8-a8fe-0cca835fecf9", "metadata": {}, "outputs": [ @@ -1771,6 +1740,7 @@ "name": "stderr", "output_type": "stream", "text": [ + "23/11/22 22:08:31 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n", " \r" ] }, @@ -1778,49 +1748,43 @@ "name": "stdout", "output_type": "stream", "textn", - "|day|month|year|1_sum(Hyde_Park)|1_sum(Kenwood)|1_sum(Woodlawn)|2_sum(Hyde_Park)|2_sum(Kenwood)|2_sum(Woodlawn)|3_sum(Hyde_Park)|3_sum(Kenwood)|3_sum(Woodlawn)|4_sum(Hyde_Park)|4_sum(Kenwood)|4_sum(Woodlawn)|5_sum(Hyde_Park)|5_sum(Kenwood)|5_sum(Woodlawn)|6_sum(Hyde_Park)|6_sum(Kenwood)|6_sum(Woodlawn)|7_sum(Hyde_Park)|7_sum(Kenwood)|7_sum(Woodlawn)|8_sum(Hyde_Park)|8_sum(Kenwood)|8_sum(Woodlawn)|10_sum(Hyde_Park)|10_sum(Kenwood)|10_sum(Woodlawn)|12_sum(Hyde_Park)|12_sum(Kenwood)|12_sum(Woodlawn)|15_sum(Hyde_Park)|15_sum(Kenwood)|15_sum(Woodlawn)|16_sum(Hyde_Park)|16_sum(Kenwood)|16_sum(Woodlawn)|17_sum(Hyde_Park)|17_sum(Kenwood)|17_sum(Woodlawn)|19_sum(Hyde_Park)|19_sum(Kenwood)|19_sum(Woodlawn)|21_sum(Hyde_Park)|21_sum(Kenwood)|21_sum(Woodlawn)|22_sum(Hyde_Park)|22_sum(Kenwood)|22_sum(Woodlawn)|23_sum(Hyde_Park)|23_sum(Kenwood)|23_sum(Woodlawn)|24_sum(Hyde_Park)|24_sum(Kenwood)|24_sum(Woodlawn)|25_sum(Hyde_Park)|25_sum(Kenwood)|25_sum(Woodlawn)|27_sum(Hyde_Park)|27_sum(Kenwood)|27_sum(Woodlawn)|28_sum(Hyde_Park)|28_sum(Kenwood)|28_sum(Woodlawn)|29_sum(Hyde_Park)|29_sum(Kenwood)|29_sum(Woodlawn)|30_sum(Hyde_Park)|30_sum(Kenwood)|30_sum(Woodlawn)|31_sum(Hyde_Park)|31_sum(Kenwood)|31_sum(Woodlawn)|32_sum(Hyde_Park)|32_sum(Kenwood)|32_sum(Woodlawn)|33_sum(Hyde_Park)|33_sum(Kenwood)|33_sum(Woodlawn)|34_sum(Hyde_Park)|34_sum(Kenwood)|34_sum(Woodlawn)|36_sum(Hyde_Park)|36_sum(Kenwood)|36_sum(Woodla
wn)|38_sum(Hyde_Park)|38_sum(Kenwood)|38_sum(Woodlawn)|39_sum(Hyde_Park)|39_sum(Kenwood)|39_sum(Woodlawn)|40_sum(Hyde_Park)|40_sum(Kenwood)|40_sum(Woodlawn)|41_sum(Hyde_Park)|41_sum(Kenwood)|41_sum(Woodlawn)|42_sum(Hyde_Park)|42_sum(Kenwood)|42_sum(Woodlawn)|43_sum(Hyde_Park)|43_sum(Kenwood)|43_sum(Woodlawn)|44_sum(Hyde_Park)|44_sum(Kenwood)|44_sum(Woodlawn)|46_sum(Hyde_Park)|46_sum(Kenwood)|46_sum(Woodlawn)|50_sum(Hyde_Park)|50_sum(Kenwood)|50_sum(Woodlawn)|54_sum(Hyde_Park)|54_sum(Kenwood)|54_sum(Woodlawn)|58_sum(Hyde_Park)|58_sum(Kenwood)|58_sum(Woodlawn)|61_sum(Hyde_Park)|61_sum(Kenwood)|61_sum(Woodlawn)|63_sum(Hyde_Park)|63_sum(Kenwood)|63_sum(Woodlawn)|66_sum(Hyde_Park)|66_sum(Kenwood)|66_sum(Woodlawn)|69_sum(Hyde_Park)|69_sum(Kenwood)|69_sum(Woodlawn)|71_sum(Hyde_Park)|71_sum(Kenwood)|71_sum(Woodlawn)|74_sum(Hyde_Park)|74_sum(Kenwood)|74_sum(Woodlawn)|75_sum(Hyde_Park)|75_sum(Kenwood)|75_sum(Woodlawn)|76_sum(Hyde_Park)|76_sum(Kenwood)|76_sum(Woodlawn)|77_sum(Hyde_Park)|77_sum(Kenwood)|77_sum(Woodlawn)|\nn", - "| 2| 7|2021| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", - "| 6| 1|2022| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| 
null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null|\n", - "| 7| 1|2022| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", - "| 3| 5|2023| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 
null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 1| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", - "| 1| 8|2022| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null|\n", - "| 5| 5|2021| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 
null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", - "| 6| 6|2021| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", - "| 5| 7|2019| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 
null| null| null| null|\n", - "| 7| 8|2021| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", - "| 4| 2|2023| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null|\n", - "| 3| 11|2022| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 
null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", - "| 2| 2|2021| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", - "| 7| 5|2022| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 
null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", - "| 5| 1|2023| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", - "| 6| 4|2019| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 
null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", - "| 6| 2|2019| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", - "| 1| 10|2022| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", - "| 3| 1|2022| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| 
null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", - "| 2| 6|2021| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\n", - "| 7| 9|2022| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 0| 0| 0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| 
null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|\nn", + "+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+\n", + "|day|month|year| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 13| 14| 16| 19| 20| 21| 22| 23| 24| 25| 26| 27| 28| 29| 30| 31| 32| 33| 34| 35| 36| 38| 39| 41| 42| 43| 44| 46| 50| 52| 56| 59| 60| 61| 62| 63| 65| 66| 67| 68| 69| 70| 71| 72| 75| 76| 77|\n", + "+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+\n", + "| 2| 7|2021|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|\n", + "| 4| 12|2022|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 
1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|\n", + "| 5| 10|2021|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 3| 5|2023|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null| 1|null|null| 2|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 2| 12|2018|null|null|null|null|null|null|null|null|null|null|null|null| 1| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 5| 6|2023|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 2|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 5| 7|2019|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 6| 6|2021|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 3| 
11|2022|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 2|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|\n", + "| 4| 2|2023|null|null|null|null|null|null|null| 2|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 7| 5|2022|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 6| 1|2021|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 5| 1|2023|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 2|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 1| 1|2021|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|\n", + "| 6| 4|2019|null|null|null|null|null|null| 
2|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 5| 12|2021|null|null|null|null|null| 1| 1| 1|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 1| 12|2021|null|null|null|null|null|null| 1| 1|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 6| 2|2019|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 4| 4|2023|null|null|null|null|null|null|null| 2|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 2| 3|2019|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + 
"+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ - "## I'm doing something wrong here- double check. \n", - "# Area identifiers\n", - "hyde_park_id = 41\n", - "kenwood_id = 39\n", - "woodlawn_id = 42\n", + "#keep datetime as part of the pivot. \n", "\n", - "# Adding binary columns for each area\n", - "sample_df = sample_df.withColumn('Hyde_Park', F.when((F.col('pickup_area') == hyde_park_id) | (F.col('dropoff_area') == hyde_park_id), 1).otherwise(0))\n", - "sample_df = sample_df.withColumn('Kenwood', F.when((F.col('pickup_area') == kenwood_id) | (F.col('dropoff_area') == kenwood_id), 1).otherwise(0))\n", - "sample_df = sample_df.withColumn('Woodlawn', F.when((F.col('pickup_area') == woodlawn_id) | (F.col('dropoff_area') == woodlawn_id), 1).otherwise(0))\n", + "# pivot so that each community area is a column\n", + "# one row for each day, each column represents a community area (with its entry being daily count of rides for that area).\n", "\n", "# Pivot the DataFrame\n", - "pivoted_df = sample_df.groupBy(\"day\", \"month\", \"year\").pivot(\"pickup_area\").sum(\"Hyde_Park\", \"Kenwood\", \"Woodlawn\")\n", + "pivoted_df = daily_counts_by_area.groupBy(\"day\", \"month\", \"year\").pivot(\"area\").sum(\"total_counts\")\n", "\n", "# Show the results\n", "pivoted_df.show()" @@ -1857,9 +1821,7 @@ "source": [ "# the output of the sample df above looks off. 
investigate\n", "\n", - "# pivot so that each area is a column\n", - "# should probably create a new variable that denotes in program rides, and figure out what combination of pickup or dropoff area we want to u\n", - "pivoted_df = sample_df.groupBy(\"day\",\"month\",\"year\").pivot(\"dropoff_area\").agg({\"count\": \"first\"})" + "#pivoted_df = sample_df.groupBy(\"day\",\"month\",\"year\").pivot(\"dropoff_area\").agg({\"count\": \"first\"})" ] }, { @@ -1874,7 +1836,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 11, "id": "b501c963-15c8-4341-b9c6-7d2f07cc5015", "metadata": {}, "outputs": [ @@ -1884,55 +1846,44 @@ "text": [ " \r" ] - }, + } + ], + "source": [ + "df_weather_1 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2018-01-01 to 2020-01-01.csv\", inferSchema=True, header=True)\n", + "df_weather_2 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2020-01-01 to 2022-08-31.csv\", inferSchema=True, header=True)\n", + "df_weather_3 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2022-09-01 to 2022-12-31.csv\", inferSchema=True, header=True)\n", + "# TODO: add 2023 data. Align the files by column name, and drop repeated dates: files 1 and 2 both name 2020-01-01 in their range (presumably inclusive - confirm), which would double that day when merging on datetime\n", + "df_weather = df_weather_1.unionByName(df_weather_2).unionByName(df_weather_3).dropDuplicates(['datetime'])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "2858a6da-ef1b-4561-b8e2-f93073b8e803", + "metadata": {}, + "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- name: string (nullable = true)\n", - " |-- datetime: string (nullable = true)\n", - " |-- tempmax: double (nullable = true)\n", - " |-- tempmin: double (nullable = true)\n", + " |-- datetime: date (nullable = true)\n", " |-- temp: double (nullable = true)\n", - " |-- feelslikemax: double (nullable = true)\n", - " |-- feelslikemin: double (nullable = true)\n", - " |-- feelslike: double (nullable = true)\n", - " |-- dew: double (nullable = true)\n", - " |-- humidity: double (nullable = true)\n",
" |-- precip: double (nullable = true)\n", - " |-- precipprob: integer (nullable = true)\n", - " |-- precipcover: double (nullable = true)\n", - " |-- preciptype: string (nullable = true)\n", " |-- snow: double (nullable = true)\n", " |-- snowdepth: double (nullable = true)\n", - " |-- windgust: double (nullable = true)\n", - " |-- windspeed: double (nullable = true)\n", - " |-- winddir: double (nullable = true)\n", - " |-- sealevelpressure: double (nullable = true)\n", - " |-- cloudcover: double (nullable = true)\n", - " |-- visibility: double (nullable = true)\n", - " |-- solarradiation: double (nullable = true)\n", - " |-- solarenergy: double (nullable = true)\n", - " |-- uvindex: integer (nullable = true)\n", - " |-- severerisk: string (nullable = true)\n", - " |-- sunrise: timestamp (nullable = true)\n", " |-- sunset: timestamp (nullable = true)\n", - " |-- moonphase: double (nullable = true)\n", - " |-- conditions: string (nullable = true)\n", - " |-- description: string (nullable = true)\n", - " |-- icon: string (nullable = true)\n", - " |-- stations: string (nullable = true)\n", "\n" ] } ], "source": [ - "df_weather_1 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2018-01-01 to 2020-01-01.csv\", inferSchema=True, header=True)\n", - "df_weather_2 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2020-01-01 to 2022-08-31.csv\", inferSchema=True, header=True)\n", - "df_weather_3 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2022-09-01 to 2022-12-31.csv\", inferSchema=True, header=True)\n", - "df_weather = df_weather_1.union(df_weather_2).union(df_weather_3)\n", - "df_weather.printSchema()" + "df_weather = df_weather.select('name', 'datetime', 'temp','precip','snow','snowdepth','sunset')\n", + "df_weather = df_weather.withColumn('datetime',F.to_date(df_weather['datetime'], \"yyyy-MM-dd\"))\n", + "df_weather.printSchema()\n", + "# name, datetime, temp, 
precip, snow, snowdepth, sunset.\n", + "# merge on datetime- keep datetime as part of the pivot. " ] }, { @@ -1940,6 +1891,31 @@ "execution_count": null, "id": "c2d5f67e-1122-4208-b82a-b7fc8ef7f04d", "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 66:===========> (470 + 10) / 534][Stage 67:===========> (469 + 10) / 534]\r" + ] + } + ], + "source": [ + "# Create a new column 'date_of_rides' from the year, month and day columns. They are not zero-padded (e.g. 2021-7-2), so parse with 'yyyy-M-d'; Spark 3 does not accept such values under 'yyyy-MM-dd'\n", + "pivoted_df_with_date = pivoted_df.withColumn('date_of_rides', F.to_date(\n", + " F.concat_ws('-', F.col('year'), F.col('month'), F.col('day')),\n", + " 'yyyy-M-d'\n", + "))\n", + "\n", + "# Show the DataFrame with the new date column. NOTE(review): 'day' only takes values 1-7 in the pivot output - confirm it is day-of-month, not day-of-week\n", + "pivoted_df_with_date.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa29201c-1d9b-493b-8378-558ae294f38e", + "metadata": {}, "outputs": [], "source": [] }