diff --git a/supervised_ml.ipynb b/supervised_ml.ipynb index 1943900..adc42d6 100644 --- a/supervised_ml.ipynb +++ b/supervised_ml.ipynb @@ -7,14 +7,14 @@ "source": [ "# Supervised ML\n", "\n", - "The goal of this model is to predict the ridership that occurs within the University of Chicago Lyft Program Area. We will do this by using as features the ridership counts of other Chicago community areas, as well as using weather. The labels are the daily ridership counts within the program area.\n", + "The goal of this model is to predict the ridership that occurs within the University of Chicago Lyft Program Area. We will do this by using as features the ridership counts of other Chicago community areas, as well as adding daily weather data and key weather variables that might affect ridership counts. The labels are the daily ridership counts within the program area.\n", "\n", - "We will create the model that functions up until the introduction of the University Lyft program and then look at the difference between the predictions and the actual ridership as a rough estimate of the effect of the program on rideshare usage in the area. We will do this by looking at both the change when the program was introduced, as well as when the program was reduced from 10 rides of up to 15 dollars each, to 7 rides up to 10 dollars. " + "We will create the model that functions up until the introduction of the University Lyft program and then look at the difference between the predictions and the actual ridership as a rough estimate of the effect of the program on rideshare usage in the area. We will do this by looking at the change at three points: when the program was initially introduced (free rides of up to 15 dollars each, only on weekends), when the program was expanded to cover all days (10 rides of up to 15 dollars each), and when the program was reduced to 7 rides of up to 10 dollars each in Summer 2023. 
" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "id": "414550d7-9d43-4f5c-8f75-4756974014af", "metadata": {}, "outputs": [ @@ -26,19 +26,22 @@ " ('spark.eventLog.enabled', 'true'),\n", " ('spark.submit.pyFiles',\n", " '/root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,/root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,/root/.ivy2/jars/com.typesafe_config-1.4.2.jar,/root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,/root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,/root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,/root/.ivy2/jars/com.navigamez_greex-1.0.jar,/root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,/root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,/root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,/root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,/root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,/root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,/root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,/root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,/root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,/root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,/root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,/root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,/root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,/root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0
.jar,/root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,/root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,/root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,/root/.ivy2/jars/com.google.api_gax-2.20.1.jar,/root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,/root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,/root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,/root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,/root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,/root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,/root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,/root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,/root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,/root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,/root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,/root/.ivy2/jars/com
.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,/root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,/root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,/root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,/root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,/root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,/root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,/root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,/root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,/root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,/root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", + " ('spark.history.fs.logDirectory',\n", + " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/9596200d-6e6e-4d74-a57e-00bf53fa6d0e/spark-job-history'),\n", " ('spark.driver.host',\n", " 'hub-msca-bdp-dphub-students-test-harshpachisia-m.c.msca-bdp-student-ap.internal'),\n", " ('spark.dataproc.sql.joinConditionReorder.enabled', 'true'),\n", " ('spark.sql.autoBroadcastJoinThreshold', '191m'),\n", + " ('spark.app.id', 'application_1701126718231_0001'),\n", " ('spark.kryoserializer.buffer.max', '2000M'),\n", " ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),\n", " ('spark.dataproc.sql.local.rank.pushdown.enabled', 'true'),\n", " ('spark.driver.maxResultSize', '0'),\n", " ('spark.yarn.unmanagedAM.enabled', 'true'),\n", + " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',\n", + " 'http://hub-msca-bdp-dphub-students-test-harshpachisia-m:8088/proxy/application_1701126718231_0001'),\n", " ('spark.ui.filters',\n", " 'org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter'),\n", - " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',\n", - " 'http://hub-msca-bdp-dphub-students-test-harshpachisia-m:8088/proxy/application_1701015020482_0001'),\n", " ('spark.metrics.namespace',\n", 
" 'app_name:${spark.app.name}.app_id:${spark.app.id}'),\n", " ('spark.executor.memory', '4g'),\n", @@ -47,7 +50,6 @@ " ('spark.executor.id', 'driver'),\n", " ('spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version', '2'),\n", " ('spark.dynamicAllocation.maxExecutors', '10000'),\n", - " ('spark.app.id', 'application_1701015020482_0001'),\n", " ('spark.yarn.dist.pyFiles',\n", " 'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-
oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2
/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", " ('spark.yarn.historyServer.address',\n", @@ -56,13 +58,13 @@ " ('spark.app.name', 'Spark Updated Conf'),\n", " ('spark.sql.catalogImplementation', 'hive'),\n", " ('spark.driver.appUIAddress',\n", - " 'http://hub-msca-bdp-dphub-students-test-harshpachisia-m.c.msca-bdp-student-ap.internal:40547'),\n", + " 'http://hub-msca-bdp-dphub-students-test-harshpachisia-m.c.msca-bdp-student-ap.internal:36141'),\n", + " ('spark.ui.proxyBase', '/proxy/application_1701126718231_0001'),\n", " ('spark.dataproc.metrics.listener.metrics.collector.hostname',\n", " 'hub-msca-bdp-dphub-students-test-harshpachisia-m'),\n", " ('spark.yarn.secondary.jars',\n", " 
'com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,com.typesafe_config-1.4.2.jar,org.rocksdb_rocksdbjni-6.29.5.jar,com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,com.github.universal-automata_liblevenshtein-3.0.0.jar,com.google.cloud_google-cloud-storage-2.16.0.jar,com.navigamez_greex-1.0.jar,com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,it.unimi.dsi_fastutil-7.0.12.jar,org.projectlombok_lombok-1.16.8.jar,com.google.guava_guava-31.1-jre.jar,com.google.guava_failureaccess-1.0.1.jar,com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,com.google.errorprone_error_prone_annotations-2.16.jar,com.google.j2objc_j2objc-annotations-1.3.jar,com.google.http-client_google-http-client-1.42.3.jar,io.opencensus_opencensus-contrib-http-util-0.31.1.jar,com.google.http-client_google-http-client-jackson2-1.42.3.jar,com.google.http-client_google-http-client-gson-1.42.3.jar,com.google.api-client_google-api-client-2.1.1.jar,commons-codec_commons-codec-1.15.jar,com.google.oauth-client_google-oauth-client-1.34.1.jar,com.google.http-client_google-http-client-apache-v2-1.42.3.jar,com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,com.google.code.gson_gson-2.10.jar,com.google.cloud_google-cloud-core-2.9.0.jar,com.google.auto.value_auto-value-annotations-1.10.1.jar,com.google.cloud_google-cloud-core-http-2.9.0.jar,com.google.http-client_google-http-client-appengine-1.42.3.jar,com.google.api_gax-httpjson-0.105.1.jar,com.google.cloud_google-cloud-core-grpc-2.9.0.jar,io.grpc_grpc-core-1.51.0.jar,com.google.api_gax-2.20.1.jar,com.google.api_gax-grpc-2.20.1.jar,io.grpc_grpc-alts-1.51.0.jar,io.grpc_grpc-grpclb-1.51.0.jar,org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,io.grpc_grpc-protobuf-1.51.0.jar,com.google.auth_google-auth-library-credentials-1.13.0.jar,com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,com.google.api_api-common-2.2.2.jar,javax.annotation_javax.annotation-api-1.3.2.jar,io.open
census_opencensus-api-0.31.1.jar,io.grpc_grpc-context-1.51.0.jar,com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,com.google.protobuf_protobuf-java-3.21.10.jar,com.google.protobuf_protobuf-java-util-3.21.10.jar,com.google.api.grpc_proto-google-common-protos-2.11.0.jar,org.threeten_threetenbp-1.6.4.jar,com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,com.fasterxml.jackson.core_jackson-core-2.14.1.jar,com.google.code.findbugs_jsr305-3.0.2.jar,io.grpc_grpc-api-1.51.0.jar,io.grpc_grpc-auth-1.51.0.jar,io.grpc_grpc-stub-1.51.0.jar,org.checkerframework_checker-qual-3.28.0.jar,com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,io.grpc_grpc-protobuf-lite-1.51.0.jar,com.google.android_annotations-4.1.1.4.jar,org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,io.grpc_grpc-netty-shaded-1.51.0.jar,io.perfmark_perfmark-api-0.26.0.jar,io.grpc_grpc-googleapis-1.51.0.jar,io.grpc_grpc-xds-1.51.0.jar,io.opencensus_opencensus-proto-0.2.0.jar,io.grpc_grpc-services-1.51.0.jar,com.google.re2j_re2j-1.6.jar,dk.brics.automaton_automaton-1.11-8.jar,org.slf4j_slf4j-api-1.7.16.jar'),\n", " ('spark.executorEnv.OPENBLAS_NUM_THREADS', '1'),\n", - " ('spark.driver.port', '41051'),\n", " ('spark.repl.local.jars',\n", " 
'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-val
ue-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grp
c_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", " ('spark.sql.cbo.enabled', 'true'),\n", @@ -74,9 +76,9 @@ " ('spark.driver.memory', '4g'),\n", " ('spark.sql.warehouse.dir', 'file:/spark-warehouse'),\n", " ('spark.yarn.executor.failuresValidityInterval', '1h'),\n", + " ('spark.app.startTime', '1701129773992'),\n", " ('spark.yarn.am.memory', '640m'),\n", - " ('spark.history.fs.logDirectory',\n", - " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/d5b80d11-0ab8-413d-9bcb-2259e97b7515/spark-job-history'),\n", + " ('spark.driver.port', '40637'),\n", " ('spark.cores.max', '4'),\n", " ('spark.executor.cores', '4'),\n", " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',\n", @@ -86,6 +88,8 @@ " ('spark.executor.instances', '2'),\n", " ('spark.dataproc.listeners',\n", " 'com.google.cloud.spark.performance.DataprocMetricsListener'),\n", + " ('spark.eventLog.dir',\n", + " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/9596200d-6e6e-4d74-a57e-00bf53fa6d0e/spark-job-history'),\n", " ('spark.serializer.objectStreamReset', '100'),\n", " 
('spark.submit.deployMode', 'client'),\n", " ('spark.sql.cbo.joinReorder.enabled', 'true'),\n", @@ -94,21 +98,17 @@ " ('spark.sql.adaptive.enabled', 'true'),\n", " ('spark.yarn.jars', 'local:/usr/lib/spark/jars/*'),\n", " ('spark.scheduler.minRegisteredResourcesRatio', '0.0'),\n", - " ('spark.eventLog.dir',\n", - " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/d5b80d11-0ab8-413d-9bcb-2259e97b7515/spark-job-history'),\n", - " ('spark.app.startTime', '1701015180695'),\n", " ('spark.master', 'yarn'),\n", " ('spark.ui.port', '0'),\n", " ('spark.rpc.message.maxSize', '512'),\n", " ('spark.rdd.compress', 'True'),\n", " ('spark.task.maxFailures', '10'),\n", - " ('spark.ui.proxyBase', '/proxy/application_1701015020482_0001'),\n", " ('spark.yarn.isPython', 'true'),\n", " ('spark.dynamicAllocation.enabled', 'true'),\n", " ('spark.ui.showConsoleProgress', 'true')]" ] }, - "execution_count": 3, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -117,7 +117,6 @@ "# read in packages create spark environment\n", "from pyspark.sql import SparkSession\n", "from pyspark.sql import functions as F\n", - "\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", @@ -130,6 +129,14 @@ "spark.sparkContext.getConf().getAll()" ] }, + { + "cell_type": "markdown", + "id": "6932b302-6ff0-49ba-a436-8a48d93f0f92", + "metadata": {}, + "source": [ + "## Data Processing" + ] + }, { "cell_type": "markdown", "id": "17ac5c49-7dbc-4aac-a702-1a5ac3ee0097", @@ -179,7 +186,7 @@ "df_2022 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2022.csv\", inferSchema=True, header=True)\n", "df_2023 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2023.csv\", inferSchema=True, header=True)\n", "\n", - "# dropping new columns in 2023\n", + "# dropping new columns that are only in 2023\n", "df_2023 = df_2023.drop('Shared Trip Match','Percent Time Chicago','Percent Distance 
Chicago')\n", "\n", "df_all = df_2018.union(df_2019).union(df_2021).union(df_2022).union(df_2023)\n", @@ -188,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "id": "18e30586-4bdd-4217-b55d-e41522df062b", "metadata": {}, "outputs": [ @@ -203,7 +210,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[Stage 32:=====================================================>(543 + 1) / 544]\r" + "[Stage 16:=====================================================>(541 + 3) / 544]\r" ] }, { @@ -234,206 +241,6 @@ "| 24|306633|\n", "| 23|306731|\n", "| 22|307226|\n", - "| 243|328837|\n", - "| 242|328975|\n", - "| 241|329131|\n", - "| 240|329163|\n", - "| 239|329209|\n", - "| 237|329245|\n", - "| 235|329263|\n", - "| 238|329263|\n", - "| 234|329311|\n", - "| 236|329315|\n", - "| 232|329332|\n", - "| 233|329344|\n", - "| 231|329373|\n", - "| 228|329389|\n", - "| 229|329390|\n", - "| 227|329399|\n", - "| 226|329410|\n", - "| 225|329410|\n", - "| 224|329418|\n", - "| 230|329427|\n", - "| 223|329428|\n", - "| 220|329461|\n", - "| 222|329481|\n", - "| 221|329505|\n", - "| 217|329507|\n", - "| 218|329513|\n", - "| 219|329519|\n", - "| 216|329523|\n", - "| 214|329533|\n", - "| 213|329555|\n", - "| 215|329574|\n", - "| 211|329587|\n", - "| 212|329591|\n", - "| 208|329607|\n", - "| 210|329623|\n", - "| 206|329624|\n", - "| 209|329630|\n", - "| 207|329633|\n", - "| 205|329646|\n", - "| 202|329654|\n", - "| 204|329673|\n", - "| 203|329678|\n", - "| 194|329704|\n", - "| 201|329708|\n", - "| 200|329712|\n", - "| 191|329717|\n", - "| 189|329728|\n", - "| 188|329730|\n", - "| 193|329732|\n", - "| 199|329732|\n", - "| 198|329739|\n", - "| 190|329746|\n", - "| 195|329748|\n", - "| 197|329786|\n", - "| 196|329786|\n", - "| 186|329786|\n", - "| 187|329787|\n", - "| 192|329793|\n", - "| 183|329804|\n", - "| 184|329805|\n", - "| 179|329807|\n", - "| 182|329814|\n", - "| 181|329819|\n", - "| 185|329843|\n", - "| 178|329849|\n", - "| 177|329861|\n", - "| 
180|329865|\n", - "| 172|329866|\n", - "| 173|329871|\n", - "| 171|329879|\n", - "| 175|329889|\n", - "| 174|329892|\n", - "| 176|329900|\n", - "| 170|329937|\n", - "| 168|329956|\n", - "| 162|329961|\n", - "| 167|329962|\n", - "| 164|329965|\n", - "| 169|329974|\n", - "| 165|329980|\n", - "| 163|329985|\n", - "| 159|329993|\n", - "| 166|330000|\n", - "| 158|330005|\n", - "| 161|330014|\n", - "| 157|330021|\n", - "| 160|330027|\n", - "| 156|330059|\n", - "| 154|330066|\n", - "| 147|330070|\n", - "| 153|330083|\n", - "| 155|330084|\n", - "| 150|330084|\n", - "| 146|330089|\n", - "| 152|330090|\n", - "| 149|330099|\n", - "| 151|330101|\n", - "| 148|330103|\n", - "| 144|330116|\n", - "| 145|330129|\n", - "| 140|330133|\n", - "| 143|330135|\n", - "| 141|330139|\n", - "| 142|330148|\n", - "| 139|330159|\n", - "| 135|330178|\n", - "| 137|330181|\n", - "| 132|330186|\n", - "| 138|330189|\n", - "| 136|330205|\n", - "| 129|330208|\n", - "| 133|330219|\n", - "| 125|330220|\n", - "| 128|330227|\n", - "| 134|330230|\n", - "| 127|330236|\n", - "| 130|330243|\n", - "| 124|330245|\n", - "| 117|330249|\n", - "| 131|330252|\n", - "| 121|330255|\n", - "| 122|330258|\n", - "| 126|330268|\n", - "| 123|330270|\n", - "| 118|330275|\n", - "| 115|330282|\n", - "| 113|330282|\n", - "| 120|330296|\n", - "| 112|330297|\n", - "| 119|330306|\n", - "| 114|330308|\n", - "| 104|330338|\n", - "| 116|330339|\n", - "| 109|330355|\n", - "| 105|330363|\n", - "| 111|330366|\n", - "| 110|330372|\n", - "| 108|330383|\n", - "| 107|330385|\n", - "| 106|330402|\n", - "| 96|330413|\n", - "| 102|330415|\n", - "| 100|330418|\n", - "| 98|330418|\n", - "| 101|330419|\n", - "| 94|330425|\n", - "| 97|330426|\n", - "| 103|330430|\n", - "| 99|330430|\n", - "| 95|330436|\n", - "| 90|330456|\n", - "| 92|330470|\n", - "| 88|330471|\n", - "| 91|330474|\n", - "| 93|330477|\n", - "| 87|330494|\n", - "| 86|330501|\n", - "| 89|330512|\n", - "| 84|330524|\n", - "| 82|330550|\n", - "| 80|330570|\n", - "| 85|330570|\n", - "| 
81|330579|\n", - "| 83|330585|\n", - "| 78|330622|\n", - "| 79|330625|\n", - "| 76|330625|\n", - "| 75|330642|\n", - "| 77|330646|\n", - "| 71|330651|\n", - "| 74|330653|\n", - "| 73|330667|\n", - "| 70|330690|\n", - "| 65|330704|\n", - "| 72|330709|\n", - "| 67|330721|\n", - "| 66|330724|\n", - "| 69|330737|\n", - "| 62|330758|\n", - "| 63|330762|\n", - "| 64|330762|\n", - "| 68|330766|\n", - "| 60|330782|\n", - "| 59|330784|\n", - "| 56|330801|\n", - "| 57|330805|\n", - "| 61|330807|\n", - "| 58|330837|\n", - "| 53|330868|\n", - "| 55|330869|\n", - "| 54|330885|\n", - "| 52|330918|\n", - "| 50|330944|\n", - "| 51|330963|\n", - "| 49|331028|\n", - "| 48|331034|\n", - "| 47|331050|\n", - "| 46|331114|\n", - "| 45|331284|\n", - "| 44|331416|\n", "| 543|364094|\n", "| 542|364374|\n", "| 541|364493|\n", @@ -602,8 +409,8 @@ "| 300|422114|\n", "| 301|422116|\n", "| 295|422134|\n", - "| 299|422155|\n", "| 296|422155|\n", + "| 299|422155|\n", "| 290|422185|\n", "| 297|422193|\n", "| 294|422194|\n", @@ -657,6 +464,206 @@ "| 245|423403|\n", "| 244|423762|\n", "| 43|457702|\n", + "| 243|482479|\n", + "| 242|482717|\n", + "| 241|482721|\n", + "| 240|482839|\n", + "| 239|482895|\n", + "| 238|483016|\n", + "| 237|483044|\n", + "| 236|483191|\n", + "| 234|483230|\n", + "| 235|483231|\n", + "| 233|483252|\n", + "| 228|483254|\n", + "| 230|483288|\n", + "| 229|483290|\n", + "| 232|483295|\n", + "| 231|483298|\n", + "| 227|483352|\n", + "| 226|483392|\n", + "| 225|483392|\n", + "| 223|483457|\n", + "| 224|483474|\n", + "| 221|483479|\n", + "| 222|483484|\n", + "| 218|483509|\n", + "| 219|483533|\n", + "| 220|483533|\n", + "| 217|483561|\n", + "| 216|483591|\n", + "| 215|483615|\n", + "| 213|483641|\n", + "| 214|483653|\n", + "| 212|483662|\n", + "| 211|483694|\n", + "| 209|483727|\n", + "| 206|483757|\n", + "| 210|483759|\n", + "| 207|483767|\n", + "| 208|483779|\n", + "| 205|483801|\n", + "| 203|483802|\n", + "| 204|483817|\n", + "| 201|483841|\n", + "| 200|483867|\n", + "| 
199|483876|\n", + "| 192|483888|\n", + "| 202|483889|\n", + "| 197|483906|\n", + "| 198|483906|\n", + "| 196|483916|\n", + "| 193|483926|\n", + "| 191|483931|\n", + "| 194|483944|\n", + "| 187|483975|\n", + "| 190|483975|\n", + "| 195|483980|\n", + "| 188|483986|\n", + "| 186|483989|\n", + "| 189|483991|\n", + "| 183|484002|\n", + "| 184|484016|\n", + "| 180|484036|\n", + "| 185|484039|\n", + "| 176|484040|\n", + "| 175|484041|\n", + "| 177|484041|\n", + "| 181|484042|\n", + "| 179|484049|\n", + "| 178|484055|\n", + "| 172|484080|\n", + "| 174|484081|\n", + "| 182|484089|\n", + "| 173|484102|\n", + "| 167|484105|\n", + "| 171|484109|\n", + "| 165|484117|\n", + "| 161|484120|\n", + "| 170|484121|\n", + "| 166|484122|\n", + "| 164|484123|\n", + "| 168|484128|\n", + "| 163|484140|\n", + "| 169|484147|\n", + "| 159|484150|\n", + "| 160|484163|\n", + "| 162|484193|\n", + "| 158|484211|\n", + "| 157|484238|\n", + "| 150|484256|\n", + "| 154|484260|\n", + "| 152|484263|\n", + "| 149|484272|\n", + "| 156|484276|\n", + "| 153|484278|\n", + "| 145|484289|\n", + "| 155|484294|\n", + "| 151|484305|\n", + "| 147|484307|\n", + "| 146|484319|\n", + "| 138|484336|\n", + "| 144|484348|\n", + "| 148|484348|\n", + "| 139|484350|\n", + "| 142|484351|\n", + "| 141|484356|\n", + "| 143|484360|\n", + "| 137|484390|\n", + "| 133|484420|\n", + "| 136|484421|\n", + "| 140|484422|\n", + "| 132|484433|\n", + "| 135|484434|\n", + "| 131|484458|\n", + "| 124|484463|\n", + "| 134|484468|\n", + "| 129|484474|\n", + "| 130|484482|\n", + "| 125|484495|\n", + "| 128|484510|\n", + "| 122|484516|\n", + "| 121|484545|\n", + "| 127|484559|\n", + "| 123|484574|\n", + "| 126|484591|\n", + "| 120|484595|\n", + "| 118|484616|\n", + "| 119|484633|\n", + "| 117|484640|\n", + "| 111|484651|\n", + "| 113|484660|\n", + "| 112|484666|\n", + "| 114|484667|\n", + "| 116|484668|\n", + "| 110|484669|\n", + "| 115|484684|\n", + "| 107|484695|\n", + "| 109|484698|\n", + "| 106|484714|\n", + "| 108|484717|\n", + "| 
104|484719|\n", + "| 101|484727|\n", + "| 105|484731|\n", + "| 102|484740|\n", + "| 103|484764|\n", + "| 99|484769|\n", + "| 100|484769|\n", + "| 97|484782|\n", + "| 98|484800|\n", + "| 96|484827|\n", + "| 94|484830|\n", + "| 93|484856|\n", + "| 95|484857|\n", + "| 91|484866|\n", + "| 87|484874|\n", + "| 92|484881|\n", + "| 88|484885|\n", + "| 89|484899|\n", + "| 90|484938|\n", + "| 85|484963|\n", + "| 84|484969|\n", + "| 86|484975|\n", + "| 79|484990|\n", + "| 82|484997|\n", + "| 83|484999|\n", + "| 80|485012|\n", + "| 81|485020|\n", + "| 77|485023|\n", + "| 75|485037|\n", + "| 78|485059|\n", + "| 76|485061|\n", + "| 74|485128|\n", + "| 69|485144|\n", + "| 70|485147|\n", + "| 73|485161|\n", + "| 71|485167|\n", + "| 68|485170|\n", + "| 72|485170|\n", + "| 67|485241|\n", + "| 64|485241|\n", + "| 66|485245|\n", + "| 65|485284|\n", + "| 63|485301|\n", + "| 62|485354|\n", + "| 61|485366|\n", + "| 59|485403|\n", + "| 60|485433|\n", + "| 58|485459|\n", + "| 56|485560|\n", + "| 57|485568|\n", + "| 55|485597|\n", + "| 54|485597|\n", + "| 53|485628|\n", + "| 52|485637|\n", + "| 50|485672|\n", + "| 51|485700|\n", + "| 49|485712|\n", + "| 48|485739|\n", + "| 47|485876|\n", + "| 46|485879|\n", + "| 45|486271|\n", + "| 44|486710|\n", "| 443|569570|\n", "| 442|570154|\n", "| 441|570301|\n", @@ -723,8 +730,8 @@ "| 380|572604|\n", "| 379|572634|\n", "| 378|572647|\n", - "| 375|572742|\n", "| 376|572742|\n", + "| 375|572742|\n", "| 377|572755|\n", "| 374|572798|\n", "| 372|572800|\n", @@ -787,14 +794,654 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "abf8091a-9662-4378-8fe5-b2ece46a6a14", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 19:=====================================================>(542 + 2) / 544]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Partitions: 600\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ 
+ "[Stage 22:===================================================> (586 + 14) / 600]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------+------+\n", + "|partitionId| count|\n", + "+-----------+------+\n", + "| 24|413573|\n", + "| 12|413574|\n", + "| 25|413574|\n", + "| 532|413574|\n", + "| 9|413574|\n", + "| 13|413574|\n", + "| 15|413574|\n", + "| 10|413575|\n", + "| 531|413575|\n", + "| 560|413575|\n", + "| 534|413575|\n", + "| 538|413575|\n", + "| 528|413575|\n", + "| 11|413575|\n", + "| 14|413575|\n", + "| 535|413575|\n", + "| 17|413575|\n", + "| 18|413575|\n", + "| 23|413575|\n", + "| 16|413575|\n", + "| 530|413575|\n", + "| 533|413575|\n", + "| 26|413575|\n", + "| 20|413575|\n", + "| 554|413575|\n", + "| 8|413575|\n", + "| 539|413575|\n", + "| 527|413576|\n", + "| 21|413576|\n", + "| 536|413576|\n", + "| 525|413576|\n", + "| 561|413576|\n", + "| 537|413576|\n", + "| 19|413576|\n", + "| 529|413576|\n", + "| 22|413576|\n", + "| 559|413576|\n", + "| 27|413576|\n", + "| 599|413577|\n", + "| 596|413577|\n", + "| 29|413577|\n", + "| 5|413577|\n", + "| 28|413577|\n", + "| 553|413577|\n", + "| 541|413577|\n", + "| 32|413577|\n", + "| 556|413577|\n", + "| 558|413577|\n", + "| 526|413577|\n", + "| 555|413577|\n", + "| 540|413577|\n", + "| 594|413577|\n", + "| 544|413577|\n", + "| 557|413577|\n", + "| 6|413577|\n", + "| 524|413577|\n", + "| 598|413577|\n", + "| 597|413577|\n", + "| 593|413577|\n", + "| 549|413578|\n", + "| 7|413578|\n", + "| 31|413578|\n", + "| 545|413578|\n", + "| 38|413578|\n", + "| 542|413578|\n", + "| 592|413578|\n", + "| 1|413578|\n", + "| 595|413578|\n", + "| 591|413578|\n", + "| 33|413578|\n", + "| 30|413578|\n", + "| 317|413578|\n", + "| 567|413578|\n", + "| 562|413578|\n", + "| 565|413578|\n", + "| 0|413578|\n", + "| 552|413578|\n", + "| 2|413578|\n", + "| 566|413578|\n", + "| 462|413578|\n", + "| 543|413578|\n", + "| 564|413578|\n", + "| 3|413578|\n", + "| 563|413578|\n", + "| 461|413579|\n", + "| 
587|413579|\n", + "| 550|413579|\n", + "| 586|413579|\n", + "| 463|413579|\n", + "| 569|413579|\n", + "| 548|413579|\n", + "| 551|413579|\n", + "| 315|413579|\n", + "| 39|413579|\n", + "| 4|413579|\n", + "| 318|413579|\n", + "| 547|413579|\n", + "| 64|413579|\n", + "| 460|413580|\n", + "| 35|413580|\n", + "| 546|413580|\n", + "| 459|413580|\n", + "| 37|413580|\n", + "| 589|413580|\n", + "| 590|413580|\n", + "| 63|413580|\n", + "| 321|413580|\n", + "| 319|413580|\n", + "| 316|413580|\n", + "| 523|413580|\n", + "| 568|413580|\n", + "| 465|413580|\n", + "| 464|413580|\n", + "| 583|413580|\n", + "| 585|413580|\n", + "| 65|413580|\n", + "| 67|413580|\n", + "| 322|413580|\n", + "| 313|413580|\n", + "| 66|413580|\n", + "| 466|413580|\n", + "| 571|413580|\n", + "| 584|413580|\n", + "| 570|413580|\n", + "| 580|413580|\n", + "| 467|413580|\n", + "| 34|413580|\n", + "| 40|413580|\n", + "| 588|413580|\n", + "| 320|413581|\n", + "| 36|413581|\n", + "| 314|413581|\n", + "| 312|413581|\n", + "| 41|413581|\n", + "| 513|413581|\n", + "| 70|413581|\n", + "| 324|413581|\n", + "| 512|413581|\n", + "| 522|413581|\n", + "| 454|413581|\n", + "| 469|413581|\n", + "| 300|413581|\n", + "| 581|413581|\n", + "| 106|413581|\n", + "| 68|413581|\n", + "| 468|413581|\n", + "| 521|413581|\n", + "| 517|413581|\n", + "| 511|413581|\n", + "| 572|413581|\n", + "| 110|413582|\n", + "| 471|413582|\n", + "| 578|413582|\n", + "| 507|413582|\n", + "| 302|413582|\n", + "| 42|413582|\n", + "| 452|413582|\n", + "| 368|413582|\n", + "| 107|413582|\n", + "| 309|413582|\n", + "| 360|413582|\n", + "| 576|413582|\n", + "| 152|413582|\n", + "| 457|413582|\n", + "| 109|413582|\n", + "| 323|413582|\n", + "| 366|413582|\n", + "| 108|413582|\n", + "| 151|413582|\n", + "| 508|413582|\n", + "| 304|413582|\n", + "| 105|413582|\n", + "| 582|413582|\n", + "| 305|413582|\n", + "| 61|413582|\n", + "| 515|413582|\n", + "| 49|413582|\n", + "| 520|413582|\n", + "| 326|413582|\n", + "| 307|413582|\n", + "| 514|413582|\n", + "| 
52|413582|\n", + "| 519|413582|\n", + "| 111|413582|\n", + "| 573|413582|\n", + "| 71|413582|\n", + "| 458|413582|\n", + "| 74|413582|\n", + "| 308|413582|\n", + "| 48|413582|\n", + "| 456|413582|\n", + "| 311|413582|\n", + "| 453|413582|\n", + "| 136|413582|\n", + "| 55|413582|\n", + "| 153|413582|\n", + "| 146|413582|\n", + "| 365|413582|\n", + "| 134|413582|\n", + "| 451|413582|\n", + "| 69|413582|\n", + "| 455|413582|\n", + "| 579|413582|\n", + "| 54|413582|\n", + "| 577|413582|\n", + "| 470|413582|\n", + "| 301|413582|\n", + "| 516|413582|\n", + "| 509|413583|\n", + "| 56|413583|\n", + "| 373|413583|\n", + "| 75|413583|\n", + "| 147|413583|\n", + "| 60|413583|\n", + "| 295|413583|\n", + "| 574|413583|\n", + "| 138|413583|\n", + "| 296|413583|\n", + "| 59|413583|\n", + "| 133|413583|\n", + "| 144|413583|\n", + "| 50|413583|\n", + "| 47|413583|\n", + "| 43|413583|\n", + "| 361|413583|\n", + "| 160|413583|\n", + "| 364|413583|\n", + "| 510|413583|\n", + "| 359|413583|\n", + "| 155|413583|\n", + "| 72|413583|\n", + "| 148|413583|\n", + "| 135|413583|\n", + "| 116|413583|\n", + "| 169|413583|\n", + "| 145|413583|\n", + "| 310|413583|\n", + "| 62|413583|\n", + "| 51|413583|\n", + "| 303|413583|\n", + "| 170|413583|\n", + "| 306|413583|\n", + "| 297|413583|\n", + "| 118|413583|\n", + "| 325|413583|\n", + "| 208|413583|\n", + "| 506|413583|\n", + "| 294|413583|\n", + "| 57|413583|\n", + "| 112|413583|\n", + "| 137|413583|\n", + "| 372|413583|\n", + "| 44|413583|\n", + "| 518|413583|\n", + "| 164|413583|\n", + "| 53|413583|\n", + "| 367|413583|\n", + "| 168|413583|\n", + "| 327|413583|\n", + "| 502|413584|\n", + "| 473|413584|\n", + "| 165|413584|\n", + "| 163|413584|\n", + "| 358|413584|\n", + "| 58|413584|\n", + "| 505|413584|\n", + "| 371|413584|\n", + "| 45|413584|\n", + "| 73|413584|\n", + "| 503|413584|\n", + "| 363|413584|\n", + "| 171|413584|\n", + "| 472|413584|\n", + "| 475|413584|\n", + "| 129|413584|\n", + "| 120|413584|\n", + "| 154|413584|\n", + "| 
374|413584|\n", + "| 449|413584|\n", + "| 77|413584|\n", + "| 329|413584|\n", + "| 113|413584|\n", + "| 476|413584|\n", + "| 362|413584|\n", + "| 214|413584|\n", + "| 132|413584|\n", + "| 221|413584|\n", + "| 115|413584|\n", + "| 369|413584|\n", + "| 298|413584|\n", + "| 370|413584|\n", + "| 206|413584|\n", + "| 477|413584|\n", + "| 156|413584|\n", + "| 76|413584|\n", + "| 287|413584|\n", + "| 140|413584|\n", + "| 575|413584|\n", + "| 149|413584|\n", + "| 104|413584|\n", + "| 328|413584|\n", + "| 46|413584|\n", + "| 215|413584|\n", + "| 79|413584|\n", + "| 166|413584|\n", + "| 299|413584|\n", + "| 205|413584|\n", + "| 173|413584|\n", + "| 450|413584|\n", + "| 378|413585|\n", + "| 431|413585|\n", + "| 161|413585|\n", + "| 172|413585|\n", + "| 159|413585|\n", + "| 504|413585|\n", + "| 220|413585|\n", + "| 423|413585|\n", + "| 102|413585|\n", + "| 479|413585|\n", + "| 210|413585|\n", + "| 101|413585|\n", + "| 331|413585|\n", + "| 162|413585|\n", + "| 131|413585|\n", + "| 258|413585|\n", + "| 130|413585|\n", + "| 186|413585|\n", + "| 422|413585|\n", + "| 410|413585|\n", + "| 233|413585|\n", + "| 119|413585|\n", + "| 157|413585|\n", + "| 128|413585|\n", + "| 103|413585|\n", + "| 209|413585|\n", + "| 117|413585|\n", + "| 425|413585|\n", + "| 114|413585|\n", + "| 375|413585|\n", + "| 175|413585|\n", + "| 207|413585|\n", + "| 293|413585|\n", + "| 219|413585|\n", + "| 139|413585|\n", + "| 167|413585|\n", + "| 254|413585|\n", + "| 478|413585|\n", + "| 150|413585|\n", + "| 127|413585|\n", + "| 377|413585|\n", + "| 376|413585|\n", + "| 121|413585|\n", + "| 78|413585|\n", + "| 174|413585|\n", + "| 216|413585|\n", + "| 474|413585|\n", + "| 448|413585|\n", + "| 447|413585|\n", + "| 330|413585|\n", + "| 142|413586|\n", + "| 278|413586|\n", + "| 212|413586|\n", + "| 122|413586|\n", + "| 426|413586|\n", + "| 292|413586|\n", + "| 337|413586|\n", + "| 236|413586|\n", + "| 234|413586|\n", + "| 178|413586|\n", + "| 179|413586|\n", + "| 237|413586|\n", + "| 444|413586|\n", + "| 
249|413586|\n", + "| 126|413586|\n", + "| 228|413586|\n", + "| 213|413586|\n", + "| 222|413586|\n", + "| 285|413586|\n", + "| 250|413586|\n", + "| 432|413586|\n", + "| 379|413586|\n", + "| 217|413586|\n", + "| 81|413586|\n", + "| 125|413586|\n", + "| 427|413586|\n", + "| 339|413586|\n", + "| 204|413586|\n", + "| 243|413586|\n", + "| 223|413586|\n", + "| 238|413586|\n", + "| 284|413586|\n", + "| 158|413586|\n", + "| 232|413586|\n", + "| 218|413586|\n", + "| 430|413586|\n", + "| 357|413586|\n", + "| 255|413586|\n", + "| 332|413586|\n", + "| 241|413586|\n", + "| 80|413586|\n", + "| 288|413586|\n", + "| 338|413586|\n", + "| 253|413586|\n", + "| 286|413586|\n", + "| 141|413586|\n", + "| 411|413586|\n", + "| 480|413586|\n", + "| 416|413586|\n", + "| 256|413586|\n", + "| 424|413586|\n", + "| 291|413586|\n", + "| 446|413586|\n", + "| 280|413586|\n", + "| 242|413586|\n", + "| 445|413586|\n", + "| 211|413586|\n", + "| 415|413586|\n", + "| 100|413586|\n", + "| 334|413587|\n", + "| 245|413587|\n", + "| 224|413587|\n", + "| 281|413587|\n", + "| 86|413587|\n", + "| 259|413587|\n", + "| 123|413587|\n", + "| 182|413587|\n", + "| 279|413587|\n", + "| 260|413587|\n", + "| 443|413587|\n", + "| 282|413587|\n", + "| 417|413587|\n", + "| 247|413587|\n", + "| 124|413587|\n", + "| 420|413587|\n", + "| 239|413587|\n", + "| 85|413587|\n", + "| 481|413587|\n", + "| 501|413587|\n", + "| 497|413587|\n", + "| 289|413587|\n", + "| 240|413587|\n", + "| 356|413587|\n", + "| 143|413587|\n", + "| 486|413587|\n", + "| 333|413587|\n", + "| 82|413587|\n", + "| 336|413587|\n", + "| 436|413587|\n", + "| 235|413587|\n", + "| 418|413587|\n", + "| 419|413587|\n", + "| 341|413587|\n", + "| 433|413587|\n", + "| 83|413587|\n", + "| 381|413587|\n", + "| 94|413587|\n", + "| 176|413587|\n", + "| 490|413587|\n", + "| 380|413587|\n", + "| 229|413587|\n", + "| 177|413587|\n", + "| 99|413587|\n", + "| 438|413587|\n", + "| 180|413587|\n", + "| 437|413587|\n", + "| 93|413587|\n", + "| 227|413587|\n", + "| 
429|413587|\n", + "| 246|413587|\n", + "| 265|413587|\n", + "| 499|413587|\n", + "| 248|413587|\n", + "| 290|413587|\n", + "| 412|413587|\n", + "| 428|413587|\n", + "| 276|413587|\n", + "| 252|413587|\n", + "| 187|413587|\n", + "| 231|413587|\n", + "| 84|413587|\n", + "| 383|413588|\n", + "| 335|413588|\n", + "| 491|413588|\n", + "| 498|413588|\n", + "| 342|413588|\n", + "| 355|413588|\n", + "| 441|413588|\n", + "| 442|413588|\n", + "| 264|413588|\n", + "| 495|413588|\n", + "| 189|413588|\n", + "| 350|413588|\n", + "| 404|413588|\n", + "| 88|413588|\n", + "| 226|413588|\n", + "| 181|413588|\n", + "| 283|413588|\n", + "| 492|413588|\n", + "| 261|413588|\n", + "| 203|413588|\n", + "| 188|413588|\n", + "| 414|413588|\n", + "| 413|413588|\n", + "| 230|413588|\n", + "| 353|413588|\n", + "| 382|413588|\n", + "| 435|413588|\n", + "| 482|413588|\n", + "| 500|413588|\n", + "| 340|413588|\n", + "| 277|413588|\n", + "| 421|413588|\n", + "| 225|413588|\n", + "| 87|413588|\n", + "| 409|413588|\n", + "| 352|413588|\n", + "| 96|413588|\n", + "| 95|413588|\n", + "| 351|413588|\n", + "| 439|413588|\n", + "| 244|413588|\n", + "| 434|413588|\n", + "| 440|413588|\n", + "| 251|413588|\n", + "| 257|413588|\n", + "| 262|413588|\n", + "| 494|413589|\n", + "| 493|413589|\n", + "| 406|413589|\n", + "| 385|413589|\n", + "| 190|413589|\n", + "| 387|413589|\n", + "| 267|413589|\n", + "| 489|413589|\n", + "| 496|413589|\n", + "| 183|413589|\n", + "| 408|413589|\n", + "| 185|413589|\n", + "| 354|413589|\n", + "| 343|413589|\n", + "| 275|413589|\n", + "| 393|413589|\n", + "| 266|413589|\n", + "| 184|413589|\n", + "| 263|413589|\n", + "| 394|413589|\n", + "| 487|413589|\n", + "| 274|413589|\n", + "| 98|413589|\n", + "| 386|413589|\n", + "| 483|413589|\n", + "| 407|413589|\n", + "| 89|413589|\n", + "| 405|413589|\n", + "| 484|413589|\n", + "| 403|413589|\n", + "| 97|413589|\n", + "| 202|413589|\n", + "| 488|413589|\n", + "| 485|413589|\n", + "| 384|413590|\n", + "| 388|413590|\n", + "| 
346|413590|\n", + "| 390|413590|\n", + "| 402|413590|\n", + "| 90|413590|\n", + "| 201|413590|\n", + "| 389|413590|\n", + "| 200|413590|\n", + "| 396|413590|\n", + "| 348|413590|\n", + "| 349|413590|\n", + "| 401|413590|\n", + "| 400|413590|\n", + "| 395|413590|\n", + "| 344|413590|\n", + "| 199|413590|\n", + "| 191|413591|\n", + "| 268|413591|\n", + "| 398|413591|\n", + "| 273|413591|\n", + "| 91|413591|\n", + "| 92|413591|\n", + "| 345|413591|\n", + "| 198|413591|\n", + "| 399|413591|\n", + "| 270|413591|\n", + "| 397|413591|\n", + "| 269|413591|\n", + "| 271|413591|\n", + "| 197|413592|\n", + "| 391|413592|\n", + "| 347|413592|\n", + "| 392|413592|\n", + "| 192|413592|\n", + "| 272|413592|\n", + "| 193|413593|\n", + "| 194|413593|\n", + "| 195|413593|\n", + "| 196|413594|\n", + "+-----------+------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], "source": [ - "# repartitioning to 600 partitions, seems to be balanced now. \n", + "# repartitioning to 600 partitions, seems to be balanced now with each partition under 2GB as well. 
\n", "df_all = df_all.repartition(600)\n", - "#displaypartitions(df_all)" + "displaypartitions(df_all)" ] }, { @@ -855,7 +1502,7 @@ "metadata": {}, "outputs": [], "source": [ - "# take a sample to test these operations out on first\n", + "# we initially had taken a sample to test these operations out on first\n", "#sample_df = df_all.sample(fraction=1/1000000)\n", "\n", "# get only the columns needed for the model\n", @@ -868,7 +1515,7 @@ "id": "bdccfb9c-6f47-4d6a-a42b-ad7a1f707f4a", "metadata": {}, "source": [ - "**Daily counts for each community area**" + "### Daily counts for each community area" ] }, { @@ -876,7 +1523,7 @@ "id": "d23b5405-6491-43ed-90fb-cf7aa2056553", "metadata": {}, "source": [ - "We had to group by pickup area and dropoff area seperately then sum to create daily counts of number of trips to that particular community area when it was either a pickup or dropoff area" + "We group by pickup area and dropoff area separately then sum to create daily counts of number of trips to that particular community area when it was either a pickup or dropoff area" ] }, { @@ -960,8 +1607,6 @@ "# Group by day, month, year, and area, summing up the counts\n", "daily_counts_by_area = combined_df.groupby('day', 'month', 'year', 'area').sum('pickup_count', 'dropoff_count')\n", "\n", - "# the relatively smaller numbers are mostly a result of the sample size, should be fine when we \n", - "# make it to the entire dataframe\n", "daily_counts_by_area = daily_counts_by_area.withColumn('total_counts', F.col('sum(pickup_count)') + F.col('sum(dropoff_count)'))\n", "daily_counts_by_area.drop('sum(pickup_count)','sum(dropoff_count)')\n", "daily_counts_by_area.show(10)" @@ -1023,8 +1668,6 @@ } ], "source": [ - "#keep datetime as part of the pivot. 
\n", - "\n", "# pivot so that each community area is a column\n", "# one row for each day, each column represents a community area (with its entry being daily count of rides for that area).\n", "\n", @@ -1040,6 +1683,8 @@ "id": "a0df4c64-d1e4-48d1-a5fe-ec6bf18f65fe", "metadata": {}, "source": [ + "### Adding in weather dataset\n", + "\n", "Read in weather data, merge with rideshare data" ] }, @@ -1170,12 +1815,11 @@ } ], "source": [ + "# keeping only relevant weather variables\n", "df_weather = df_weather.select('name', 'datetime', 'temp','precip','snow','snowdepth','sunset')\n", "df_weather = df_weather.withColumn('sunset', F.concat(F.hour(df_weather.sunset),F.minute(df_weather.sunset)))\n", "df_weather.printSchema()\n", - "pivoted_df.printSchema()\n", - "# name, datetime, temp, precip, snow, snowdepth, sunset.\n", - "# merge on datetime- keep datetime as part of the pivot. " + "pivoted_df.printSchema()" ] }, { @@ -1381,7 +2025,7 @@ "id": "4e6dadc9-db41-4492-adaa-be633c0c4afa", "metadata": {}, "source": [ - "# ML Model\n", + "## ML Model\n", "\n", "1. Create Datasets that are for data pre-program (Oct 2021) and for data between Oct 2021 up to not including july 2023.\n", "2. Train model on first dataset. 
predict for october, november, december 2021\n", @@ -1468,20 +2112,12 @@ "## Impact of Program on rides in Hyde Park" ] }, - { - "cell_type": "markdown", - "id": "47d25a14-27b8-4290-abe7-3e5d8411a739", - "metadata": {}, - "source": [ - "This model will be for predicting the first policy change using df_1" - ] - }, { "cell_type": "markdown", "id": "1884f85e-70c8-4d83-8d5c-f1a90e7b7cee", "metadata": {}, "source": [ - "#### Building model to predict rides in program area based on pre-program data" + "### Building model to predict rides in program area based on pre-program data" ] }, { @@ -1489,7 +2125,7 @@ "id": "370b8f4e-f6bc-4245-889d-e1803afed90c", "metadata": {}, "source": [ - "Using df1 (all pre-program data) to predict what would happen to count of rides if the program had not happened at all. Do this by creating the model using only pre-program data. Predict outcomes for future dates and compare to the actual ride counts on those dates." + "This model is used to assess the first policy change using df_1. We use df1 (all pre-program data) to predict what would have happened to the count of rides if the program had not happened at all. We do this by training the model using only pre-program data, then predicting outcomes for future dates and comparing them to the actual ride counts on those dates." 
] }, { @@ -1504,7 +2140,6 @@ " '26','27','28','29','30','31','32','33','34','35','36','37','38','40','43','44','45','46','47','48','49','50','51','52','53','54','55','56','57','58','59','60','61','62','63','64','65','66',\n", " '67','68','69','70','71','72','73','74','75','76','77','temp','precip','snow','snowdepth','sunset']\n", "\n", - "# adding the handleInvalid = 'skip' allows this to run- double check what it is doing\n", "vectorAssembler = VectorAssembler(inputCols=input_features,\n", " outputCol=\"features\", handleInvalid='skip')\n", "\n", @@ -1686,6 +2321,14 @@ "print(\"r2: %.3f\" %r2)" ] }, + { + "cell_type": "markdown", + "id": "81e42c45-e8c6-4d6d-b4fd-6c30b38a026c", + "metadata": {}, + "source": [ + "The R-squared value is high, indicating that the model explains most of the variability in the daily rides data. These statistics suggest that the linear regression model performs quite well in terms of fitting the data and predicting daily ride counts for the program area, although the RMSE and MAE indicate that there are still significant errors in the predictions. 
" + ] + }, { "cell_type": "code", "execution_count": 30, @@ -1764,7 +2407,6 @@ " '26','27','28','29','30','31','32','33','34','35','36','37','38','40','43','44','45','46','47','48','49','50','51','52','53','54','55','56','57','58','59','60','61','62','63','64','65','66',\n", " '67','68','69','70','71','72','73','74','75','76','77','temp','precip','snow','snowdepth','sunset']\n", "\n", - "# adding the handleInvalid = 'skip' allows this to run- double check what it is doing\n", "vectorAssembler = VectorAssembler(inputCols=input_features, outputCol=\"features\", handleInvalid='skip')\n", "\n", "# take the real data and create predictions to compare\n", @@ -2045,7 +2687,7 @@ "source": [ "combined_df['year_month'] = pd.to_datetime(combined_df['year_month'], format='%Y-%m')\n", "\n", - "# Filter the DataFrame to exclude the year 2020\n", + "# Filter the DataFrame to exclude the year 2020 since we don't keep it in our dataframe due to COVID\n", "df_2018_2019 = combined_df[(combined_df['year_month'].dt.year >= 2018) & (combined_df['year_month'].dt.year <= 2019)]\n", "df_2021 = combined_df[combined_df['year_month'].dt.year == 2021]\n", "\n", @@ -2084,7 +2726,7 @@ "outputs": [], "source": [ "# joining different datasets together\n", - "final_dataset = df_2021 \n" + "final_dataset = df_2021" ] }, { @@ -2092,7 +2734,7 @@ "id": "685b3a94-eede-4b4f-94fc-a1b762aed181", "metadata": {}, "source": [ - "#### Program starts- assessing initial Impact\n", + "### Program starts- assessing initial Impact\n", "\n", "Using the trained model for predicting count of rides in program area if program had not started for the period of Oct 2021- Dec 2021 when in program rides were only on weekends. " ] @@ -2235,14 +2877,6 @@ "combined_df" ] }, - { - "cell_type": "markdown", - "id": "83660511-34ff-4c88-83bc-40ed43c5ceb7", - "metadata": {}, - "source": [ - "Add stuff to make this graph much prettier- title, axis's etc. 
" - ] - }, { "cell_type": "code", "execution_count": 156, @@ -2438,9 +3072,9 @@ "id": "fb93c332-190e-42fb-bc05-cffb78f0a455", "metadata": {}, "source": [ - "#### Impact of policy expansion:to all days (5pm-4am), 10 rides a month\n", + "### Impact of policy expansion to all days (5pm-4am), 10 rides a month\n", "\n", - "Use the same model (of pre-program rides) to predict for the entirety of df2 (between Oct 2021- June 2023) when the policy was expanded. " + "Use the same model (of pre-program rides) to predict for the entirety of df2 (between Oct 2021 - June 2023) when the policy was expanded. " ] }, { @@ -2493,7 +3127,6 @@ " '26','27','28','29','30','31','32','33','34','35','36','37','38','40','43','44','45','46','47','48','49','50','51','52','53','54','55','56','57','58','59','60','61','62','63','64','65','66',\n", " '67','68','69','70','71','72','73','74','75','76','77','temp','precip','snow','snowdepth','sunset']\n", "\n", - "# adding the handleInvalid = 'skip' allows this to run- double check what it is doing\n", "vectorAssembler = VectorAssembler(inputCols=input_features, outputCol=\"features\", handleInvalid='skip')\n", "\n", "# take the real data and create predictions to compare\n", @@ -2512,13 +3145,8 @@ "outputs": [], "source": [ "# now group by month and sum counts and plot\n", - "\n", - "#df_real.withColumn(\"month\", F.format_string(\"%02d\", df_real.month.cast(\"int\")))\n", - "#df_real.select('month').show(5)\n", "monthly_real = df_real.withColumn(\"year_month\", F.concat_ws(\"-\", df_real.year, df_real.month))\n", "monthly_second_preds = df_second_predictions.withColumn(\"year_month\", F.concat_ws(\"-\", df_second_predictions.year, df_second_predictions.month))\n", - "\n", - "#monthly_real.select('year_month').distinct().show(30)\n", "monthly_real = monthly_real.groupBy('year_month').sum('area_sums')\n", "monthly_second_preds = monthly_second_preds.groupby('year_month').sum('prediction')" ] @@ -2728,8 +3356,6 @@ ], "source": [ 
"combined_df['year_month'] = pd.to_datetime(combined_df['year_month'], format='%Y-%m')\n", - "\n", - "# Plot for 2018-2019\n", "plt.figure(figsize=(12, 6))\n", "sns.lineplot(x='year_month', y='sum(area_sums)', data=combined_df, marker='o', label='Actual Rides with expansion')\n", "sns.lineplot(x='year_month', y='sum(prediction)', data=combined_df, dashes=True, label='Predicted Rides with no expansion')\n", @@ -3043,7 +3669,6 @@ "metadata": {}, "outputs": [], "source": [ - "# adding the handleInvalid = 'skip' allows this to run- double check what it is doing\n", "vectorAssembler = VectorAssembler(inputCols=input_features,\n", " outputCol=\"features\", handleInvalid='skip')\n", "\n", @@ -3216,6 +3841,14 @@ "print(\"r2: %.3f\" %r2)" ] }, + { + "cell_type": "markdown", + "id": "db61ad06-9ff0-4a14-a43a-07376b3b3c15", + "metadata": {}, + "source": [ + "The R-squared value is decent, indicating that the model explains a lot of the variability in the daily rides data. These statistics suggest that the linear regression model performs well in terms of fitting the data and predicting daily ride counts for the program area, although the RMSE and MAE indicate that there are still significant errors in the predictions. " + ] + }, { "cell_type": "code", "execution_count": null, @@ -3670,6 +4303,14 @@ "final_dataset" ] }, + { + "cell_type": "markdown", + "id": "ce92eb1b-0567-4a2f-8374-3ce0533e1e25", + "metadata": {}, + "source": [ + "### Overall Impact of Program from 2021-2023 across program changes" + ] + }, { "cell_type": "code", "execution_count": 165,