diff --git a/supervised_ml.ipynb b/supervised_ml.ipynb new file mode 100644 index 0000000..5353dfe --- /dev/null +++ b/supervised_ml.ipynb @@ -0,0 +1,309 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ffbfcaa5-0116-4997-b3ec-80e493185654", + "metadata": {}, + "source": [ + "# Supervised ML\n", + "\n", + "The goal of this model is to predict the daily ridership within the University of Chicago Lyft Program Area. The features are the ridership counts of other Chicago community areas together with weather data. The labels are the daily ridership counts within the program area.\n", + "\n", + "We will fit the model on the period up until the introduction of the University Lyft program and then use the difference between its predictions and the actual ridership as a rough estimate of the effect of the program on rideshare usage in the area. We will look at both the change when the program was introduced and the change when the program was reduced from 10 rides of up to 15 dollars each to 7 rides of up to 10 dollars each. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "414550d7-9d43-4f5c-8f75-4756974014af", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('spark.stage.maxConsecutiveAttempts', '10'),\n", + " ('spark.dynamicAllocation.minExecutors', '1'),\n", + " ('spark.eventLog.enabled', 'true'),\n", + " ('spark.submit.pyFiles',\n", + " '/root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,/root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,/root/.ivy2/jars/com.typesafe_config-1.4.2.jar,/root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,/root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,/root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,/root/.ivy2/jars/com.navigamez_greex-1.0.jar,/root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,/root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,/root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,/root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,/root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,/root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,/root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,/root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,/root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,/root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,/root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,/root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,/root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-
2.0.0.jar,/root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,/root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,/root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,/root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,/root/.ivy2/jars/com.google.api_gax-2.20.1.jar,/root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,/root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,/root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,/root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,/root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,/root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,/root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,/root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,/root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,/root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,/root/.iv
y2/jars/io.grpc_grpc-stub-1.51.0.jar,/root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,/root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,/root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,/root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,/root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,/root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,/root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,/root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,/root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,/root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,/root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,/root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", + " ('spark.dataproc.sql.joinConditionReorder.enabled', 'true'),\n", + " ('spark.kryoserializer.buffer.max', '2000M'),\n", + " ('spark.history.fs.logDirectory',\n", + " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/bbc53cbc-8fb2-40b7-b9b3-62fefb76a670/spark-job-history'),\n", + " ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),\n", + " ('spark.dataproc.sql.local.rank.pushdown.enabled', 'true'),\n", + " ('spark.driver.maxResultSize', '0'),\n", + " ('spark.yarn.unmanagedAM.enabled', 'true'),\n", + " ('spark.sql.autoBroadcastJoinThreshold', '43m'),\n", + " ('spark.ui.filters',\n", + " 'org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter'),\n", + " ('spark.metrics.namespace',\n", + " 'app_name:${spark.app.name}.app_id:${spark.app.id}'),\n", + " ('spark.executor.memory', '4g'),\n", + " ('spark.dataproc.sql.optimizer.leftsemijoin.conversion.enabled', 'true'),\n", + " ('spark.driver.port', '34671'),\n", + " ('spark.hadoop.hive.execution.engine', 'mr'),\n", + " ('spark.executor.id', 'driver'),\n", + " ('spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version', '2'),\n", + " ('spark.dynamicAllocation.maxExecutors', '10000'),\n", 
+ " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',\n", + " 'hub-msca-bdp-dphub-students-abejburton-m'),\n", + " ('spark.ui.proxyBase', '/proxy/application_1700503299623_0001'),\n", + " ('spark.app.id', 'application_1700503299623_0001'),\n", + " ('spark.yarn.dist.pyFiles',\n", + " 'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-c
lient_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alp
ha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", + " ('spark.dataproc.metrics.listener.metrics.collector.hostname',\n", + " 'hub-msca-bdp-dphub-students-abejburton-m'),\n", + " ('spark.yarn.am.attemptFailuresValidityInterval', '1h'),\n", + " ('spark.app.name', 'Spark Updated Conf'),\n", + " ('spark.sql.catalogImplementation', 'hive'),\n", + " ('spark.executorEnv.OPENBLAS_NUM_THREADS', '1'),\n", + " ('spark.yarn.secondary.jars',\n", + " 
'com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,com.typesafe_config-1.4.2.jar,org.rocksdb_rocksdbjni-6.29.5.jar,com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,com.github.universal-automata_liblevenshtein-3.0.0.jar,com.google.cloud_google-cloud-storage-2.16.0.jar,com.navigamez_greex-1.0.jar,com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,it.unimi.dsi_fastutil-7.0.12.jar,org.projectlombok_lombok-1.16.8.jar,com.google.guava_guava-31.1-jre.jar,com.google.guava_failureaccess-1.0.1.jar,com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,com.google.errorprone_error_prone_annotations-2.16.jar,com.google.j2objc_j2objc-annotations-1.3.jar,com.google.http-client_google-http-client-1.42.3.jar,io.opencensus_opencensus-contrib-http-util-0.31.1.jar,com.google.http-client_google-http-client-jackson2-1.42.3.jar,com.google.http-client_google-http-client-gson-1.42.3.jar,com.google.api-client_google-api-client-2.1.1.jar,commons-codec_commons-codec-1.15.jar,com.google.oauth-client_google-oauth-client-1.34.1.jar,com.google.http-client_google-http-client-apache-v2-1.42.3.jar,com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,com.google.code.gson_gson-2.10.jar,com.google.cloud_google-cloud-core-2.9.0.jar,com.google.auto.value_auto-value-annotations-1.10.1.jar,com.google.cloud_google-cloud-core-http-2.9.0.jar,com.google.http-client_google-http-client-appengine-1.42.3.jar,com.google.api_gax-httpjson-0.105.1.jar,com.google.cloud_google-cloud-core-grpc-2.9.0.jar,io.grpc_grpc-core-1.51.0.jar,com.google.api_gax-2.20.1.jar,com.google.api_gax-grpc-2.20.1.jar,io.grpc_grpc-alts-1.51.0.jar,io.grpc_grpc-grpclb-1.51.0.jar,org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,io.grpc_grpc-protobuf-1.51.0.jar,com.google.auth_google-auth-library-credentials-1.13.0.jar,com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,com.google.api_api-common-2.2.2.jar,javax.annotation_javax.annotation-api-1.3.2.jar,io.open
census_opencensus-api-0.31.1.jar,io.grpc_grpc-context-1.51.0.jar,com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,com.google.protobuf_protobuf-java-3.21.10.jar,com.google.protobuf_protobuf-java-util-3.21.10.jar,com.google.api.grpc_proto-google-common-protos-2.11.0.jar,org.threeten_threetenbp-1.6.4.jar,com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,com.fasterxml.jackson.core_jackson-core-2.14.1.jar,com.google.code.findbugs_jsr305-3.0.2.jar,io.grpc_grpc-api-1.51.0.jar,io.grpc_grpc-auth-1.51.0.jar,io.grpc_grpc-stub-1.51.0.jar,org.checkerframework_checker-qual-3.28.0.jar,com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,io.grpc_grpc-protobuf-lite-1.51.0.jar,com.google.android_annotations-4.1.1.4.jar,org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,io.grpc_grpc-netty-shaded-1.51.0.jar,io.perfmark_perfmark-api-0.26.0.jar,io.grpc_grpc-googleapis-1.51.0.jar,io.grpc_grpc-xds-1.51.0.jar,io.opencensus_opencensus-proto-0.2.0.jar,io.grpc_grpc-services-1.51.0.jar,com.google.re2j_re2j-1.6.jar,dk.brics.automaton_automaton-1.11-8.jar,org.slf4j_slf4j-api-1.7.16.jar'),\n", + " ('spark.repl.local.jars',\n", + " 
'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-val
ue-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grp
c_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", + " ('spark.driver.host',\n", + " 'hub-msca-bdp-dphub-students-abejburton-m.c.msca-bdp-student-ap.internal'),\n", + " ('spark.sql.cbo.enabled', 'true'),\n", + " ('spark.yarn.historyServer.address',\n", + " 'hub-msca-bdp-dphub-students-abejburton-m:18080'),\n", + " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',\n", + " 'http://hub-msca-bdp-dphub-students-abejburton-m:8088/proxy/application_1700503299623_0001'),\n", + " ('spark.executorEnv.PYTHONPATH',\n", + " 
'/usr/lib/spark/python/lib/py4j-0.10.9-src.zip:/usr/lib/spark/python/:{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip{{PWD}}/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar{{PWD}}/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar{{PWD}}/com.typesafe_config-1.4.2.jar{{PWD}}/org.rocksdb_rocksdbjni-6.29.5.jar{{PWD}}/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar{{PWD}}/com.github.universal-automata_liblevenshtein-3.0.0.jar{{PWD}}/com.google.cloud_google-cloud-storage-2.16.0.jar{{PWD}}/com.navigamez_greex-1.0.jar{{PWD}}/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar{{PWD}}/it.unimi.dsi_fastutil-7.0.12.jar{{PWD}}/org.projectlombok_lombok-1.16.8.jar{{PWD}}/com.google.guava_guava-31.1-jre.jar{{PWD}}/com.google.guava_failureaccess-1.0.1.jar{{PWD}}/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar{{PWD}}/com.google.errorprone_error_prone_annotations-2.16.jar{{PWD}}/com.google.j2objc_j2objc-annotations-1.3.jar{{PWD}}/com.google.http-client_google-http-client-1.42.3.jar{{PWD}}/io.opencensus_opencensus-contrib-http-util-0.31.1.jar{{PWD}}/com.google.http-client_google-http-client-jackson2-1.42.3.jar{{PWD}}/com.google.http-client_google-http-client-gson-1.42.3.jar{{PWD}}/com.google.api-client_google-api-client-2.1.1.jar{{PWD}}/commons-codec_commons-codec-1.15.jar{{PWD}}/com.google.oauth-client_google-oauth-client-1.34.1.jar{{PWD}}/com.google.http-client_google-http-client-apache-v2-1.42.3.jar{{PWD}}/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar{{PWD}}/com.google.code.gson_gson-2.10.jar{{PWD}}/com.google.cloud_google-cloud-core-2.9.0.jar{{PWD}}/com.google.auto.value_auto-value-annotations-1.10.1.jar{{PWD}}/com.google.cloud_google-cloud-core-http-2.9.0.jar{{PWD}}/com.google.http-client_google-http-client-appengine-1.42.3.jar{{PWD}}/com.google.api_gax-httpjson-0.105.1.jar{{PWD}}/com.google.cloud_google-cloud-core-grpc-2.9.0.jar{{PWD}}/io.grpc_grpc-core-1.51.0.jar{{PWD}}/com.google.api_gax-2.20.1.jar{{PWD}}/com.google.api_gax-grpc
-2.20.1.jar{{PWD}}/io.grpc_grpc-alts-1.51.0.jar{{PWD}}/io.grpc_grpc-grpclb-1.51.0.jar{{PWD}}/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar{{PWD}}/io.grpc_grpc-protobuf-1.51.0.jar{{PWD}}/com.google.auth_google-auth-library-credentials-1.13.0.jar{{PWD}}/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar{{PWD}}/com.google.api_api-common-2.2.2.jar{{PWD}}/javax.annotation_javax.annotation-api-1.3.2.jar{{PWD}}/io.opencensus_opencensus-api-0.31.1.jar{{PWD}}/io.grpc_grpc-context-1.51.0.jar{{PWD}}/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar{{PWD}}/com.google.protobuf_protobuf-java-3.21.10.jar{{PWD}}/com.google.protobuf_protobuf-java-util-3.21.10.jar{{PWD}}/com.google.api.grpc_proto-google-common-protos-2.11.0.jar{{PWD}}/org.threeten_threetenbp-1.6.4.jar{{PWD}}/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.fasterxml.jackson.core_jackson-core-2.14.1.jar{{PWD}}/com.google.code.findbugs_jsr305-3.0.2.jar{{PWD}}/io.grpc_grpc-api-1.51.0.jar{{PWD}}/io.grpc_grpc-auth-1.51.0.jar{{PWD}}/io.grpc_grpc-stub-1.51.0.jar{{PWD}}/org.checkerframework_checker-qual-3.28.0.jar{{PWD}}/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar{{PWD}}/io.grpc_grpc-protobuf-lite-1.51.0.jar{{PWD}}/com.google.android_annotations-4.1.1.4.jar{{PWD}}/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar{{PWD}}/io.grpc_grpc-netty-shaded-1.51.0.jar{{PWD}}/io.perfmark_perfmark-api-0.26.0.jar{{PWD}}/io.grpc_grpc-googleapis-1.51.0.jar{{PWD}}/io.grpc_grpc-xds-1.51.0.jar{{PWD}}/io.opencensus_opencensus-proto-0.2.0.jar{{PWD}}/io.grpc_grpc-services-1.51.0.jar{{PWD}}/com.google.re2j_re2j-1.6.jar{{PWD}}/dk.brics.automaton_automaton-1.11-8.jar{{PWD}}/org.slf4j_slf4j-api-1.7.16.jar'),\n", + " ('spark.yarn.dist.jars',\n", + " 
'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-val
ue-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grp
c_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", + " ('spark.dataproc.sql.parquet.enableFooterCache', 'true'),\n", + " ('spark.driver.memory', '4g'),\n", + " ('spark.eventLog.dir',\n", + " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/bbc53cbc-8fb2-40b7-b9b3-62fefb76a670/spark-job-history'),\n", + " ('spark.sql.warehouse.dir', 'file:/spark-warehouse'),\n", + " ('spark.yarn.executor.failuresValidityInterval', '1h'),\n", + " ('spark.yarn.am.memory', '640m'),\n", + " ('spark.cores.max', '4'),\n", + " ('spark.executor.cores', '4'),\n", + " ('spark.jars.packages',\n", + " 'com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.0,graphframes:graphframes:0.8.2-spark3.1-s_2.12'),\n", + " ('spark.executor.instances', '2'),\n", + " ('spark.dataproc.listeners',\n", + " 'com.google.cloud.spark.performance.DataprocMetricsListener'),\n", + " ('spark.driver.appUIAddress',\n", + " 'http://hub-msca-bdp-dphub-students-abejburton-m.c.msca-bdp-student-ap.internal:38579'),\n", + " ('spark.serializer.objectStreamReset', '100'),\n", + " ('spark.submit.deployMode', 'client'),\n", + " 
('spark.sql.cbo.joinReorder.enabled', 'true'),\n", + " ('spark.shuffle.service.enabled', 'true'),\n", + " ('spark.app.startTime', '1700503719000'),\n", + " ('spark.scheduler.mode', 'FAIR'),\n", + " ('spark.sql.adaptive.enabled', 'true'),\n", + " ('spark.yarn.jars', 'local:/usr/lib/spark/jars/*'),\n", + " ('spark.scheduler.minRegisteredResourcesRatio', '0.0'),\n", + " ('spark.master', 'yarn'),\n", + " ('spark.ui.port', '0'),\n", + " ('spark.rpc.message.maxSize', '512'),\n", + " ('spark.rdd.compress', 'True'),\n", + " ('spark.task.maxFailures', '10'),\n", + " ('spark.yarn.isPython', 'true'),\n", + " ('spark.dynamicAllocation.enabled', 'true'),\n", + " ('spark.ui.showConsoleProgress', 'true')]" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# read in packages, create the spark environment\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql import functions as F\n", + "\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "# Resource settings are passed to the builder so they apply when the session is created; mutating sparkContext._conf after getOrCreate() does not reconfigure the running application. Restart the kernel to change them.\n", + "spark = (SparkSession.builder.appName('supervised')\n", + "    .config('spark.executor.memory', '4g').config('spark.executor.cores', '4')\n", + "    .config('spark.cores.max', '4').config('spark.driver.memory', '4g')\n", + "    .getOrCreate())\n", + "conf = spark.sparkContext.getConf()  # copy of the session's configuration; `conf` is kept for any later cell that reads it\n", + "conf.getAll()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8bfe115e-abb4-4a36-8508-1bd17ce2c55c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-------------------+-------------------+-------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+-------------+--------------+-----+------------+----+---+\n", + "| ID| 
start_timestamp| end_timestamp|seconds|miles|pickup_tract|dropoff_tract|pickup_area|dropoff_area|Fare|Tip|total| pickup_lat| pickup_lon| dropoff_lat| dropoff_lon|month|day_of_month|hour|day|\n", + "+--------------------+-------------------+-------------------+-------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+-------------+--------------+-----+------------+----+---+\n", + "|625e77ae6e0ff7191...|2018-11-06 19:00:00|2018-11-06 19:15:00| 1142| 5.8| 17031063400| 17031010400| 6| 1|12.5| 0| 15.0|41.9346591566|-87.6467297286| 42.004764559| -87.659122427| 11| 6| 19| 3|\n", + "|62945fdb2e70957f0...|2018-11-06 19:00:00|2018-11-06 19:00:00| 341| 1.2| 17031081800| 17031833000| 8| 28| 5.0| 0| 7.5|41.8932163595|-87.6378442095|41.8852813201|-87.6572331997| 11| 6| 19| 3|\n", + "|6dc03f91e4480d237...|2018-11-06 19:00:00|2018-11-06 19:00:00| 558| 1.2| 17031070400| 17031061500| 7| 6| 7.5| 0| 10.3|41.9289672664|-87.6561568309|41.9452823311|-87.6615450961| 11| 6| 19| 3|\n", + "|773894079a526afa1...|2018-11-06 19:00:00|2018-11-06 19:30:00| 1047| 2.8| 17031832200| 17031062100| 22| 6|10.0| 2| 14.5|41.9204515116|-87.6799547678|41.9426918444|-87.6517705068| 11| 6| 19| 3|\n", + "|7acf0a7f2edfbe546...|2018-11-06 19:00:00|2018-11-06 19:00:00| 502| 1.3| 17031839100| 17031081700| 32| 8| 2.5| 0| 5.0|41.8809944707|-87.6327464887|41.8920421365|-87.6318639497| 11| 6| 19| 3|\n", + "+--------------------+-------------------+-------------------+-------+-----+------------+-------------+-----------+------------+----+---+-----+-------------+--------------+-------------+--------------+-----+------------+----+---+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "# read in rideshare data for all years, concatenate, create appropriate partitioning\n", + "# we are dropping 2020 because covid will affect the performance of our model\n", + "df_2018 = 
spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2018.csv\", inferSchema=True, header=True)\n", + "df_2019 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2019.csv\", inferSchema=True, header=True)\n", + "df_2021 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2021.csv\", inferSchema=True, header=True)\n", + "df_2022 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/rides_2022.csv\", inferSchema=True, header=True)\n", + "# unionByName matches columns by name rather than position, so a yearly file whose columns\n", + "# are ordered differently raises or lines up correctly instead of silently misaligning the data\n", + "df_all = df_2018.unionByName(df_2019).unionByName(df_2021).unionByName(df_2022)\n", + "df_all.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9c7c7fa9-7a39-46eb-93fd-c7006d01c03e", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: REPARTITION\n", + "\n", + "# we will need a year column in this model\n", + "df_all = df_all.withColumn('year', F.year(df_all.start_timestamp))" + ] + }, + { + "cell_type": "markdown", + "id": "59c15480-30c0-48c8-b9a5-6aa122ff1325", + "metadata": {}, + "source": [ + "## Notes for Harsh:\n", + "\n", + "I'm assuming we are predicting using the full dataset and not restricting ourselves to being within the program hours.\n", + "\n", + "I started writing code that goes through the steps that I think will probably be necessary. The code is unfinished because I ran out of time to test it all or formally think through the problems I was seeing. Feel free to change things or make your own assumptions.\n", + "\n", + "Here is the process that I was thinking of. I was trying all this on a sample dataframe so I could code faster.\n", + "1. Get Daily counts for each community area\n", + "2. pivot so that there is a column for each community area (y is when hyde park or woodlawn or kenwood are 1, otherwise the column is a feature)\n", + "3. merge with daily weather data\n", + "4. 
separate out y (counts for every day in program area) and X (column of counts for each community area outside of the program area)\n", + "5. filter for pre-program rides.\n", + "6. create supervised model on all that data\n", + "7. predict the next month or so of counts after sept 29 2021\n", + "8. Graph predictions versus reality\n", + "9. maybe do the same thing in 2023 once data is available" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "540c7bff-6eac-40c2-a9fe-9b6843f7d546", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 26:=====================================================>(433 + 1) / 434]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+-----+----+-----------+------------+---------+\n", + "|day|month|year|pickup_area|dropoff_area|count(ID)|\n", + "+---+-----+----+-----------+------------+---------+\n", + "| 4| 12|2018| 7| 24| 1|\n", + "| 5| 1|2019| 24| 8| 1|\n", + "| 3| 8|2019| 28| 8| 1|\n", + "| 2| 3|2021| 28| 8| 1|\n", + "| 6| 4|2021| 28| 28| 1|\n", + "+---+-----+----+-----------+------------+---------+\n", + "only showing top 5 rows\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "# take a sample to test these operations out on first\n", + "# (seeded so the same rows come back on every run)\n", + "sample_raw = df_all.sample(fraction=1/1000000, seed=42)\n", + "\n", + "# get only the columns needed for the model\n", + "# `day` is the day of week (the 2018-11-06 rows shown earlier have day=3), so day_of_month\n", + "# is required as well to pin a row to a single calendar date\n", + "selected_columns = [\"pickup_area\",\"dropoff_area\",\"day_of_month\",\"day\",\"month\",\"year\",\"ID\"]\n", + "sample_selected = sample_raw.select(selected_columns)\n", + "\n", + "\n", + "# group the rideshare data by calendar day and community area and create counts\n", + "# (`day` rides along as a grouping key; it is fully determined by the date, so it does not split groups)\n", + "sample_df = sample_selected.groupby('day_of_month','day',\"month\",\"year\",'pickup_area','dropoff_area').agg({'ID':'count'})\n", + "sample_df.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a65e426-7aef-4891-942d-538025cd845e", + "metadata": {}, + "outputs": [], + 
"source": [ + "# the output of the sample df above looks off. investigate\n", + "\n", + "# pivot so that each area is a column\n", + "# should probably create a new variable that denotes in program rides, and figure out what combination of pickup or dropoff area we want to use\n", + "\n", + "# group on every date column the aggregated frame carries\n", + "# (day_of_month is only present if the aggregation above kept it)\n", + "date_cols = [c for c in (\"day_of_month\", \"day\", \"month\", \"year\") if c in sample_df.columns]\n", + "\n", + "# agg({'ID':'count'}) names its result column count(ID), not count;\n", + "# sum (rather than first) so rides from every pickup area into a dropoff area are all counted\n", + "pivoted_df = sample_df.groupBy(*date_cols).pivot(\"dropoff_area\").agg({\"count(ID)\": \"sum\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1096e22b-90a4-46be-85ad-5ade6572e853", + "metadata": {}, + "outputs": [], + "source": [ + "# read in weather data, merge with rideshare data" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "PySpark", + "language": "python", + "name": "pyspark" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}