diff --git a/supervised_ml.ipynb b/supervised_ml.ipynb index 9a0821e..56f2d79 100644 --- a/supervised_ml.ipynb +++ b/supervised_ml.ipynb @@ -21,51 +21,51 @@ { "data": { "text/plain": [ - "[('spark.stage.maxConsecutiveAttempts', '10'),\n", + "[('spark.app.startTime', '1700954655159'),\n", " ('spark.dynamicAllocation.minExecutors', '1'),\n", - " ('spark.eventLog.enabled', 'true'),\n", + " ('spark.stage.maxConsecutiveAttempts', '10'),\n", " ('spark.submit.pyFiles',\n", " '/root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,/root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,/root/.ivy2/jars/com.typesafe_config-1.4.2.jar,/root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,/root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,/root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,/root/.ivy2/jars/com.navigamez_greex-1.0.jar,/root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,/root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,/root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,/root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,/root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,/root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,/root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,/root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,/root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,/root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,/root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,/root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,/root/.ivy2/jars/com.google.http-client_google-htt
p-client-apache-v2-1.42.3.jar,/root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,/root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,/root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,/root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,/root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,/root/.ivy2/jars/com.google.api_gax-2.20.1.jar,/root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,/root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,/root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,/root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,/root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,/root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,/root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,/root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,/root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.
2.jar,/root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,/root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,/root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,/root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,/root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,/root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,/root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,/root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,/root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,/root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,/root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,/root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,/root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", - " ('spark.ui.proxyBase', '/proxy/application_1700881147707_0001'),\n", - " ('spark.driver.host',\n", - " 'hub-msca-bdp-dphub-students-test-harshpachisia-m.c.msca-bdp-student-ap.internal'),\n", + " ('spark.eventLog.enabled', 'true'),\n", + " ('spark.eventLog.dir',\n", + " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/a9d11564-c085-445e-af02-531d1b67672c/spark-job-history'),\n", " ('spark.dataproc.sql.joinConditionReorder.enabled', 'true'),\n", " ('spark.sql.autoBroadcastJoinThreshold', '191m'),\n", - " ('spark.eventLog.dir',\n", - " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/6f963ebc-55a3-483f-8657-eb388c77cf9d/spark-job-history'),\n", - " ('spark.app.id', 'application_1700881147707_0001'),\n", + " ('spark.yarn.historyServer.address',\n", + " 'hub-msca-bdp-dphub-students-test-abejburton-m:18080'),\n", " ('spark.kryoserializer.buffer.max', '2000M'),\n", + " ('spark.history.fs.logDirectory',\n", + " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/a9d11564-c085-445e-af02-531d1b67672c/spark-job-history'),\n", " ('spark.serializer', 
'org.apache.spark.serializer.KryoSerializer'),\n", " ('spark.dataproc.sql.local.rank.pushdown.enabled', 'true'),\n", " ('spark.driver.maxResultSize', '0'),\n", " ('spark.yarn.unmanagedAM.enabled', 'true'),\n", " ('spark.ui.filters',\n", " 'org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter'),\n", - " ('spark.driver.appUIAddress',\n", - " 'http://hub-msca-bdp-dphub-students-test-harshpachisia-m.c.msca-bdp-student-ap.internal:43227'),\n", " ('spark.metrics.namespace',\n", " 'app_name:${spark.app.name}.app_id:${spark.app.id}'),\n", " ('spark.executor.memory', '4g'),\n", " ('spark.dataproc.sql.optimizer.leftsemijoin.conversion.enabled', 'true'),\n", " ('spark.hadoop.hive.execution.engine', 'mr'),\n", " ('spark.executor.id', 'driver'),\n", + " ('spark.dataproc.metrics.listener.metrics.collector.hostname',\n", + " 'hub-msca-bdp-dphub-students-test-abejburton-m'),\n", + " ('spark.driver.host',\n", + " 'hub-msca-bdp-dphub-students-test-abejburton-m.c.msca-bdp-student-ap.internal'),\n", + " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',\n", + " 'hub-msca-bdp-dphub-students-test-abejburton-m'),\n", " ('spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version', '2'),\n", " ('spark.dynamicAllocation.maxExecutors', '10000'),\n", " ('spark.yarn.dist.pyFiles',\n", " 
'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-val
ue-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grp
c_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", - " ('spark.driver.port', '37523'),\n", - " ('spark.yarn.historyServer.address',\n", - " 'hub-msca-bdp-dphub-students-test-harshpachisia-m:18080'),\n", " ('spark.yarn.am.attemptFailuresValidityInterval', '1h'),\n", " ('spark.app.name', 'Spark Updated Conf'),\n", " ('spark.sql.catalogImplementation', 'hive'),\n", - " ('spark.dataproc.metrics.listener.metrics.collector.hostname',\n", - " 'hub-msca-bdp-dphub-students-test-harshpachisia-m'),\n", + " ('spark.executorEnv.OPENBLAS_NUM_THREADS', '1'),\n", " ('spark.yarn.secondary.jars',\n", " 
'com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,com.typesafe_config-1.4.2.jar,org.rocksdb_rocksdbjni-6.29.5.jar,com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,com.github.universal-automata_liblevenshtein-3.0.0.jar,com.google.cloud_google-cloud-storage-2.16.0.jar,com.navigamez_greex-1.0.jar,com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,it.unimi.dsi_fastutil-7.0.12.jar,org.projectlombok_lombok-1.16.8.jar,com.google.guava_guava-31.1-jre.jar,com.google.guava_failureaccess-1.0.1.jar,com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,com.google.errorprone_error_prone_annotations-2.16.jar,com.google.j2objc_j2objc-annotations-1.3.jar,com.google.http-client_google-http-client-1.42.3.jar,io.opencensus_opencensus-contrib-http-util-0.31.1.jar,com.google.http-client_google-http-client-jackson2-1.42.3.jar,com.google.http-client_google-http-client-gson-1.42.3.jar,com.google.api-client_google-api-client-2.1.1.jar,commons-codec_commons-codec-1.15.jar,com.google.oauth-client_google-oauth-client-1.34.1.jar,com.google.http-client_google-http-client-apache-v2-1.42.3.jar,com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,com.google.code.gson_gson-2.10.jar,com.google.cloud_google-cloud-core-2.9.0.jar,com.google.auto.value_auto-value-annotations-1.10.1.jar,com.google.cloud_google-cloud-core-http-2.9.0.jar,com.google.http-client_google-http-client-appengine-1.42.3.jar,com.google.api_gax-httpjson-0.105.1.jar,com.google.cloud_google-cloud-core-grpc-2.9.0.jar,io.grpc_grpc-core-1.51.0.jar,com.google.api_gax-2.20.1.jar,com.google.api_gax-grpc-2.20.1.jar,io.grpc_grpc-alts-1.51.0.jar,io.grpc_grpc-grpclb-1.51.0.jar,org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,io.grpc_grpc-protobuf-1.51.0.jar,com.google.auth_google-auth-library-credentials-1.13.0.jar,com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,com.google.api_api-common-2.2.2.jar,javax.annotation_javax.annotation-api-1.3.2.jar,io.open
census_opencensus-api-0.31.1.jar,io.grpc_grpc-context-1.51.0.jar,com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,com.google.protobuf_protobuf-java-3.21.10.jar,com.google.protobuf_protobuf-java-util-3.21.10.jar,com.google.api.grpc_proto-google-common-protos-2.11.0.jar,org.threeten_threetenbp-1.6.4.jar,com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,com.fasterxml.jackson.core_jackson-core-2.14.1.jar,com.google.code.findbugs_jsr305-3.0.2.jar,io.grpc_grpc-api-1.51.0.jar,io.grpc_grpc-auth-1.51.0.jar,io.grpc_grpc-stub-1.51.0.jar,org.checkerframework_checker-qual-3.28.0.jar,com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,io.grpc_grpc-protobuf-lite-1.51.0.jar,com.google.android_annotations-4.1.1.4.jar,org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,io.grpc_grpc-netty-shaded-1.51.0.jar,io.perfmark_perfmark-api-0.26.0.jar,io.grpc_grpc-googleapis-1.51.0.jar,io.grpc_grpc-xds-1.51.0.jar,io.opencensus_opencensus-proto-0.2.0.jar,io.grpc_grpc-services-1.51.0.jar,com.google.re2j_re2j-1.6.jar,dk.brics.automaton_automaton-1.11-8.jar,org.slf4j_slf4j-api-1.7.16.jar'),\n", " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',\n", - " 'http://hub-msca-bdp-dphub-students-test-harshpachisia-m:8088/proxy/application_1700881147707_0001'),\n", - " ('spark.executorEnv.OPENBLAS_NUM_THREADS', '1'),\n", + " 'http://hub-msca-bdp-dphub-students-test-abejburton-m:8088/proxy/application_1700954437945_0001'),\n", " ('spark.repl.local.jars',\n", " 
'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-val
ue-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grp
c_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", " ('spark.sql.cbo.enabled', 'true'),\n", @@ -77,21 +77,18 @@ " ('spark.driver.memory', '4g'),\n", " ('spark.sql.warehouse.dir', 'file:/spark-warehouse'),\n", " ('spark.yarn.executor.failuresValidityInterval', '1h'),\n", + " ('spark.ui.proxyBase', '/proxy/application_1700954437945_0001'),\n", " ('spark.yarn.am.memory', '640m'),\n", - " ('spark.history.fs.logDirectory',\n", - " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/6f963ebc-55a3-483f-8657-eb388c77cf9d/spark-job-history'),\n", " ('spark.cores.max', '4'),\n", " ('spark.executor.cores', '4'),\n", - " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',\n", - " 'hub-msca-bdp-dphub-students-test-harshpachisia-m'),\n", " ('spark.jars.packages',\n", " 'com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.0,graphframes:graphframes:0.8.2-spark3.1-s_2.12'),\n", - " ('spark.app.startTime', '1700882226475'),\n", " ('spark.executor.instances', '2'),\n", " ('spark.dataproc.listeners',\n", " 
'com.google.cloud.spark.performance.DataprocMetricsListener'),\n", " ('spark.serializer.objectStreamReset', '100'),\n", " ('spark.submit.deployMode', 'client'),\n", + " ('spark.app.id', 'application_1700954437945_0001'),\n", " ('spark.sql.cbo.joinReorder.enabled', 'true'),\n", " ('spark.shuffle.service.enabled', 'true'),\n", " ('spark.scheduler.mode', 'FAIR'),\n", @@ -100,8 +97,11 @@ " ('spark.scheduler.minRegisteredResourcesRatio', '0.0'),\n", " ('spark.master', 'yarn'),\n", " ('spark.ui.port', '0'),\n", + " ('spark.driver.appUIAddress',\n", + " 'http://hub-msca-bdp-dphub-students-test-abejburton-m.c.msca-bdp-student-ap.internal:46861'),\n", " ('spark.rpc.message.maxSize', '512'),\n", " ('spark.rdd.compress', 'True'),\n", + " ('spark.driver.port', '44505'),\n", " ('spark.task.maxFailures', '10'),\n", " ('spark.yarn.isPython', 'true'),\n", " ('spark.dynamicAllocation.enabled', 'true'),\n", @@ -140,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 2, "id": "8bfe115e-abb4-4a36-8508-1bd17ce2c55c", "metadata": {}, "outputs": [ @@ -787,7 +787,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 3, "id": "abf8091a-9662-4378-8fe5-b2ece46a6a14", "metadata": {}, "outputs": [], @@ -799,7 +799,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 4, "id": "9c7c7fa9-7a39-46eb-93fd-c7006d01c03e", "metadata": {}, "outputs": [], @@ -848,33 +848,9 @@ "df_all.printSchema()" ] }, - { - "cell_type": "markdown", - "id": "59c15480-30c0-48c8-b9a5-6aa122ff1325", - "metadata": {}, - "source": [ - "## Next steps\n", - "\n", - "I'm assuming we are predicting using the full dataset and not restricting ourselves to being within the program hours.\n", - "\n", - "I started writing code that goes through the steps that I think will probably be necessary. The code is unfinished because I ran out of time to test in all or formally think through the problems I was seeing. 
Feel free to change things or make your own assumptions.\n", - "\n", - "Here is the process that I was thinking of. I was trying all this on a sample dataframe so I could code faster.\n", - "1. Get Daily counts for each community area\n", - "2. pivot so that there is a column for each community area (y is when hyde park or woodlawn or kenwood are 1, otherwise the column is a feature)\n", - "3. merge with daily weather data\n", - "4. separate out y (counts for every day in program area) and X (column of counts for each community area outside of the program area)\n", - "5. filter for pre-program rides.\n", - "- Research which model works best and which one is the most parallelizable\n", - "6. create supervised model on all that data\n", - "7. predict the next month or so of counts after sept 29 2021\n", - "8. Graph predictions versus reality\n", - "9. maybe do the same thing in 2023 once data is available" - ] - }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 5, "id": "540c7bff-6eac-40c2-a9fe-9b6843f7d546", "metadata": {}, "outputs": [], @@ -905,7 +881,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 6, "id": "14b0b19a-36ad-4e17-bf30-1e9fcdaea452", "metadata": {}, "outputs": [], @@ -917,7 +893,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 7, "id": "388446e2-2cd4-4bb8-bfa0-f204b1359427", "metadata": {}, "outputs": [], @@ -929,7 +905,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 8, "id": "3e028095-2ff1-4e5f-aa03-7dafe417168e", "metadata": {}, "outputs": [ @@ -937,7 +913,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[Stage 413:================================================> (132 + 10) / 142]\r" + "[Stage 26:==================================================> (95 + 9) / 104]\r" ] }, { @@ -947,16 +923,16 @@ "+---+-----+----+----+-----------------+------------------+------------+\n", 
"|day|month|year|area|sum(pickup_count)|sum(dropoff_count)|total_counts|\n", "+---+-----+----+----+-----------------+------------------+------------+\n", - "| 6| 11|2018| 40| 4020| 3638| 7658|\n", - "| 2| 11|2018| 67| 2938| 2666| 5604|\n", - "| 2| 11|2018| 29| 5383| 5348| 10731|\n", - "| 6| 12|2018| 43| 9437| 8841| 18278|\n", - "| 7| 2|2019| 28| 119193| 125444| 244637|\n", - "| 5| 1|2019| 24| 81650| 75753| 157403|\n", - "| 4| 8|2019| 71| 4619| 4354| 8973|\n", - "| 6| 5|2019| 25| 15544| 14777| 30321|\n", - "| 7| 2|2019| 43| 10001| 9494| 19495|\n", - "| 6| 4|2019| 48| 1595| 1655| 3250|\n", + "| 6| 7|2022| 8| 167867| 172693| 340560|\n", + "| 4| 8|2022| 6| 40523| 39734| 80257|\n", + "| 7| 11|2022| 42| 7351| 7392| 14743|\n", + "| 7| 12|2022| 76| 18491| 29572| 48063|\n", + "| 6| 1|2022| 76| 13224| 18168| 31392|\n", + "| 7| 11|2022| 73| 2196| 2170| 4366|\n", + "| 2| 1|2022| 1| 6929| 7019| 13948|\n", + "| 4| 8|2022| 56| 9706| 11844| 21550|\n", + "| 1| 12|2022| 5| 7162| 7406| 14568|\n", + "| 5| 4|2022| 1| 7085| 6183| 13268|\n", "+---+-----+----+----+-----------------+------------------+------------+\n", "only showing top 10 rows\n", "\n" @@ -1001,7 +977,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 9, "id": "d57f2088-9ca6-4cc8-a8fe-0cca835fecf9", "metadata": {}, "outputs": [ @@ -1009,6 +985,7 @@ "name": "stderr", "output_type": "stream", "text": [ + "23/11/25 23:32:49 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. 
This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n", " \r" ] }, @@ -1019,15 +996,15 @@ "+---+-----+----+-----+-----+-----+-----+-----+------+------+------+----+----+----+----+----+-----+-----+-----+----+----+-----+----+-----+-----+-----+------+-----+----+-----+------+-----+-----+-----+------+-----+-----+-----+----+----+-----+-----+----+-----+-----+-----+-----+----+----+----+----+-----+----+----+----+----+----+---+-----+----+----+----+-----+----+----+----+----+----+-----+----+-----+-----+----+-----+----+----+----+----+-----+-----+\n", "|day|month|year| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19| 20| 21| 22| 23| 24| 25| 26| 27| 28| 29| 30| 31| 32| 33| 34| 35| 36| 37| 38| 39| 40| 41| 42| 43| 44| 45| 46| 47| 48| 49| 50| 51| 52| 53| 54| 55| 56| 57| 58| 59| 60| 61| 62| 63| 64| 65| 66| 67| 68| 69| 70| 71| 72| 73| 74| 75| 76| 77|\n", "+---+-----+----+-----+-----+-----+-----+-----+------+------+------+----+----+----+----+----+-----+-----+-----+----+----+-----+----+-----+-----+-----+------+-----+----+-----+------+-----+-----+-----+------+-----+-----+-----+----+----+-----+-----+----+-----+-----+-----+-----+----+----+----+----+-----+----+----+----+----+----+---+-----+----+----+----+-----+----+----+----+----+----+-----+----+-----+-----+----+-----+----+----+----+----+-----+-----+\n", - "| 2| 7|2021| 9005| 7260|17397| 7642| 7999| 47541| 33715|117551| 521|2660|2410|1168|1936| 5671| 5780| 8063|2010| 913| 6787|2356| 7659|21888| 7997| 36890|12275|3120| 4611| 53566| 7252| 5243| 8074| 64260|16000| 3827| 5687|1408| 671| 5674| 3668|2866| 9282| 5301| 9215| 6354|1275|3264| 461|1337| 4831|1325|1588| 472|2478| 835|255|11814|1417|3425|1885| 4057|4469|1062|2325| 917|2063| 4523|3978| 4877| 6248|2526| 5897|1059|2890| 387|1875|36306|12384|\n", "| 5| 10|2021|12199| 9107|24884|10934|12710| 62904| 55515|180042| 674|3314|3259|1323|3077| 7226| 7209|10819|2898|1116| 8828|3030|11178|34577|10307| 59723|14084|3590| 5678| 99556| 8672| 
7147|12736|104048|24877| 3684| 7049|1553| 824| 6866| 5464|3602|21490| 8095|10986| 8677|1848|3884| 560|2047| 6449|2113|2097| 622|2949| 898|326|15899|2444|4881|2795| 5363|6014|1560|3048|1163|2742| 5662|4618| 5772| 7720|3439| 7108|1518|3534| 550|2453|47198|16199|\n", + "| 2| 7|2021| 9005| 7260|17397| 7642| 7999| 47541| 33715|117551| 521|2660|2410|1168|1936| 5671| 5780| 8063|2010| 913| 6787|2356| 7659|21888| 7997| 36890|12275|3120| 4611| 53566| 7252| 5243| 8074| 64260|16000| 3827| 5687|1408| 671| 5674| 3668|2866| 9282| 5301| 9215| 6354|1275|3264| 461|1337| 4831|1325|1588| 472|2478| 835|255|11814|1417|3425|1885| 4057|4469|1062|2325| 917|2063| 4523|3978| 4877| 6248|2526| 5897|1059|2890| 387|1875|36306|12384|\n", "| 2| 7|2023| 452| 347| 900| 431| 381| 2452| 1759| 6299| 27| 131| 135| 59| 113| 331| 309| 460| 114| 44| 345| 130| 398| 1214| 403| 1923| 633| 156| 200| 3404| 379| 264| 452| 4042| 1131| 196| 349| 62| 47| 280| 231| 167| 632| 336| 452| 372| 84| 165| 37| 90| 317| 86| 114| 23| 157| 38| 17| 755| 83| 200| 121| 206| 244| 55| 149| 54| 99| 233| 220| 282| 355| 136| 342| 74| 202| 24| 119| 1999| 624|\n", - "| 6| 1|2022|13941|10289|29467|13586|15451| 94575| 80801|210493|1000|3892|3486|1586|2974| 8731| 9212|13110|3319|1318|10251|3698|13416|46027|12440| 79758|17915|4286| 6360|118913|10588| 8039|15913| 93354|22282| 4688| 8157|1862|1026| 8014| 6155|4489|25584| 9771|13227|10423|2183|4636| 739|2299| 7620|2338|2513| 752|3682|1190|487|10984|2607|5532|3128| 6166|6861|1899|3610|1470|3237| 7135|5899| 7034| 9444|3664| 9049|1921|4355| 746|2923|31392|19582|\n", "| 4| 12|2022|13534|10789|26607|13908|14898| 73450| 56285|192604|1031|4105|3962|1865|3414| 8913| 9367|13438|3610|1451|10446|3656|13370|41366|12193| 65188|17799|4458| 7188|124088|11197| 8620|15505|123219|29952| 5589| 8510|2051|1159| 9143| 6917|4444|25007|10275|14128|11708|2262|5106| 862|2495| 8716|2723|2908| 753|4145|1309|439|17321|2865|5458|3348| 5973|7057|1971|3514|1572|3215| 7133|5790| 7826| 9908|4047| 9907|2026|4732| 
695|3250|49998|18831|\n", - "| 1| 8|2022|13296| 9439|32329|13087|15270|105985| 73132|228688|1446|4394|3636|1833|2587| 9013| 8194|13321|3025|1106| 9205|3234|14227|46734|10686| 76607|14257|3440| 5457|101705| 7272| 7232|16326|101827|32238| 8668| 8570|1524| 928| 6713| 5401|3969|17821| 7527|11246| 7882|1540|3706| 520|1768| 6252|1636|1870| 700|3050| 903|280|22217|1966|4391|2438| 6302|5531|1702|3129|1605|2887| 5693|4800| 5282| 7928|2816| 7186|1695|3311| 955|2488|55147|20351|\n", + "| 6| 1|2022|13941|10289|29467|13586|15451| 94575| 80801|210493|1000|3892|3486|1586|2974| 8731| 9212|13110|3319|1318|10251|3698|13416|46027|12440| 79758|17915|4286| 6360|118913|10588| 8039|15913| 93354|22282| 4688| 8157|1862|1026| 8014| 6155|4489|25584| 9771|13227|10423|2183|4636| 739|2299| 7620|2338|2513| 752|3682|1190|487|10984|2607|5532|3128| 6166|6861|1899|3610|1470|3237| 7135|5899| 7034| 9444|3664| 9049|1921|4355| 746|2923|31392|19582|\n", "| 7| 1|2022|18806|12136|42743|19325|22814|172792|143120|352533|1718|5127|4171|2165|3057|11874|10871|16888|3747|1437|11811|4244|18677|72617|14263|127816|19298|4748| 6853|141436|10020| 8756|20224|120688|34626| 7479| 9024|2030|1129| 8667| 6848|4660|28835|11337|14475|10306|2194|4961| 667|2345| 7874|2120|2467| 710|3958|1205|346| 9587|2493|5522|3424| 7502|6996|1920|3879|1636|3363| 7526|6009| 6965|10058|3664| 9436|2334|4479|1115|3095|27717|26732|\n", "| 4| 7|2023| 463| 365| 965| 463| 444| 2559| 1962| 7393| 34| 127| 131| 59| 112| 341| 302| 429| 97| 56| 403| 132| 443| 1310| 433| 2120| 759| 192| 267| 4459| 391| 336| 526| 4889| 1197| 239| 321| 56| 46| 344| 243| 175| 792| 343| 541| 363| 83| 191| 31| 84| 375| 132| 92| 28| 155| 41| 13| 700| 90| 197| 120| 223| 256| 69| 149| 41| 109| 247| 257| 267| 370| 152| 375| 74| 194| 25| 124| 1821| 609|\n", "| 2| 12|2018|19802|15539|43393|21356|23301|117250| 90763|301912|1100|4603|4795|2425|4764|12802|11522|19919|4383|1636|14022|5417|21532|68183|16957|110900|18190|4607| 8161|171992|12190|11649|23754|198173|29469| 
8323|13998|2705|1938|12021|10056|5742|27483|10491|15387|11743|2257|5055| 782|2719| 8513|2142|2348| 713|3407| 864|430|21408|3815|8522|4970|11152|9706|2631|5835|2237|4789| 8749|6969| 8467|11051|4597| 9713|2362|4425| 929|3084|51232|28031|\n", + "| 1| 8|2022|13296| 9439|32329|13087|15270|105985| 73132|228688|1446|4394|3636|1833|2587| 9013| 8194|13321|3025|1106| 9205|3234|14227|46734|10686| 76607|14257|3440| 5457|101705| 7272| 7232|16326|101827|32238| 8668| 8570|1524| 928| 6713| 5401|3969|17821| 7527|11246| 7882|1540|3706| 520|1768| 6252|1636|1870| 700|3050| 903|280|22217|1966|4391|2438| 6302|5531|1702|3129|1605|2887| 5693|4800| 5282| 7928|2816| 7186|1695|3311| 955|2488|55147|20351|\n", "| 3| 5|2023|16807|13341|33578|15704|15697| 86879| 63584|227565| 929|4833|4777|2111|4279|10515|10783|15338|4314|2029|13129|4497|14692|42292|15713| 69735|23553|6249| 9546|138643|14677|11315|17235|155372|44067| 9692|12508|2625|1687|11341| 9104|6239|43562|16333|18605|14355|2980|6865|1210|3395|11945|3708|3893|1018|5759|1775|633|24394|3771|7014|4114| 7801|9179|2355|5080|2115|4261| 9532|8674|10757|14098|5573|14056|2570|6896| 842|4693|71329|22182|\n", "| 3| 4|2021|10752| 7700|17863| 8097| 8166| 44654| 31723| 88534| 462|2339|2907| 982|2451| 6301| 6522| 8472|2233|1073| 8403|2923| 8328|22146|10603| 36694|18019|4651| 6295| 52198|10596| 6663| 9319| 47537|11252| 3260| 7161|1718| 942| 7237| 4423|3662|11469| 7158|12603| 8795|1757|4851| 675|2046| 7168|1849|2450| 576|3634|1059|260| 7632|1939|4179|2405| 4556|6223|1398|2987|1097|2590| 6248|5597| 7130| 8685|3223| 8347|1322|3891| 432|2563|15590|12931|\n", "| 5| 6|2023|18451|14352|36525|18935|20186|117204| 90040|294778|1256|5320|5346|2493|4144|11275|11802|17490|4709|1878|13671|5098|17893|55472|17084| 92824|26242|6684|10091|173452|15695|11370|20425|191290|51359| 7999|12958|2871|1842|12451| 9709|6780|35207|14715|20536|15518|3404|7328|1271|3774|12771|4065|4041|1064|6282|1765|679|30124|3856|6929|4332| 
8925|9870|2420|4868|2239|4358|10023|8958|11184|15503|5661|14911|2880|7158|1015|4907|82087|25320|\n", @@ -1036,8 +1013,8 @@ "| 6| 6|2021|12371| 8913|24889|11603|13836| 86502| 71364|199110| 988|3458|3230|1648|2548| 7349| 7707|11729|2636|1114| 8371|3000|11113|39300|10637| 72459|16010|4238| 5986| 91405| 9266| 6328|12768| 85373|22234| 5523| 7431|1743| 965| 7506| 5138|3736|15434| 7379|11968| 8401|1773|4176| 562|1845| 6354|1716|2192| 532|3307| 970|295|11939|2007|4147|2444| 5603|5676|1520|2841|1217|2649| 6014|5326| 6131| 8192|3181| 8166|1683|3668| 530|2565|32995|17873|\n", "| 5| 7|2019|19329|14198|47419|22225|26378|133951|124257|379055|1178|5080|4944|2693|4217|11916|11382|20303|4299|1760|13068|5321|23236|77784|16224|134828|18868|4342| 8478|207091|12300|11208|24766|241322|48453|11700|16106|2933|1651|12022| 9580|5574|27064|10211|15156|11706|2465|5030| 743|2539| 8769|2205|2576| 883|3502| 863|433|24153|3681|7892|4599|11856|9344|2679|5295|2469|4613| 8268|6481| 8123|10662|4611| 9820|2479|4919| 921|3009|62043|30475|\n", "| 3| 11|2022|15880|12131|31078|14865|15863| 73919| 60160|214459| 959|4542|4438|1943|4045| 9870|10023|14586|4147|1719|11962|4314|14547|42118|14140| 67054|20024|4836| 7883|133125|12585|10273|16786|148457|35501| 5557|10284|2489|1280|10137| 8355|5071|39935|14530|16406|13048|2599|5804| 962|2849| 9968|3281|3300| 941|4640|1404|545|22147|3389|6356|3749| 6760|8016|2197|4209|1791|3813| 8234|6939| 9057|11424|4844|11041|2208|5570| 876|3765|63512|20986|\n", + "| 2| 2|2021| 9183| 7546|14935| 7747| 7354| 33977| 24718| 67195| 409|2337|2404| 833|2225| 5894| 5945| 7652|2068|1018| 7260|2612| 7590|19048| 8670| 30216|13278|3523| 5011| 40704| 8110| 5834| 8311| 35505| 8889| 1823| 5562|1330| 715| 5581| 3495|3075| 9154| 5433|10297| 7210|1345|3967| 598|1805| 5907|1611|2049| 456|3202| 851|291| 6280|1670|4015|2282| 3874|5551|1243|2706| 980|2302| 5554|4482| 5622| 6722|2786| 6740|1064|3181| 389|2023|14378|10746|\n", "| 4| 2|2023|15458|11260|28204|14529|14833| 74643| 58242|189020| 
864|4181|4240|1726|4031| 9422| 9329|13571|3771|1497|11336|4042|14024|40407|13643| 66096|20899|5449| 8260|129142|13489| 9685|14687|133399|24428| 4547| 9962|2307|1329| 9933| 8143|5676|41225|14456|16586|13385|2594|6035|1137|2847|10257|3385|3169| 804|4796|1549|533|15089|3033|5810|3539| 6367|7949|1994|4156|1693|3454| 8092|7348| 9805|12322|4483|11787|2050|5439| 787|3873|46148|19838|\n", - "| 7| 8|2021|13207| 8601|31275|12881|16570|120202| 93666|276189|1505|4087|3534|1915|2381| 7736| 7730|12612|3000|1148| 8430|2967|12685|46681| 9949| 92253|13850|3553| 5375|107196|11734| 6516|16970|106287|36191| 8910| 9610|2405| 906| 6761| 4566|3477|12634| 6054|10065| 7393|1464|3282| 502|1532| 5394|1252|1742| 509|2678| 795|242| 9243|1740|4256|2552| 6477|5170|1405|2820|1161|2612| 5330|4320| 4849| 6729|2833| 6585|1588|2992| 798|2370|26502|20381|\n", "| 6| 2|2021|13911| 9972|22922|11569|11977| 68007| 55550|135886| 760|3083|3241|1270|2707| 7962| 8118|11141|2829|1274| 9956|3524|11010|32483|12558| 59532|20673|5550| 7129| 73369|12222| 7617|12828| 57966|14662| 2978| 8001|1928|1169| 8170| 5241|4483|13561| 8155|14385|10377|2016|5580| 822|2450| 8428|2214|2879| 650|4344|1145|363| 8688|2171|5180|2843| 5245|7489|1767|3403|1369|3128| 7447|6572| 7860|10370|3904| 9873|1713|4660| 554|2927|19113|16875|\n", "+---+-----+----+-----+-----+-----+-----+-----+------+------+------+----+----+----+----+----+-----+-----+-----+----+----+-----+----+-----+-----+-----+------+-----+----+-----+------+-----+-----+-----+------+-----+-----+-----+----+----+-----+-----+----+-----+-----+-----+-----+----+----+----+----+-----+----+----+----+----+----+---+-----+----+----+----+-----+----+----+----+----+----+-----+----+-----+-----+----+-----+----+----+----+----+-----+-----+\n", "only showing top 20 rows\n", @@ -1068,7 +1045,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "id": "b501c963-15c8-4341-b9c6-7d2f07cc5015", "metadata": {}, "outputs": [ @@ -1090,7 +1067,7 @@ }, { "cell_type": "code", - 
"execution_count": 50, + "execution_count": 11, "id": "2858a6da-ef1b-4561-b8e2-f93073b8e803", "metadata": {}, "outputs": [ @@ -1100,7 +1077,7 @@ "text": [ "root\n", " |-- name: string (nullable = true)\n", - " |-- datetime: date (nullable = true)\n", + " |-- datetime: string (nullable = true)\n", " |-- temp: double (nullable = true)\n", " |-- precip: double (nullable = true)\n", " |-- snow: double (nullable = true)\n", @@ -1228,7 +1205,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 12, "id": "bb54fa3a-399b-4b9d-b67d-6726a362cf9f", "metadata": {}, "outputs": [ @@ -1236,7 +1213,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[Stage 538:==========================================> (26 + 8) / 34]\r" + "[Stage 116:==================================================> (578 + 22) / 600]\r" ] }, { @@ -1276,7 +1253,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 13, "id": "4d80ab44-61f4-4ef0-9209-7c05710e9023", "metadata": {}, "outputs": [ @@ -1310,7 +1287,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 14, "id": "408996cf-b9ff-4f5c-b6c8-8a5211eb3a2c", "metadata": {}, "outputs": [ @@ -1349,7 +1326,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 15, "id": "e325a6ba-0463-43db-8275-5708fb3817bc", "metadata": {}, "outputs": [ @@ -1357,7 +1334,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[Stage 696:==============================================> (45 + 8) / 53]\r" + " \r" ] }, { @@ -1367,19 +1344,12 @@ 
"+-----+----+-----+----+-----+-----+-----+-----+-----+------+---+----+----+----+----+----+----+-----+----+----+----+----+-----+-----+-----+-----+-----+----+----+-----+----+----+-----+------+-----+----+----+----+---+----+----+-----+----+----+----+---+----+----+----+----+---+----+---+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+---+----+-----+-----+---------+----+------+----+---------+------+\n", "|month|year| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19| 20| 21| 22| 23| 24| 25| 26| 27| 28| 29| 30| 31| 32| 33| 34| 35| 36| 37| 38| 40| 43| 44| 45| 46| 47| 48| 49| 50| 51| 52| 53| 54| 55| 56| 57| 58| 59| 60| 61| 62| 63| 64| 65| 66| 67| 68| 69| 70| 71| 72| 73| 74| 75| 76| 77|area_sums|temp|precip|snow|snowdepth|sunset|\n", "+-----+----+-----+----+-----+-----+-----+-----+-----+------+---+----+----+----+----+----+----+-----+----+----+----+----+-----+-----+-----+-----+-----+----+----+-----+----+----+-----+------+-----+----+----+----+---+----+----+-----+----+----+----+---+----+----+----+----+---+----+---+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+---+----+-----+-----+---------+----+------+----+---------+------+\n", - "| 10|2021|12199|9107|24884|10934|12710|62904|55515|180042|674|3314|3259|1323|3077|7226|7209|10819|2898|1116|8828|3030|11178|34577|10307|59723|14084|3590|5678|99556|8672|7147|12736|104048|24877|3684|7049|1553|824|6866|3602|10986|8677|1848|3884|560|2047|6449|2113|2097|622|2949|898|326|15899|2444|4881|2795|5363|6014|1560|3048|1163|2742|5662|4618|5772|7720|3439|7108|1518|3534|550|2453|47198|16199| 35049|66.6| 0.0| 0.0| 0.0| 1824|\n", "| 7|2021| 9005|7260|17397| 7642| 7999|47541|33715|117551|521|2660|2410|1168|1936|5671|5780| 8063|2010| 913|6787|2356| 7659|21888| 7997|36890|12275|3120|4611|53566|7252|5243| 8074| 64260|16000|3827|5687|1408|671|5674|2866| 
9215|6354|1275|3264|461|1337|4831|1325|1588|472|2478|835|255|11814|1417|3425|1885|4057|4469|1062|2325| 917|2063|4523|3978|4877|6248|2526|5897|1059|2890|387|1875|36306|12384| 18251|66.6| 0.0| 0.0| 0.0| 2029|\n", + "| 10|2021|12199|9107|24884|10934|12710|62904|55515|180042|674|3314|3259|1323|3077|7226|7209|10819|2898|1116|8828|3030|11178|34577|10307|59723|14084|3590|5678|99556|8672|7147|12736|104048|24877|3684|7049|1553|824|6866|3602|10986|8677|1848|3884|560|2047|6449|2113|2097|622|2949|898|326|15899|2444|4881|2795|5363|6014|1560|3048|1163|2742|5662|4618|5772|7720|3439|7108|1518|3534|550|2453|47198|16199| 35049|66.6| 0.0| 0.0| 0.0| 1824|\n", "+-----+----+-----+----+-----+-----+-----+-----+-----+------+---+----+----+----+----+----+----+-----+----+----+----+----+-----+-----+-----+-----+-----+----+----+-----+----+----+-----+------+-----+----+----+----+---+----+----+-----+----+----+----+---+----+----+----+----+---+----+---+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+---+----+-----+-----+---------+----+------+----+---------+------+\n", "only showing top 2 rows\n", "\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] } ], "source": [ @@ -1389,7 +1359,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 16, "id": "a2e0d72f-0152-4cf8-83b4-777997cbac56", "metadata": {}, "outputs": [ @@ -1403,7 +1373,7 @@ ], "source": [ "# write completed data to a GCS bucket so we don't have to rerun things\n", - "merged_df.write.option(\"header\", \"true\").csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/supervised_dataset.csv\")" + "merged_df.write.option(\"header\", \"true\").csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/supervised_dataset_final.csv\")" ] }, { @@ -1411,11 +1381,14 @@ "id": "4e6dadc9-db41-4492-adaa-be633c0c4afa", "metadata": {}, "source": [ - "# ML Model Psuedo Code\n", + "# ML Model\n", "\n", - "Assumes 
a dataframe df_merged where the the columns are daily program ride counts followed by daily ride counts for other community areas. Then the daily weather columns with datetime dropped after the merge. \n", - "\n", - "I folowed fairly closely the ML from one of Ashish's notebooks. I didn't do cross validation yet because I wanted to know how long it takes to run a single regression. " + "1. Create datasets for the pre-program period (before Oct 2021) and for the period from Oct 2021 up to but not including July 2023.\n", + "2. Get Cross Validated Model Running\n", + "3. Train model on first dataset. Predict for October, November, December 2021\n", + "4. Plot predictions (dotted line for predictions, solid line for actual)\n", + "5. Train new model on second dataset\n", + "6. Plot for July, \n" ] }, { @@ -1434,12 +1407,41 @@ ], "source": [ "# skip all the above and just run this line to get the final dataset loaded in to use with the ML model\n", - "merged_df = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/supervised_dataset.csv\", inferSchema=True, header=True)" + "merged_df = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/rideshare/processed_data/supervised_dataset_final.csv\", inferSchema=True, header=True)" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 17, + "id": "fe839d92-0f0e-4b33-8a0a-a3e48ff8b0d7", + "metadata": {}, + "outputs": [], + "source": [ + "# this is the data for predicting the first policy change\n", + "df_1 = merged_df.filter((merged_df.year < 2021) | ((merged_df.year == 2021) & (merged_df.month < 10))) \n", + "# this is the data for predicting the second policy change\n", + "df_2 = merged_df.filter(((merged_df.year == 2021) & (merged_df.month >= 10)) | (merged_df.year == 2022) | ((merged_df.year == 2023) & (merged_df.month < 7)))\n", + "# this is the data after the second policy change\n", + "df_3 = merged_df.filter(((merged_df.year == 2023) & (merged_df.month >= 7)))"
+ ] + }, + { + "cell_type": "markdown", + "id": "49227805-612c-4ff6-b714-74f2de65f556", + "metadata": {}, + "source": [ + "## I'm trying to organize my thoughts about prediction here, hopefully this makes some sense.\n", + "\n", + "The way we've been thinking about this model is that it is predicting daily counts, so our predictions should be daily as well.\n", + "So I think I should take the rows from df_2 that are in 10,11,12 and make predictions for each one. Put those in a dataframe. Group by month and sum predicted rides.\n", + "Then take the actual sums from df_2, group by month and sum. Plot those against each other.\n", + "\n", + "I'm going to work on doing this without cross validation first, and then move to cross validation depending on time." + ] + }, + { + "cell_type": "code", + "execution_count": 19, "id": "975dadc7-cb0c-4e75-89fb-b73b8e7ba743", "metadata": {}, "outputs": [], @@ -1454,7 +1456,37 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, + "id": "ba889428-ef35-43d5-91d3-e49f1ef7bcad", + "metadata": {}, + "outputs": [], + "source": [ + "# make sure sunset is an integer so it works in the model\n", + "df_1 = df_1.withColumn(\"sunset\", F.col(\"sunset\").cast(\"int\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "8682ec74-9d84-454e-980c-074da8e08284", + "metadata": {}, + "outputs": [], + "source": [ + "df_2 = df_2.withColumn(\"sunset\", F.col(\"sunset\").cast(\"int\"))\n", + "df_3 = df_3.withColumn(\"sunset\", F.col(\"sunset\").cast(\"int\"))" + ] + }, + { + "cell_type": "markdown", + "id": "47d25a14-27b8-4290-abe7-3e5d8411a739", + "metadata": {}, + "source": [ + "This model will be for predicting the first policy change using df_1" + ] + }, + { + "cell_type": "code", + "execution_count": 26, "id": "e0b37ea7-3f52-4698-8d63-114d69164046", "metadata": {}, "outputs": [], @@ -1472,83 +1504,71 @@ "#train_df, test_df = df.randomSplit([0.7, 0.3],0.0)\n", "\n", "# splitting first and then doing vector
assembly to avoid errors\n", - "train_df, test_df = merged_df.randomSplit([.7,.3],seed=1234)\n", - "train_df = vectorAssembler.transform(merged_df)\n", - "test_df = vectorAssembler.transform(merged_df)" + "train_df, test_df = df_1.randomSplit([.7,.3],seed=1234)\n", + "\n", + "# originally harsh had the vector assembler transforming merged_df which I think was a mistake\n", + "train_df = vectorAssembler.transform(train_df)\n", + "test_df = vectorAssembler.transform(test_df)" ] }, { "cell_type": "code", - "execution_count": 38, - "id": "d8f33707-8e4b-423f-a9f3-ae1a86396725", + "execution_count": 27, + "id": "bd8be7aa-1cc7-4753-bf08-6142797364c3", "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "+-----+----+-----+-----+-----+-----+-----+------+------+------+----+----+----+----+----+-----+-----+-----+----+----+-----+----+-----+-----+-----+------+-----+----+----+------+-----+-----+-----+------+-----+-----+-----+----+----+-----+----+-----+-----+----+----+---+----+-----+----+----+---+----+----+---+-----+----+----+----+-----+-----+----+----+----+----+----+----+-----+-----+----+-----+----+----+----+----+-----+-----+---------+----+------+----+---------+------+--------------------+\n", - "|month|year| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19| 20| 21| 22| 23| 24| 25| 26| 27| 28| 29| 30| 31| 32| 33| 34| 35| 36| 37| 38| 40| 43| 44| 45| 46| 47| 48| 49| 50| 51| 52| 53| 54| 55| 56| 57| 58| 59| 60| 61| 62| 63| 64| 65| 66| 67| 68| 69| 70| 71| 72| 73| 74| 75| 76| 77|area_sums|temp|precip|snow|snowdepth|sunset| features|\n", - 
"+-----+----+-----+-----+-----+-----+-----+------+------+------+----+----+----+----+----+-----+-----+-----+----+----+-----+----+-----+-----+-----+------+-----+----+----+------+-----+-----+-----+------+-----+-----+-----+----+----+-----+----+-----+-----+----+----+---+----+-----+----+----+---+----+----+---+-----+----+----+----+-----+-----+----+----+----+----+----+----+-----+-----+----+-----+----+----+----+----+-----+-----+---------+----+------+----+---------+------+--------------------+\n", - "| 5|2023|20624|12186|43716|20834|24268|162621|124428|349861|1891|5961|5315|2420|3545|11996|11808|20304|4335|1883|12870|4582|23057|79210|15456|126355|22474|6185|8700|168618|12442| 9944|27138|163969|62533|14016|11585|2357|1704|10542|5737|16982|12335|2561|5906|827|2906|10091|2656|2934|966|5344|1495|451|15716|2894|6264|3833| 9832| 8138|2262|4335|2275|4104|8617|7700| 8655|12326|4414|12336|2659|5940|1177|4281|46058|29906| 69142|70.6| 0.04| 0.0| 0.0| 1955|[5.0,2023.0,20624...|\n", - "| 3|2022|14687|11577|27583|12893|13752| 66236| 53145|183453| 806|4062|4117|1644|3787| 9117| 9500|13253|3896|1587|11592|4124|13275|38027|14137| 62550|20880|5178|7796|110517|13492| 9435|14935|127734|29327| 5405| 9674|2186|1296| 9594|5521|15945|12546|2443|5746|984|2473| 9490|2786|3284|727|4526|1455|503|14315|3080|6199|3259| 6588| 8203|2049|4219|1605|3757|8415|7133| 9272|11558|4297|10821|1822|5049| 644|3306|49424|18553| 53400|32.5| 0.0| 0.0| 0.6| 1743|[3.0,2022.0,14687...|\n", - "| 2|2019|20787|15438|40744|21558|24225|111325| 96909|318159| 929|4641|4688|2196|5649|12778|11848|19664|4770|1629|14328|5488|20945|66953|17200|115609|20071|5028|9581|207877|14573|12685|24987|234357|33800| 8021|16379|2872|2127|13472|6667|17059|13425|2581|5697|930|2868|10446|2948|2686|850|3981|1025|502|17966|4362|9076|5138|12068|10928|2810|6052|2339|5122|9382|8360|10490|12661|5026|11440|2133|5427| 779|3490|47135|27578| 62289|43.4| 0.0| 0.0| 0.8| 178|[2.0,2019.0,20787...|\n", - "| 8|2021|10265| 7825|19319| 9169|10210| 53276| 45180|148170| 
619|2883|2777|1256|2275| 6134| 6327| 9156|2268|1098| 7498|2560| 8826|27317| 9234| 49074|14017|3608|5455| 75915| 8427| 5848|10170| 82074|20560| 4672| 7423|1523| 845| 6572|3402|10459| 7821|1630|3624|611|1670| 5650|1617|1869|494|2894| 873|268|13781|1950|4050|2232| 4781| 5411|1336|2607|1090|2371|5290|4519| 5513| 6982|2956| 6882|1437|3233| 435|2300|40044|13375| 22360|77.1| 0.0| 0.0| 0.0| 203|[8.0,2021.0,10265...|\n", - "| 11|2021|13303| 8395|28113|11644|13194| 92033| 66378|199483|1149|3613|3246|1488|2366| 8071| 7243|11571|2521|1004| 8151|2891|12715|41766| 9589| 68719|12257|3070|4350| 87701| 6175| 5995|14830| 88191|27226| 5414| 5787|1357| 707| 5792|3054| 9727| 6829|1222|3108|397|1429| 4835|1534|1474|469|2458| 746|285|16331|1673|3780|2345| 5188| 4594|1321|2824|1178|2406|4714|4063| 4586| 6621|2254| 5829|1365|2692| 778|2055|48696|18708| 34078|41.5| 0.0| 0.0| 0.0| 1744|[11.0,2021.0,1330...|\n", - "+-----+----+-----+-----+-----+-----+-----+------+------+------+----+----+----+----+----+-----+-----+-----+----+----+-----+----+-----+-----+-----+------+-----+----+----+------+-----+-----+-----+------+-----+-----+-----+----+----+-----+----+-----+-----+----+----+---+----+-----+----+----+---+----+----+---+-----+----+----+----+-----+-----+----+----+----+----+----+----+-----+-----+----+-----+----+----+----+----+-----+-----+---------+----+------+----+---------+------+--------------------+\n", - "only showing top 5 rows\n", - "\n" + "23/11/26 00:20:31 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS\n", + "23/11/26 00:20:31 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS\n", + " \r" ] }, { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - " \r" + "Coefficients: 
[-141.17821491897232,-745.3452416033055,0.04069589047839324,0.10523630495219816,0.002416215491090195,0.025009680242684473,0.015991560747858,-0.000809268481126041,-0.0003083326046310907,0.001719524462734702,-0.34154509015643614,-0.08238655427681342,-0.0,-0.0,0.779895783342685,0.05086694893622908,0.025653865341783114,0.024961895150708617,0.19582968856485264,0.2782530128506548,0.05452784089546298,0.17385879886338018,0.0187564547338862,0.003440291549253111,0.050322500767793425,0.0028592082630942497,-0.025445117595504444,0.0,0.14513263303240942,0.010301942556421948,0.02382458946533733,0.13529300450184567,0.0461338629846489,0.009441369903795505,0.01915890689447327,0.030501308888032312,0.16639747544742067,0.372961105529036,1.0630578240307724,0.15210946145832405,0.3574990068792217,-0.009678684217525414,0.05007541974459699,0.3291731701458939,-0.13681731725138238,1.8145586373952591,0.17587067975956647,0.10694274072585269,1.1307091745949909,-0.32397755445680976,0.7275761869829841,0.0,-4.3914935300680344,2.803775923420458,0.06630059546553475,0.6118236814711749,0.2231250018854385,0.3552694333131243,0.11002835175648276,0.19352160818980538,0.5766994392678659,0.34223581743721654,0.22754882093817158,0.3015178177562475,0.06567033036361472,0.05159515210734938,0.14042357457635468,0.05271453557563555,0.36247842456150564,0.0,-0.5013060546057359,0.10421665258028001,-0.4851376212929765,-0.03691767085960801,0.029653477467609077,0.014410898627229617,-18.758338427871184,534.5990989483481,1413.626713599575,-197.24250326523736,1.094024899581077]\n", + "Intercept: 1502177.818453082\n", + "RMSE: 3547.182496\n", + "r2: 0.965341\n" ] - } - ], - "source": [ - "train_df.show(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "bd8be7aa-1cc7-4753-bf08-6142797364c3", - "metadata": {}, - "outputs": [ + }, { "name": "stderr", "output_type": "stream", "text": [ - " \r" + "[Stage 448:====================================================> (19 + 1) / 20]\r" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ - "Coefficients: [-66.72841290960203,2096.320335854964,0.07737454071173874,0.035134305792045864,0.02137694958199008,0.0543922732894695,0.020625005325430445,0.0022373637819797197,0.0032653293629435007,0.0037847522795157107,0.04699003465926484,0.2534178310079206,0.2393992387806387,0.2968434471731084,0.6882574683561236,0.09583256594707623,0.06473861640623363,0.04318527565506878,0.5697475959137256,0.574714192173466,0.004413737015360228,0.028274619319269237,0.04111630162648094,0.008932993003971074,-0.013839727437326598,0.0013968887010071549,-0.0964834011376087,-0.30392121119556476,-0.038722182947819254,0.02048313918158278,-0.07767640509988266,0.06659548001348663,0.06074265323601359,0.012613139708165286,0.04981127572149861,0.09864269630223645,-0.015143088242105246,-0.1503136291113701,0.33173139285575554,-0.0172261863312499,0.22590390768630747,-0.023426737812655345,0.11547744828368202,0.5731906872508918,-0.08754040754088893,1.8322526823373078,0.03800847680519966,0.173872133558346,2.0806163697074402,0.5325202252129942,1.6821108759369527,0.30845171318047204,1.5226396725235583,12.347864399088158,0.13106118338815512,0.39207633377214685,-0.04000036888192165,-0.0225008361790106,0.003608515083226439,-0.055959138022776696,-0.05291477415454662,-0.025588456866900068,0.2844497998831903,-0.13434447564990956,-0.14620285259905053,-0.049103510458409334,0.0,0.10826560623822525,0.11727927561902557,0.054669499102892725,0.0,0.2929657321292788,0.6694017600382829,0.5385129182304533,0.13319120464051154,0.032883092922174464,-99.29774235141461,2658.4623994564195,4985.141235046448,-233.6913182121664,2.0731723198515137]\n", - "Intercept: -4244491.790612886\n", - "RMSE: 5753.785752\n", - "r2: 0.906565\nn", - "|features |area_sums|prediction |\nn", - 
"|[5.0,2023.0,20624.0,12186.0,43716.0,20834.0,24268.0,162621.0,124428.0,349861.0,1891.0,5961.0,5315.0,2420.0,3545.0,11996.0,11808.0,20304.0,4335.0,1883.0,12870.0,4582.0,23057.0,79210.0,15456.0,126355.0,22474.0,6185.0,8700.0,168618.0,12442.0,9944.0,27138.0,163969.0,62533.0,14016.0,11585.0,2357.0,1704.0,10542.0,5737.0,16982.0,12335.0,2561.0,5906.0,827.0,2906.0,10091.0,2656.0,2934.0,966.0,5344.0,1495.0,451.0,15716.0,2894.0,6264.0,3833.0,9832.0,8138.0,2262.0,4335.0,2275.0,4104.0,8617.0,7700.0,8655.0,12326.0,4414.0,12336.0,2659.0,5940.0,1177.0,4281.0,46058.0,29906.0,70.6,0.04,0.0,0.0,1955.0]|69142 |60879.9144059401 |\n", - "|[3.0,2022.0,14687.0,11577.0,27583.0,12893.0,13752.0,66236.0,53145.0,183453.0,806.0,4062.0,4117.0,1644.0,3787.0,9117.0,9500.0,13253.0,3896.0,1587.0,11592.0,4124.0,13275.0,38027.0,14137.0,62550.0,20880.0,5178.0,7796.0,110517.0,13492.0,9435.0,14935.0,127734.0,29327.0,5405.0,9674.0,2186.0,1296.0,9594.0,5521.0,15945.0,12546.0,2443.0,5746.0,984.0,2473.0,9490.0,2786.0,3284.0,727.0,4526.0,1455.0,503.0,14315.0,3080.0,6199.0,3259.0,6588.0,8203.0,2049.0,4219.0,1605.0,3757.0,8415.0,7133.0,9272.0,11558.0,4297.0,10821.0,1822.0,5049.0,644.0,3306.0,49424.0,18553.0,32.5,0.0,0.0,0.6,1743.0] |53400 |51058.53343227692 |\n", - "|[2.0,2019.0,20787.0,15438.0,40744.0,21558.0,24225.0,111325.0,96909.0,318159.0,929.0,4641.0,4688.0,2196.0,5649.0,12778.0,11848.0,19664.0,4770.0,1629.0,14328.0,5488.0,20945.0,66953.0,17200.0,115609.0,20071.0,5028.0,9581.0,207877.0,14573.0,12685.0,24987.0,234357.0,33800.0,8021.0,16379.0,2872.0,2127.0,13472.0,6667.0,17059.0,13425.0,2581.0,5697.0,930.0,2868.0,10446.0,2948.0,2686.0,850.0,3981.0,1025.0,502.0,17966.0,4362.0,9076.0,5138.0,12068.0,10928.0,2810.0,6052.0,2339.0,5122.0,9382.0,8360.0,10490.0,12661.0,5026.0,11440.0,2133.0,5427.0,779.0,3490.0,47135.0,27578.0,43.4,0.0,0.0,0.8,178.0] |62289 |51955.70422972739 |\n", - 
"|[8.0,2021.0,10265.0,7825.0,19319.0,9169.0,10210.0,53276.0,45180.0,148170.0,619.0,2883.0,2777.0,1256.0,2275.0,6134.0,6327.0,9156.0,2268.0,1098.0,7498.0,2560.0,8826.0,27317.0,9234.0,49074.0,14017.0,3608.0,5455.0,75915.0,8427.0,5848.0,10170.0,82074.0,20560.0,4672.0,7423.0,1523.0,845.0,6572.0,3402.0,10459.0,7821.0,1630.0,3624.0,611.0,1670.0,5650.0,1617.0,1869.0,494.0,2894.0,873.0,268.0,13781.0,1950.0,4050.0,2232.0,4781.0,5411.0,1336.0,2607.0,1090.0,2371.0,5290.0,4519.0,5513.0,6982.0,2956.0,6882.0,1437.0,3233.0,435.0,2300.0,40044.0,13375.0,77.1,0.0,0.0,0.0,203.0] |22360 |21939.092428480275|\n", - "|[11.0,2021.0,13303.0,8395.0,28113.0,11644.0,13194.0,92033.0,66378.0,199483.0,1149.0,3613.0,3246.0,1488.0,2366.0,8071.0,7243.0,11571.0,2521.0,1004.0,8151.0,2891.0,12715.0,41766.0,9589.0,68719.0,12257.0,3070.0,4350.0,87701.0,6175.0,5995.0,14830.0,88191.0,27226.0,5414.0,5787.0,1357.0,707.0,5792.0,3054.0,9727.0,6829.0,1222.0,3108.0,397.0,1429.0,4835.0,1534.0,1474.0,469.0,2458.0,746.0,285.0,16331.0,1673.0,3780.0,2345.0,5188.0,4594.0,1321.0,2824.0,1178.0,2406.0,4714.0,4063.0,4586.0,6621.0,2254.0,5829.0,1365.0,2692.0,778.0,2055.0,48696.0,18708.0,41.5,0.0,0.0,0.0,1744.0] |34078 |32300.32284839358 |\nnn", + "|features |area_sums|prediction |\nn", + "|[7.0,2021.0,9005.0,7260.0,17397.0,7642.0,7999.0,47541.0,33715.0,117551.0,521.0,2660.0,2410.0,1168.0,1936.0,5671.0,5780.0,8063.0,2010.0,913.0,6787.0,2356.0,7659.0,21888.0,7997.0,36890.0,12275.0,3120.0,4611.0,53566.0,7252.0,5243.0,8074.0,64260.0,16000.0,3827.0,5687.0,1408.0,671.0,5674.0,2866.0,9215.0,6354.0,1275.0,3264.0,461.0,1337.0,4831.0,1325.0,1588.0,472.0,2478.0,835.0,255.0,11814.0,1417.0,3425.0,1885.0,4057.0,4469.0,1062.0,2325.0,917.0,2063.0,4523.0,3978.0,4877.0,6248.0,2526.0,5897.0,1059.0,2890.0,387.0,1875.0,36306.0,12384.0,66.6,0.0,0.0,0.0,2029.0] |18251 |18633.118334107567|\n", + 
"|[11.0,2019.0,31301.0,20571.0,72163.0,34279.0,40909.0,214270.0,184080.0,553240.0,2052.0,7234.0,7293.0,3814.0,6445.0,18618.0,17940.0,30061.0,6311.0,2393.0,20390.0,8016.0,35837.0,124274.0,24830.0,210548.0,28231.0,7284.0,12059.0,313372.0,18903.0,17359.0,40780.0,317356.0,55213.0,15558.0,19109.0,3502.0,2606.0,16664.0,8029.0,23451.0,19175.0,3615.0,8133.0,1312.0,3933.0,13494.0,3497.0,3799.0,1276.0,5579.0,1630.0,890.0,32696.0,5663.0,12046.0,7249.0,17380.0,14692.0,4072.0,8155.0,3444.0,6762.0,13198.0,10621.0,12938.0,17027.0,7086.0,15616.0,3550.0,7423.0,1570.0,4810.0,77674.0,44762.0,40.4,0.003,0.0,0.0,1638.0]|82642 |77658.68311885605 |\n", + "|[11.0,2018.0,24441.0,15075.0,52214.0,23778.0,26500.0,177030.0,132956.0,359952.0,1817.0,5422.0,4540.0,2236.0,3735.0,14001.0,12197.0,21433.0,4340.0,1529.0,13116.0,5356.0,25214.0,92699.0,15159.0,143693.0,16025.0,3772.0,6588.0,157780.0,8952.0,10114.0,25784.0,161527.0,60265.0,12294.0,12096.0,2217.0,1700.0,10215.0,4856.0,14280.0,9544.0,1864.0,4154.0,488.0,2166.0,6643.0,1346.0,1924.0,676.0,2827.0,796.0,276.0,26145.0,2801.0,7425.0,4322.0,11516.0,8106.0,2490.0,4856.0,2313.0,4084.0,7322.0,5504.0,6204.0,8881.0,3517.0,8240.0,2502.0,3849.0,1464.0,2712.0,56067.0,33937.0,49.6,0.0,0.0,0.0,1744.0] |48053 |45113.59325499227 |\n", + "|[4.0,2021.0,13500.0,8516.0,26863.0,12093.0,13898.0,110488.0,87020.0,200307.0,1233.0,3432.0,3362.0,1559.0,2317.0,7982.0,8325.0,12108.0,2869.0,1202.0,9252.0,3202.0,11952.0,44393.0,11736.0,87212.0,18243.0,4644.0,6120.0,86325.0,9252.0,6761.0,13300.0,68326.0,20362.0,4862.0,6875.0,1657.0,924.0,7051.0,3768.0,12655.0,9209.0,1773.0,4661.0,605.0,1917.0,6639.0,1833.0,2033.0,553.0,3401.0,1073.0,257.0,7300.0,1808.0,4263.0,2628.0,5369.0,6158.0,1565.0,3205.0,1223.0,2857.0,6542.0,5606.0,6420.0,8774.0,2882.0,8396.0,1621.0,3721.0,696.0,2779.0,17087.0,18032.0,70.4,0.0,0.0,0.0,1923.0] |25550 |24605.566335394047|\n", + 
"|[9.0,2019.0,19539.0,13004.0,39713.0,20213.0,25689.0,127372.0,112930.0,355301.0,1189.0,4628.0,4699.0,2400.0,5147.0,11212.0,10800.0,19062.0,4815.0,1505.0,12736.0,4831.0,21310.0,68785.0,15359.0,123804.0,17553.0,4304.0,8337.0,211111.0,12235.0,11235.0,23736.0,244834.0,52115.0,9774.0,14359.0,2475.0,1710.0,11178.0,5128.0,14788.0,11767.0,2449.0,4980.0,875.0,2536.0,8806.0,2437.0,2584.0,913.0,3575.0,829.0,568.0,28341.0,4058.0,7966.0,4721.0,11188.0,9751.0,2651.0,4954.0,2468.0,4630.0,8627.0,6540.0,8567.0,10951.0,4686.0,9570.0,2237.0,4867.0,870.0,2990.0,72793.0,26796.0,64.9,0.0,0.0,0.0,1917.0] |46530 |53888.15131104388 |\nn", "only showing top 5 rows\n", "\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] } ], "source": [ - "# i'm not sure if train_df and test_df is right here, look into this\n", "train_df = train_df.na.drop() # Remove rows with null values\n", "\n", "# Train Model\n", @@ -1573,7 +1593,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 29, "id": "e904f1bc-2143-439c-a6fd-e30ef8cfac28", "metadata": {}, "outputs": [ @@ -1588,7 +1608,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "RMSE: 5753.786\n" + "RMSE: 4090.238\n" ] }, { @@ -1602,7 +1622,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "MSE: 33106050.484\n" + "MSE: 16730047.318\n" ] }, { @@ -1616,21 +1636,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "MAE: 4626.275\n" + "MAE: 3102.571\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "[Stage 65:===========================================> (6 + 2) / 8]\r" + "[Stage 538:===================================================> (392 + 8) / 400]\r" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "r2: 0.907\n" + "r2: 0.950\n" ] }, { @@ -1662,6 +1682,162 @@ "print(\"r2: %.3f\" %r2)" ] }, + { + "cell_type": "code", + "execution_count": 30, + "id": "296059d9-78af-4ac7-b53a-eee5fad98434", + "metadata": {}, + "outputs": [ + { + "name": "stderr", 
+ "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "# save model\n", + "model_path = \"gs://msca-bdp-student-gcs/bdp-rideshare-project/models/pre_program_model\"\n", + "lrm.save(model_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "8fecbaee-05af-4446-8809-89207ec651c4", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+----+-----+-----+-----+-----+-----+-----+-----+------+----+----+----+----+----+----+----+-----+----+----+-----+----+-----+-----+-----+-----+-----+----+----+------+-----+----+-----+------+-----+----+----+----+----+----+----+-----+-----+----+----+---+----+----+----+----+---+----+----+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+-----+----+-----+----+----+---+----+-----+-----+---------+----+------+----+---------+------+--------------------+------------------+\n", + "|month|year| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19| 20| 21| 22| 23| 24| 25| 26| 27| 28| 29| 30| 31| 32| 33| 34| 35| 36| 37| 38| 40| 43| 44| 45| 46| 47| 48| 49| 50| 51| 52| 53| 54| 55| 56| 57| 58| 59| 60| 61| 62| 63| 64| 65| 66| 67| 68| 69| 70| 71| 72| 73| 74| 75| 76| 77|area_sums|temp|precip|snow|snowdepth|sunset| features| prediction|\n", + "+-----+----+-----+-----+-----+-----+-----+-----+-----+------+----+----+----+----+----+----+----+-----+----+----+-----+----+-----+-----+-----+-----+-----+----+----+------+-----+----+-----+------+-----+----+----+----+----+----+----+-----+-----+----+----+---+----+----+----+----+---+----+----+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+-----+----+-----+----+----+---+----+-----+-----+---------+----+------+----+---------+------+--------------------+------------------+\n", + "| 12|2021|15334|11839|30581|14637|15938|84279|72666|232096|1016|4185|4262|1698|3693|9655|9731|14002|3570|1432|11428|4051|14444|44767|13778|75868|19517|4816|7073|118158|11234|9203|17681|126362|27419|5278|9174|2017|1188|9343|4944|14861|11699|2349|5020|848|2660|8718|2824|2832|644|4265|1237|468|17543|3150|6327|3647|6606|7771|2008|4083|1536|3817|7780|6562|8032|10688|4158|10085|1988|4732|744|3144|54636|21202| 44651|38.5| 0.193| 0.0| 0.0| 1619|[12.0,2021.0,1533...| 37059.28123582038|\n", + "| 12|2021|12578| 8378|26474|11249|12867|87859|63301|192686|1144|3749|2977|1382|2170|7802|6958|11260|2455| 925| 7978|2855|11668|38279| 
9505|64532|12372|3070|4251| 80562| 6243|6018|12932| 86370|27059|4747|5384|1316| 746|5475|3034| 9741| 6728|1242|3073|417|1476|4879|1568|1490|477|2489| 685|241|13603|1581|3740|2192|4932|4336|1325|2606|1195|2415|4725|3882|4391| 6555|2298| 6047|1317|2653|792|2003|40337|18290| 27989|43.2| 0.0| 0.0| 0.0| 1620|[12.0,2021.0,1257...|21605.461612534244|\n", + "+-----+----+-----+-----+-----+-----+-----+-----+-----+------+----+----+----+----+----+----+----+-----+----+----+-----+----+-----+-----+-----+-----+-----+----+----+------+-----+----+-----+------+-----+----+----+----+----+----+----+-----+-----+----+----+---+----+----+----+----+---+----+----+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+-----+----+-----+----+----+---+----+-----+-----+---------+----+------+----+---------+------+--------------------+------------------+\n", + "only showing top 2 rows\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+----+-----+-----+-----+-----+-----+-----+-----+------+----+----+----+----+----+----+----+-----+----+----+-----+----+-----+-----+-----+-----+-----+----+----+------+-----+----+-----+------+-----+----+----+----+----+----+----+-----+-----+----+----+---+----+----+----+----+---+----+----+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+-----+----+-----+----+----+---+----+-----+-----+---------+----+------+----+---------+------+\n", + "|month|year| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19| 20| 21| 22| 23| 24| 25| 26| 27| 28| 29| 30| 31| 32| 33| 34| 35| 36| 37| 38| 40| 43| 44| 45| 46| 47| 48| 49| 50| 51| 52| 53| 54| 55| 56| 57| 58| 59| 60| 61| 62| 63| 64| 65| 66| 67| 68| 69| 70| 71| 72| 73| 74| 75| 76| 77|area_sums|temp|precip|snow|snowdepth|sunset|\n", + 
"+-----+----+-----+-----+-----+-----+-----+-----+-----+------+----+----+----+----+----+----+----+-----+----+----+-----+----+-----+-----+-----+-----+-----+----+----+------+-----+----+-----+------+-----+----+----+----+----+----+----+-----+-----+----+----+---+----+----+----+----+---+----+----+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+-----+----+-----+----+----+---+----+-----+-----+---------+----+------+----+---------+------+\n", + "| 12|2021|15334|11839|30581|14637|15938|84279|72666|232096|1016|4185|4262|1698|3693|9655|9731|14002|3570|1432|11428|4051|14444|44767|13778|75868|19517|4816|7073|118158|11234|9203|17681|126362|27419|5278|9174|2017|1188|9343|4944|14861|11699|2349|5020|848|2660|8718|2824|2832|644|4265|1237|468|17543|3150|6327|3647|6606|7771|2008|4083|1536|3817|7780|6562|8032|10688|4158|10085|1988|4732|744|3144|54636|21202| 44651|38.5| 0.193| 0.0| 0.0| 1619|\n", + "| 12|2021|12578| 8378|26474|11249|12867|87859|63301|192686|1144|3749|2977|1382|2170|7802|6958|11260|2455| 925| 7978|2855|11668|38279| 9505|64532|12372|3070|4251| 80562| 6243|6018|12932| 86370|27059|4747|5384|1316| 746|5475|3034| 9741| 6728|1242|3073|417|1476|4879|1568|1490|477|2489| 685|241|13603|1581|3740|2192|4932|4336|1325|2606|1195|2415|4725|3882|4391| 6555|2298| 6047|1317|2653|792|2003|40337|18290| 27989|43.2| 0.0| 0.0| 0.0| 1620|\n", + "+-----+----+-----+-----+-----+-----+-----+-----+-----+------+----+----+----+----+----+----+----+-----+----+----+-----+----+-----+-----+-----+-----+-----+----+----+------+-----+----+-----+------+-----+----+----+----+----+----+----+-----+-----+----+----+---+----+----+----+----+---+----+----+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+-----+----+-----+----+----+---+----+-----+-----+---------+----+------+----+---------+------+\n", + "only showing top 2 rows\n", + "\n" + ] + } + ], + "source": [ + "# dataframe that is the true counts\n", + "df_real = df_2.filter(df_2.year == 2021)\n", + "\n", + "# take the real 
data and create predictions to compare\n", + "df_real_vector = vectorAssembler.transform(df_real)\n", + "df_first_predictions = lrm.transform(df_real_vector)\n", + "\n", + "df_first_predictions.show(2)\n", + "df_real.show(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "0c752d74-61ab-4c16-a1f6-62a5eebd3963", + "metadata": {}, + "outputs": [], + "source": [ + "# now group by month and sum counts and plot\n", + "monthly_real = df_real.groupBy('month').sum('area_sums')\n", + "monthly_first_preds = df_first_predictions.groupby('month').sum('prediction')" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "2aeece88-97ec-495b-b0fd-f26720250497", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+--------------+\n", + "|month|sum(area_sums)|\n", + "+-----+--------------+\n", + "| 12| 258103|\n", + "| 11| 275975|\n", + "+-----+--------------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+------------------+\n", + "|month| sum(prediction)|\n", + "+-----+------------------+\n", + "| 12|207578.42360277753|\n", + "| 11| 201316.3117565771|\n", + "+-----+------------------+\n", + "\n" + ] + } + ], + "source": [ + "monthly_real.show()\n", + "monthly_first_preds.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8dacd81e-05ea-4243-8370-b6b6b56fd75a", + "metadata": {}, + "outputs": [], + "source": [ + "monthly_real_pd = monthly_real.toPandas()\n", + "monthly_first_preds_pd = monthly_first_preds.toPandas()\n" + ] + }, { "cell_type": "markdown", "id": "4a8f0db1-4c39-4a3f-a340-303747b3ae29",