diff --git a/supervised_ml.ipynb b/supervised_ml.ipynb index 8d8c884..542c051 100644 --- a/supervised_ml.ipynb +++ b/supervised_ml.ipynb @@ -26,7 +26,6 @@ " ('spark.eventLog.enabled', 'true'),\n", " ('spark.submit.pyFiles',\n", " '/root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,/root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,/root/.ivy2/jars/com.typesafe_config-1.4.2.jar,/root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,/root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,/root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,/root/.ivy2/jars/com.navigamez_greex-1.0.jar,/root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,/root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,/root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,/root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,/root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,/root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,/root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,/root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,/root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,/root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,/root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,/root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,/root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,/root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,/root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,/root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,/root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,/root/.ivy2/jars/com.google.api_gax-2.20.1.jar,/root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,/root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,/root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,/root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,/root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,/root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,/root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,/root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,/root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,/root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,/root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,/root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,/root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,/root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,/root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,/root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,/root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,/root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,/root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,/root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,/root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,/root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", - " ('spark.driver.port', '41723'),\n", " ('spark.dataproc.sql.joinConditionReorder.enabled', 'true'),\n", " ('spark.kryoserializer.buffer.max', '2000M'),\n", " ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),\n", @@ -37,10 +36,13 @@ " 'org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter'),\n", " ('spark.metrics.namespace',\n", " 'app_name:${spark.app.name}.app_id:${spark.app.id}'),\n", + " ('spark.eventLog.dir',\n", + " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/52d961bd-30af-4617-a5c7-e483b5a04872/spark-job-history'),\n", " ('spark.executor.memory', '4g'),\n", " ('spark.dataproc.sql.optimizer.leftsemijoin.conversion.enabled', 'true'),\n", " ('spark.hadoop.hive.execution.engine', 'mr'),\n", " ('spark.executor.id', 'driver'),\n", + " ('spark.app.startTime', '1700841609746'),\n", " ('spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version', '2'),\n", " ('spark.dynamicAllocation.maxExecutors', '10000'),\n", " ('spark.yarn.historyServer.address',\n", @@ -52,17 +54,15 @@ " ('spark.sql.catalogImplementation', 'hive'),\n", " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',\n", " 'hub-msca-bdp-dphub-students-harshpachisia-m'),\n", + " ('spark.history.fs.logDirectory',\n", + " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/52d961bd-30af-4617-a5c7-e483b5a04872/spark-job-history'),\n", + " ('spark.ui.proxyBase', '/proxy/application_1700841465032_0001'),\n", " ('spark.executorEnv.OPENBLAS_NUM_THREADS', '1'),\n", " ('spark.yarn.secondary.jars',\n", " 'com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,com.typesafe_config-1.4.2.jar,org.rocksdb_rocksdbjni-6.29.5.jar,com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,com.github.universal-automata_liblevenshtein-3.0.0.jar,com.google.cloud_google-cloud-storage-2.16.0.jar,com.navigamez_greex-1.0.jar,com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,it.unimi.dsi_fastutil-7.0.12.jar,org.projectlombok_lombok-1.16.8.jar,com.google.guava_guava-31.1-jre.jar,com.google.guava_failureaccess-1.0.1.jar,com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,com.google.errorprone_error_prone_annotations-2.16.jar,com.google.j2objc_j2objc-annotations-1.3.jar,com.google.http-client_google-http-client-1.42.3.jar,io.opencensus_opencensus-contrib-http-util-0.31.1.jar,com.google.http-client_google-http-client-jackson2-1.42.3.jar,com.google.http-client_google-http-client-gson-1.42.3.jar,com.google.api-client_google-api-client-2.1.1.jar,commons-codec_commons-codec-1.15.jar,com.google.oauth-client_google-oauth-client-1.34.1.jar,com.google.http-client_google-http-client-apache-v2-1.42.3.jar,com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,com.google.code.gson_gson-2.10.jar,com.google.cloud_google-cloud-core-2.9.0.jar,com.google.auto.value_auto-value-annotations-1.10.1.jar,com.google.cloud_google-cloud-core-http-2.9.0.jar,com.google.http-client_google-http-client-appengine-1.42.3.jar,com.google.api_gax-httpjson-0.105.1.jar,com.google.cloud_google-cloud-core-grpc-2.9.0.jar,io.grpc_grpc-core-1.51.0.jar,com.google.api_gax-2.20.1.jar,com.google.api_gax-grpc-2.20.1.jar,io.grpc_grpc-alts-1.51.0.jar,io.grpc_grpc-grpclb-1.51.0.jar,org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,io.grpc_grpc-protobuf-1.51.0.jar,com.google.auth_google-auth-library-credentials-1.13.0.jar,com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,com.google.api_api-common-2.2.2.jar,javax.annotation_javax.annotation-api-1.3.2.jar,io.opencensus_opencensus-api-0.31.1.jar,io.grpc_grpc-context-1.51.0.jar,com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,com.google.protobuf_protobuf-java-3.21.10.jar,com.google.protobuf_protobuf-java-util-3.21.10.jar,com.google.api.grpc_proto-google-common-protos-2.11.0.jar,org.threeten_threetenbp-1.6.4.jar,com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,com.fasterxml.jackson.core_jackson-core-2.14.1.jar,com.google.code.findbugs_jsr305-3.0.2.jar,io.grpc_grpc-api-1.51.0.jar,io.grpc_grpc-auth-1.51.0.jar,io.grpc_grpc-stub-1.51.0.jar,org.checkerframework_checker-qual-3.28.0.jar,com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,io.grpc_grpc-protobuf-lite-1.51.0.jar,com.google.android_annotations-4.1.1.4.jar,org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,io.grpc_grpc-netty-shaded-1.51.0.jar,io.perfmark_perfmark-api-0.26.0.jar,io.grpc_grpc-googleapis-1.51.0.jar,io.grpc_grpc-xds-1.51.0.jar,io.opencensus_opencensus-proto-0.2.0.jar,io.grpc_grpc-services-1.51.0.jar,com.google.re2j_re2j-1.6.jar,dk.brics.automaton_automaton-1.11-8.jar,org.slf4j_slf4j-api-1.7.16.jar'),\n", " ('spark.repl.local.jars',\n", " 'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", " ('spark.sql.cbo.enabled', 'true'),\n", - " ('spark.app.startTime', '1700688666130'),\n", - " ('spark.history.fs.logDirectory',\n", - " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/746d6c22-b8c9-4995-8216-9ead3917af24/spark-job-history'),\n", - " ('spark.driver.appUIAddress',\n", - " 'http://hub-msca-bdp-dphub-students-harshpachisia-m.c.msca-bdp-student-ap.internal:40715'),\n", " ('spark.executorEnv.PYTHONPATH',\n", " '/usr/lib/spark/python/lib/py4j-0.10.9-src.zip:/usr/lib/spark/python/:{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip{{PWD}}/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar{{PWD}}/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar{{PWD}}/com.typesafe_config-1.4.2.jar{{PWD}}/org.rocksdb_rocksdbjni-6.29.5.jar{{PWD}}/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar{{PWD}}/com.github.universal-automata_liblevenshtein-3.0.0.jar{{PWD}}/com.google.cloud_google-cloud-storage-2.16.0.jar{{PWD}}/com.navigamez_greex-1.0.jar{{PWD}}/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar{{PWD}}/it.unimi.dsi_fastutil-7.0.12.jar{{PWD}}/org.projectlombok_lombok-1.16.8.jar{{PWD}}/com.google.guava_guava-31.1-jre.jar{{PWD}}/com.google.guava_failureaccess-1.0.1.jar{{PWD}}/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar{{PWD}}/com.google.errorprone_error_prone_annotations-2.16.jar{{PWD}}/com.google.j2objc_j2objc-annotations-1.3.jar{{PWD}}/com.google.http-client_google-http-client-1.42.3.jar{{PWD}}/io.opencensus_opencensus-contrib-http-util-0.31.1.jar{{PWD}}/com.google.http-client_google-http-client-jackson2-1.42.3.jar{{PWD}}/com.google.http-client_google-http-client-gson-1.42.3.jar{{PWD}}/com.google.api-client_google-api-client-2.1.1.jar{{PWD}}/commons-codec_commons-codec-1.15.jar{{PWD}}/com.google.oauth-client_google-oauth-client-1.34.1.jar{{PWD}}/com.google.http-client_google-http-client-apache-v2-1.42.3.jar{{PWD}}/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar{{PWD}}/com.google.code.gson_gson-2.10.jar{{PWD}}/com.google.cloud_google-cloud-core-2.9.0.jar{{PWD}}/com.google.auto.value_auto-value-annotations-1.10.1.jar{{PWD}}/com.google.cloud_google-cloud-core-http-2.9.0.jar{{PWD}}/com.google.http-client_google-http-client-appengine-1.42.3.jar{{PWD}}/com.google.api_gax-httpjson-0.105.1.jar{{PWD}}/com.google.cloud_google-cloud-core-grpc-2.9.0.jar{{PWD}}/io.grpc_grpc-core-1.51.0.jar{{PWD}}/com.google.api_gax-2.20.1.jar{{PWD}}/com.google.api_gax-grpc-2.20.1.jar{{PWD}}/io.grpc_grpc-alts-1.51.0.jar{{PWD}}/io.grpc_grpc-grpclb-1.51.0.jar{{PWD}}/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar{{PWD}}/io.grpc_grpc-protobuf-1.51.0.jar{{PWD}}/com.google.auth_google-auth-library-credentials-1.13.0.jar{{PWD}}/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar{{PWD}}/com.google.api_api-common-2.2.2.jar{{PWD}}/javax.annotation_javax.annotation-api-1.3.2.jar{{PWD}}/io.opencensus_opencensus-api-0.31.1.jar{{PWD}}/io.grpc_grpc-context-1.51.0.jar{{PWD}}/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar{{PWD}}/com.google.protobuf_protobuf-java-3.21.10.jar{{PWD}}/com.google.protobuf_protobuf-java-util-3.21.10.jar{{PWD}}/com.google.api.grpc_proto-google-common-protos-2.11.0.jar{{PWD}}/org.threeten_threetenbp-1.6.4.jar{{PWD}}/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.fasterxml.jackson.core_jackson-core-2.14.1.jar{{PWD}}/com.google.code.findbugs_jsr305-3.0.2.jar{{PWD}}/io.grpc_grpc-api-1.51.0.jar{{PWD}}/io.grpc_grpc-auth-1.51.0.jar{{PWD}}/io.grpc_grpc-stub-1.51.0.jar{{PWD}}/org.checkerframework_checker-qual-3.28.0.jar{{PWD}}/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar{{PWD}}/io.grpc_grpc-protobuf-lite-1.51.0.jar{{PWD}}/com.google.android_annotations-4.1.1.4.jar{{PWD}}/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar{{PWD}}/io.grpc_grpc-netty-shaded-1.51.0.jar{{PWD}}/io.perfmark_perfmark-api-0.26.0.jar{{PWD}}/io.grpc_grpc-googleapis-1.51.0.jar{{PWD}}/io.grpc_grpc-xds-1.51.0.jar{{PWD}}/io.opencensus_opencensus-proto-0.2.0.jar{{PWD}}/io.grpc_grpc-services-1.51.0.jar{{PWD}}/com.google.re2j_re2j-1.6.jar{{PWD}}/dk.brics.automaton_automaton-1.11-8.jar{{PWD}}/org.slf4j_slf4j-api-1.7.16.jar'),\n", " ('spark.driver.host',\n", @@ -78,31 +78,31 @@ " ('spark.executor.cores', '4'),\n", " ('spark.jars.packages',\n", " 'com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.0,graphframes:graphframes:0.8.2-spark3.1-s_2.12'),\n", + " ('spark.app.id', 'application_1700841465032_0001'),\n", + " ('spark.driver.appUIAddress',\n", + " 'http://hub-msca-bdp-dphub-students-harshpachisia-m.c.msca-bdp-student-ap.internal:39869'),\n", " ('spark.executor.instances', '2'),\n", " ('spark.dataproc.listeners',\n", " 'com.google.cloud.spark.performance.DataprocMetricsListener'),\n", " ('spark.sql.autoBroadcastJoinThreshold', '90m'),\n", " ('spark.serializer.objectStreamReset', '100'),\n", " ('spark.submit.deployMode', 'client'),\n", + " ('spark.driver.port', '40089'),\n", " ('spark.sql.cbo.joinReorder.enabled', 'true'),\n", " ('spark.shuffle.service.enabled', 'true'),\n", - " ('spark.app.id', 'application_1700688540052_0001'),\n", " ('spark.scheduler.mode', 'FAIR'),\n", - " ('spark.ui.proxyBase', '/proxy/application_1700688540052_0001'),\n", " ('spark.sql.adaptive.enabled', 'true'),\n", " ('spark.yarn.jars', 'local:/usr/lib/spark/jars/*'),\n", " ('spark.scheduler.minRegisteredResourcesRatio', '0.0'),\n", + " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',\n", + " 'http://hub-msca-bdp-dphub-students-harshpachisia-m:8088/proxy/application_1700841465032_0001'),\n", " ('spark.master', 'yarn'),\n", " ('spark.ui.port', '0'),\n", - " ('spark.eventLog.dir',\n", - " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/746d6c22-b8c9-4995-8216-9ead3917af24/spark-job-history'),\n", " ('spark.rpc.message.maxSize', '512'),\n", " ('spark.rdd.compress', 'True'),\n", " ('spark.dataproc.metrics.listener.metrics.collector.hostname',\n", " 'hub-msca-bdp-dphub-students-harshpachisia-m'),\n", " ('spark.task.maxFailures', '10'),\n", - " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',\n", - " 'http://hub-msca-bdp-dphub-students-harshpachisia-m:8088/proxy/application_1700688540052_0001'),\n", " ('spark.yarn.isPython', 'true'),\n", " ('spark.dynamicAllocation.enabled', 'true'),\n", " ('spark.ui.showConsoleProgress', 'true')]" @@ -140,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "id": "8bfe115e-abb4-4a36-8508-1bd17ce2c55c", "metadata": {}, "outputs": [ @@ -188,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "18e30586-4bdd-4217-b55d-e41522df062b", "metadata": {}, "outputs": [ @@ -196,14 +196,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "Partitions: 534\n" + "Partitions: 544\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "[Stage 31:=====================================================>(532 + 2) / 534]\r" + "[Stage 32:=====================================================>(543 + 1) / 544]\r" ] }, { @@ -213,307 +213,327 @@ "+-----------+------+\n", "|partitionId| count|\n", "+-----------+------+\n", - "| 33|152646|\n", - "| 233|328837|\n", - "| 232|328975|\n", - "| 231|329131|\n", - "| 230|329163|\n", - "| 229|329209|\n", - "| 227|329245|\n", - "| 228|329263|\n", - "| 225|329263|\n", - "| 224|329311|\n", - "| 226|329315|\n", - "| 222|329332|\n", - "| 223|329344|\n", - "| 221|329373|\n", - "| 218|329389|\n", - "| 219|329390|\n", - "| 217|329399|\n", - "| 216|329410|\n", - "| 215|329410|\n", - "| 214|329418|\n", - "| 220|329427|\n", - "| 213|329428|\n", - "| 210|329461|\n", - "| 212|329481|\n", - "| 211|329505|\n", - "| 207|329507|\n", - "| 208|329513|\n", - "| 209|329519|\n", - "| 206|329523|\n", - "| 204|329533|\n", - "| 203|329555|\n", - "| 205|329574|\n", - "| 201|329587|\n", - "| 202|329591|\n", - "| 198|329607|\n", - "| 200|329623|\n", - "| 196|329624|\n", - "| 199|329630|\n", - "| 197|329633|\n", - "| 195|329646|\n", - "| 192|329654|\n", - "| 194|329673|\n", - "| 193|329678|\n", - "| 184|329704|\n", - "| 191|329708|\n", - "| 190|329712|\n", - "| 181|329717|\n", - "| 179|329728|\n", - "| 178|329730|\n", - "| 189|329732|\n", - "| 183|329732|\n", - "| 188|329739|\n", - "| 180|329746|\n", - "| 185|329748|\n", - "| 187|329786|\n", + "| 42|305254|\n", + "| 41|305316|\n", + "| 40|305420|\n", + "| 38|305471|\n", + "| 39|305480|\n", + "| 37|305618|\n", + "| 36|305676|\n", + "| 35|305871|\n", + "| 34|305890|\n", + "| 33|305962|\n", + "| 32|305971|\n", + "| 31|306010|\n", + "| 29|306031|\n", + "| 30|306038|\n", + "| 28|306086|\n", + "| 27|306127|\n", + "| 26|306402|\n", + "| 25|306467|\n", + "| 24|306633|\n", + "| 23|306731|\n", + "| 22|307226|\n", + "| 243|328837|\n", + "| 242|328975|\n", + "| 241|329131|\n", + "| 240|329163|\n", + "| 239|329209|\n", + "| 237|329245|\n", + "| 235|329263|\n", + "| 238|329263|\n", + "| 234|329311|\n", + "| 236|329315|\n", + "| 232|329332|\n", + "| 233|329344|\n", + "| 231|329373|\n", + "| 228|329389|\n", + "| 229|329390|\n", + "| 227|329399|\n", + "| 226|329410|\n", + "| 225|329410|\n", + "| 224|329418|\n", + "| 230|329427|\n", + "| 223|329428|\n", + "| 220|329461|\n", + "| 222|329481|\n", + "| 221|329505|\n", + "| 217|329507|\n", + "| 218|329513|\n", + "| 219|329519|\n", + "| 216|329523|\n", + "| 214|329533|\n", + "| 213|329555|\n", + "| 215|329574|\n", + "| 211|329587|\n", + "| 212|329591|\n", + "| 208|329607|\n", + "| 210|329623|\n", + "| 206|329624|\n", + "| 209|329630|\n", + "| 207|329633|\n", + "| 205|329646|\n", + "| 202|329654|\n", + "| 204|329673|\n", + "| 203|329678|\n", + "| 194|329704|\n", + "| 201|329708|\n", + "| 200|329712|\n", + "| 191|329717|\n", + "| 189|329728|\n", + "| 188|329730|\n", + "| 193|329732|\n", + "| 199|329732|\n", + "| 198|329739|\n", + "| 190|329746|\n", + "| 195|329748|\n", + "| 197|329786|\n", + "| 196|329786|\n", "| 186|329786|\n", - "| 176|329786|\n", - "| 177|329787|\n", - "| 182|329793|\n", - "| 173|329804|\n", - "| 174|329805|\n", - "| 169|329807|\n", - "| 172|329814|\n", - "| 171|329819|\n", - "| 175|329843|\n", - "| 168|329849|\n", - "| 167|329861|\n", - "| 170|329865|\n", - "| 162|329866|\n", - "| 163|329871|\n", - "| 161|329879|\n", - "| 165|329889|\n", - "| 164|329892|\n", - "| 166|329900|\n", - "| 160|329937|\n", - "| 158|329956|\n", - "| 152|329961|\n", - "| 157|329962|\n", - "| 154|329965|\n", - "| 159|329974|\n", - "| 155|329980|\n", - "| 153|329985|\n", - "| 149|329993|\n", - "| 156|330000|\n", - "| 148|330005|\n", - "| 151|330014|\n", - "| 147|330021|\n", - "| 150|330027|\n", - "| 146|330059|\n", - "| 144|330066|\n", - "| 137|330070|\n", - "| 143|330083|\n", - "| 140|330084|\n", - "| 145|330084|\n", - "| 136|330089|\n", - "| 142|330090|\n", - "| 139|330099|\n", - "| 141|330101|\n", - "| 138|330103|\n", - "| 134|330116|\n", - "| 135|330129|\n", - "| 130|330133|\n", - "| 133|330135|\n", - "| 131|330139|\n", - "| 132|330148|\n", - "| 129|330159|\n", - "| 125|330178|\n", - "| 127|330181|\n", - "| 122|330186|\n", - "| 128|330189|\n", - "| 126|330205|\n", - "| 119|330208|\n", - "| 123|330219|\n", - "| 115|330220|\n", - "| 118|330227|\n", - "| 124|330230|\n", - "| 117|330236|\n", - "| 120|330243|\n", - "| 114|330245|\n", - "| 107|330249|\n", - "| 121|330252|\n", - "| 111|330255|\n", - "| 112|330258|\n", - "| 116|330268|\n", - "| 113|330270|\n", - "| 108|330275|\n", - "| 105|330282|\n", - "| 103|330282|\n", - "| 110|330296|\n", - "| 102|330297|\n", - "| 109|330306|\n", - "| 104|330308|\n", - "| 94|330338|\n", - "| 106|330339|\n", - "| 99|330355|\n", - "| 95|330363|\n", - "| 101|330366|\n", - "| 100|330372|\n", - "| 98|330383|\n", - "| 97|330385|\n", - "| 96|330402|\n", - "| 86|330413|\n", - "| 92|330415|\n", - "| 90|330418|\n", - "| 88|330418|\n", - "| 91|330419|\n", - "| 84|330425|\n", - "| 87|330426|\n", - "| 89|330430|\n", - "| 93|330430|\n", - "| 85|330436|\n", - "| 80|330456|\n", - "| 82|330470|\n", - "| 78|330471|\n", - "| 81|330474|\n", - "| 83|330477|\n", - "| 77|330494|\n", - "| 76|330501|\n", - "| 79|330512|\n", - "| 74|330524|\n", - "| 72|330550|\n", - "| 70|330570|\n", - "| 75|330570|\n", - "| 71|330579|\n", - "| 73|330585|\n", - "| 68|330622|\n", - "| 69|330625|\n", - "| 66|330625|\n", - "| 65|330642|\n", - "| 67|330646|\n", - "| 61|330651|\n", - "| 64|330653|\n", - "| 63|330667|\n", - "| 60|330690|\n", - "| 55|330704|\n", - "| 62|330709|\n", - "| 57|330721|\n", - "| 56|330724|\n", - "| 59|330737|\n", - "| 52|330758|\n", - "| 54|330762|\n", - "| 53|330762|\n", - "| 58|330766|\n", - "| 50|330782|\n", - "| 49|330784|\n", - "| 46|330801|\n", - "| 47|330805|\n", - "| 51|330807|\n", - "| 48|330837|\n", - "| 43|330868|\n", - "| 45|330869|\n", - "| 44|330885|\n", - "| 42|330918|\n", - "| 40|330944|\n", - "| 41|330963|\n", - "| 39|331028|\n", - "| 38|331034|\n", - "| 37|331050|\n", - "| 36|331114|\n", - "| 35|331284|\n", - "| 34|331416|\n", - "| 533|364094|\n", - "| 532|364374|\n", - "| 531|364493|\n", - "| 527|364581|\n", - "| 528|364599|\n", - "| 529|364616|\n", - "| 530|364617|\n", - "| 526|364654|\n", - "| 524|364709|\n", - "| 525|364756|\n", - "| 522|364784|\n", - "| 523|364810|\n", - "| 519|364899|\n", - "| 520|364903|\n", - "| 521|364944|\n", - "| 518|364957|\n", - "| 517|364961|\n", - "| 514|364971|\n", - "| 515|364988|\n", - "| 516|365006|\n", - "| 512|365011|\n", - "| 513|365051|\n", - "| 511|365057|\n", - "| 510|365079|\n", - "| 508|365083|\n", - "| 507|365090|\n", - "| 509|365097|\n", - "| 506|365122|\n", - "| 504|365165|\n", - "| 505|365179|\n", - "| 503|365224|\n", - "| 499|365252|\n", - "| 496|365253|\n", - "| 501|365255|\n", - "| 498|365272|\n", - "| 500|365277|\n", - "| 502|365278|\n", - "| 497|365302|\n", - "| 495|365347|\n", - "| 492|365377|\n", - "| 493|365394|\n", - "| 494|365395|\n", - "| 491|365409|\n", - "| 490|365431|\n", - "| 488|365447|\n", - "| 489|365454|\n", - "| 487|365519|\n", - "| 486|365528|\n", - "| 485|365536|\n", - "| 482|365541|\n", - "| 479|365547|\n", - "| 478|365552|\n", - "| 477|365554|\n", - "| 480|365569|\n", - "| 483|365574|\n", - "| 474|365576|\n", - "| 484|365595|\n", - "| 475|365602|\n", - "| 481|365622|\n", - "| 476|365622|\n", - "| 473|365650|\n", - "| 472|365684|\n", - "| 471|365705|\n", - "| 469|365750|\n", - "| 468|365773|\n", - "| 467|365793|\n", - "| 470|365801|\n", - "| 464|365806|\n", - "| 465|365806|\n", - "| 463|365828|\n", - "| 466|365846|\n", - "| 462|365909|\n", - "| 461|365965|\n", - "| 460|365975|\n", - "| 459|366026|\n", - "| 456|366051|\n", - "| 457|366057|\n", - "| 458|366080|\n", - "| 454|366105|\n", - "| 455|366117|\n", - "| 452|366150|\n", - "| 453|366160|\n", - "| 448|366193|\n", - "| 451|366200|\n", - "| 450|366214|\n", - "| 449|366217|\n", - "| 446|366297|\n", - "| 447|366320|\n", - "| 445|366371|\n", - "| 444|366383|\n", - "| 443|366422|\n", - "| 442|366461|\n", - "| 441|366589|\n", - "| 440|366617|\n", - "| 439|366758|\n", - "| 438|366799|\n", - "| 437|366883|\n", - "| 436|366901|\n", - "| 435|366940|\n", - "| 434|367122|\n", + "| 187|329787|\n", + "| 192|329793|\n", + "| 183|329804|\n", + "| 184|329805|\n", + "| 179|329807|\n", + "| 182|329814|\n", + "| 181|329819|\n", + "| 185|329843|\n", + "| 178|329849|\n", + "| 177|329861|\n", + "| 180|329865|\n", + "| 172|329866|\n", + "| 173|329871|\n", + "| 171|329879|\n", + "| 175|329889|\n", + "| 174|329892|\n", + "| 176|329900|\n", + "| 170|329937|\n", + "| 168|329956|\n", + "| 162|329961|\n", + "| 167|329962|\n", + "| 164|329965|\n", + "| 169|329974|\n", + "| 165|329980|\n", + "| 163|329985|\n", + "| 159|329993|\n", + "| 166|330000|\n", + "| 158|330005|\n", + "| 161|330014|\n", + "| 157|330021|\n", + "| 160|330027|\n", + "| 156|330059|\n", + "| 154|330066|\n", + "| 147|330070|\n", + "| 153|330083|\n", + "| 155|330084|\n", + "| 150|330084|\n", + "| 146|330089|\n", + "| 152|330090|\n", + "| 149|330099|\n", + "| 151|330101|\n", + "| 148|330103|\n", + "| 144|330116|\n", + "| 145|330129|\n", + "| 140|330133|\n", + "| 143|330135|\n", + "| 141|330139|\n", + "| 142|330148|\n", + "| 139|330159|\n", + "| 135|330178|\n", + "| 137|330181|\n", + "| 132|330186|\n", + "| 138|330189|\n", + "| 136|330205|\n", + "| 129|330208|\n", + "| 133|330219|\n", + "| 125|330220|\n", + "| 128|330227|\n", + "| 134|330230|\n", + "| 127|330236|\n", + "| 130|330243|\n", + "| 124|330245|\n", + "| 117|330249|\n", + "| 131|330252|\n", + "| 121|330255|\n", + "| 122|330258|\n", + "| 126|330268|\n", + "| 123|330270|\n", + "| 118|330275|\n", + "| 115|330282|\n", + "| 113|330282|\n", + "| 120|330296|\n", + "| 112|330297|\n", + "| 119|330306|\n", + "| 114|330308|\n", + "| 104|330338|\n", + "| 116|330339|\n", + "| 109|330355|\n", + "| 105|330363|\n", + "| 111|330366|\n", + "| 110|330372|\n", + "| 108|330383|\n", + "| 107|330385|\n", + "| 106|330402|\n", + "| 96|330413|\n", + "| 102|330415|\n", + "| 100|330418|\n", + "| 98|330418|\n", + "| 101|330419|\n", + "| 94|330425|\n", + "| 97|330426|\n", + "| 103|330430|\n", + "| 99|330430|\n", + "| 95|330436|\n", + "| 90|330456|\n", + "| 92|330470|\n", + "| 88|330471|\n", + "| 91|330474|\n", + "| 93|330477|\n", + "| 87|330494|\n", + "| 86|330501|\n", + "| 89|330512|\n", + "| 84|330524|\n", + "| 82|330550|\n", + "| 80|330570|\n", + "| 85|330570|\n", + "| 81|330579|\n", + "| 83|330585|\n", + "| 78|330622|\n", + "| 79|330625|\n", + "| 76|330625|\n", + "| 75|330642|\n", + "| 77|330646|\n", + "| 71|330651|\n", + "| 74|330653|\n", + "| 73|330667|\n", + "| 70|330690|\n", + "| 65|330704|\n", + "| 72|330709|\n", + "| 67|330721|\n", + "| 66|330724|\n", + "| 69|330737|\n", + "| 62|330758|\n", + "| 63|330762|\n", + "| 64|330762|\n", + "| 68|330766|\n", + "| 60|330782|\n", + "| 59|330784|\n", + "| 56|330801|\n", + "| 57|330805|\n", + "| 61|330807|\n", + "| 58|330837|\n", + "| 53|330868|\n", + "| 55|330869|\n", + "| 54|330885|\n", + "| 52|330918|\n", + "| 50|330944|\n", + "| 51|330963|\n", + "| 49|331028|\n", + "| 48|331034|\n", + "| 47|331050|\n", + "| 46|331114|\n", + "| 45|331284|\n", + "| 44|331416|\n", + "| 543|364094|\n", + "| 542|364374|\n", + "| 541|364493|\n", + "| 537|364581|\n", + "| 538|364599|\n", + "| 539|364616|\n", + "| 540|364617|\n", + "| 536|364654|\n", + "| 534|364709|\n", + "| 535|364756|\n", + "| 532|364784|\n", + "| 533|364810|\n", + "| 529|364899|\n", + "| 530|364903|\n", + "| 531|364944|\n", + "| 528|364957|\n", + "| 527|364961|\n", + "| 524|364971|\n", + "| 525|364988|\n", + "| 526|365006|\n", + "| 522|365011|\n", + "| 523|365051|\n", + "| 521|365057|\n", + "| 520|365079|\n", + "| 518|365083|\n", + "| 517|365090|\n", + "| 519|365097|\n", + "| 516|365122|\n", + "| 514|365165|\n", + "| 515|365179|\n", + "| 513|365224|\n", + "| 509|365252|\n", + "| 506|365253|\n", + "| 511|365255|\n", + "| 508|365272|\n", + "| 510|365277|\n", + "| 512|365278|\n", + "| 507|365302|\n", + "| 505|365347|\n", + "| 502|365377|\n", + "| 503|365394|\n", + "| 504|365395|\n", + "| 501|365409|\n", + "| 500|365431|\n", + "| 498|365447|\n", + "| 499|365454|\n", + "| 497|365519|\n", + "| 496|365528|\n", + "| 495|365536|\n", + "| 492|365541|\n", + "| 489|365547|\n", + "| 488|365552|\n", + "| 487|365554|\n", + "| 490|365569|\n", + "| 493|365574|\n", + "| 484|365576|\n", + "| 494|365595|\n", + "| 485|365602|\n", + "| 486|365622|\n", + "| 491|365622|\n", + "| 483|365650|\n", + "| 482|365684|\n", + "| 481|365705|\n", + "| 479|365750|\n", + "| 478|365773|\n", + "| 477|365793|\n", + "| 480|365801|\n", + "| 475|365806|\n", + "| 474|365806|\n", + "| 473|365828|\n", + "| 476|365846|\n", + "| 472|365909|\n", + "| 471|365965|\n", + "| 470|365975|\n", + "| 469|366026|\n", + "| 466|366051|\n", + "| 467|366057|\n", + "| 468|366080|\n", + "| 464|366105|\n", + "| 465|366117|\n", + "| 462|366150|\n", + "| 463|366160|\n", + "| 458|366193|\n", + "| 461|366200|\n", + "| 460|366214|\n", + "| 459|366217|\n", + "| 456|366297|\n", + "| 457|366320|\n", + "| 455|366371|\n", + "| 454|366383|\n", + "| 453|366422|\n", + "| 452|366461|\n", + "| 451|366589|\n", + "| 450|366617|\n", + "| 449|366758|\n", + "| 448|366799|\n", + "| 447|366883|\n", + "| 446|366901|\n", + "| 445|366940|\n", + "| 444|367122|\n", "| 21|380513|\n", "| 20|380565|\n", "| 19|380749|\n", @@ -536,217 +556,207 @@ "| 1|382022|\n", "| 2|382029|\n", "| 0|382095|\n", - "| 332|420259|\n", - "| 333|420346|\n", - "| 331|420485|\n", - "| 330|420525|\n", - "| 329|420707|\n", - "| 326|421031|\n", - "| 327|421040|\n", - "| 328|421052|\n", - "| 324|421107|\n", - "| 325|421142|\n", - "| 323|421374|\n", - "| 320|421440|\n", - "| 322|421479|\n", - "| 321|421531|\n", - "| 317|421574|\n", - "| 318|421603|\n", - "| 319|421610|\n", - "| 316|421612|\n", - "| 312|421670|\n", - "| 310|421675|\n", - "| 315|421679|\n", - "| 314|421681|\n", - "| 313|421687|\n", - "| 309|421687|\n", - "| 311|421699|\n", - "| 308|421751|\n", - "| 305|421832|\n", - "| 300|421867|\n", - "| 306|421897|\n", - "| 302|421903|\n", - "| 307|421911|\n", - "| 304|421918|\n", - "| 303|421920|\n", - "| 301|421950|\n", - "| 299|421972|\n", - "| 297|421988|\n", - "| 298|422019|\n", - "| 295|422072|\n", - "| 293|422083|\n", - "| 296|422091|\n", - "| 294|422095|\n", - "| 292|422097|\n", - "| 288|422103|\n", - "| 290|422114|\n", - "| 291|422116|\n", - "| 285|422134|\n", - "| 289|422155|\n", - "| 286|422155|\n", - "| 280|422185|\n", - "| 287|422193|\n", - "| 284|422194|\n", - "| 282|422207|\n", - "| 281|422218|\n", - "| 283|422236|\n", - "| 278|422238|\n", - "| 276|422255|\n", - "| 279|422265|\n", - "| 277|422266|\n", - "| 275|422305|\n", - "| 273|422307|\n", - "| 274|422346|\n", - "| 272|422350|\n", - "| 271|422354|\n", - "| 270|422372|\n", - "| 269|422415|\n", - "| 268|422498|\n", - "| 267|422501|\n", - "| 266|422508|\n", - "| 265|422549|\n", - "| 264|422557|\n", - "| 263|422591|\n", - "| 262|422625|\n", - "| 260|422634|\n", - "| 259|422671|\n", - "| 258|422673|\n", - "| 261|422692|\n", - "| 257|422694|\n", - "| 255|422761|\n", - "| 252|422777|\n", - "| 250|422788|\n", - "| 253|422795|\n", - "| 256|422803|\n", - "| 254|422807|\n", - "| 248|422838|\n", - "| 249|422839|\n", - "| 251|422841|\n", - "| 247|422852|\n", - "| 246|422891|\n", - "| 242|422904|\n", - "| 245|422925|\n", - "| 244|422986|\n", - "| 243|423003|\n", - "| 240|423197|\n", - "| 241|423202|\n", - "| 238|423231|\n", - "| 239|423262|\n", - "| 236|423376|\n", - "| 237|423402|\n", - "| 235|423403|\n", - "| 234|423762|\n", - "| 433|569570|\n", - "| 432|570154|\n", - "| 431|570301|\n", - "| 430|570372|\n", - "| 429|570572|\n", - "| 428|570655|\n", - "| 426|570763|\n", - "| 427|570781|\n", - "| 424|570870|\n", - "| 425|570872|\n", - "| 423|570953|\n", - "| 422|570979|\n", - "| 421|571069|\n", - "| 419|571096|\n", - "| 420|571097|\n", - "| 418|571127|\n", - "| 417|571153|\n", - "| 416|571185|\n", - "| 415|571201|\n", - "| 414|571286|\n", - "| 413|571425|\n", - "| 412|571449|\n", - "| 407|571506|\n", - "| 410|571528|\n", - "| 411|571532|\n", - "| 409|571553|\n", - "| 408|571585|\n", - "| 406|571595|\n", - "| 404|571645|\n", - "| 405|571657|\n", - "| 403|571742|\n", - "| 402|571766|\n", - "| 401|571796|\n", - "| 399|571842|\n", - "| 400|571847|\n", - "| 397|571874|\n", - "| 398|571913|\n", - "| 396|571925|\n", - "| 395|571966|\n", - "| 394|571983|\n", - "| 392|571993|\n", - "| 393|572020|\n", - "| 391|572123|\n", - "| 387|572181|\n", - "| 390|572182|\n", - "| 389|572183|\n", - "| 388|572189|\n", - "| 386|572212|\n", - "| 385|572244|\n", - "| 383|572249|\n", - "| 384|572276|\n", - "| 382|572302|\n", - "| 381|572344|\n", - "| 380|572361|\n", - "| 379|572382|\n", - "| 378|572394|\n", - "| 377|572428|\n", - "| 376|572438|\n", - "| 375|572493|\n", - "| 373|572545|\n", - "| 374|572565|\n", - "| 372|572569|\n", - "| 371|572600|\n", - "| 370|572604|\n", - "| 369|572634|\n", - "| 368|572647|\n", - "| 366|572742|\n", - "| 365|572742|\n", - "| 367|572755|\n", - "| 364|572798|\n", - "| 362|572800|\n", - "| 363|572816|\n", - "| 361|572868|\n", - "| 360|572895|\n", - "| 359|572907|\n", - "| 358|572924|\n", - "| 357|572957|\n", - "| 356|573022|\n", - "| 354|573102|\n", - "| 355|573104|\n", - "| 352|573132|\n", - "| 353|573145|\n", - "| 351|573173|\n", - "| 350|573187|\n", - "| 348|573262|\n", - "| 349|573270|\n", - "| 347|573334|\n", - "| 346|573372|\n", - "| 345|573425|\n", - "| 344|573556|\n", - "| 343|573584|\n", - "| 342|573658|\n", - "| 341|573676|\n", - "| 340|573781|\n", - "| 339|573977|\n", - "| 337|574013|\n", - "| 338|574040|\n", - "| 336|574185|\n", - "| 335|574318|\n", - "| 334|574727|\n", - "| 32|610310|\n", - "| 31|610736|\n", - "| 30|610951|\n", - "| 29|611294|\n", - "| 28|611761|\n", - "| 27|611933|\n", - "| 26|612048|\n", - "| 25|612117|\n", - "| 24|612529|\n", - "| 23|613100|\n", - "| 22|613957|\n", + "| 342|420259|\n", + "| 343|420346|\n", + "| 341|420485|\n", + "| 340|420525|\n", + "| 339|420707|\n", + "| 336|421031|\n", + "| 337|421040|\n", + "| 338|421052|\n", + "| 334|421107|\n", + "| 335|421142|\n", + "| 333|421374|\n", + "| 330|421440|\n", + "| 332|421479|\n", + "| 331|421531|\n", + "| 327|421574|\n", + "| 328|421603|\n", + "| 329|421610|\n", + "| 326|421612|\n", + "| 322|421670|\n", + "| 320|421675|\n", + "| 325|421679|\n", + "| 324|421681|\n", + "| 319|421687|\n", + "| 323|421687|\n", + "| 321|421699|\n", + "| 318|421751|\n", + "| 315|421832|\n", + "| 310|421867|\n", + "| 316|421897|\n", + "| 312|421903|\n", + "| 317|421911|\n", + "| 314|421918|\n", + "| 313|421920|\n", + "| 311|421950|\n", + "| 309|421972|\n", + "| 307|421988|\n", + "| 308|422019|\n", + "| 305|422072|\n", + "| 303|422083|\n", + "| 306|422091|\n", + "| 304|422095|\n", + "| 302|422097|\n", + "| 298|422103|\n", + "| 300|422114|\n", + "| 301|422116|\n", + "| 295|422134|\n", + "| 299|422155|\n", + "| 296|422155|\n", + "| 290|422185|\n", + "| 297|422193|\n", + "| 294|422194|\n", + "| 292|422207|\n", + "| 291|422218|\n", + "| 293|422236|\n", + "| 288|422238|\n", + "| 286|422255|\n", + "| 289|422265|\n", + "| 287|422266|\n", + "| 285|422305|\n", + "| 283|422307|\n", + "| 284|422346|\n", + "| 282|422350|\n", + "| 281|422354|\n", + "| 280|422372|\n", + "| 279|422415|\n", + "| 278|422498|\n", + "| 277|422501|\n", + "| 276|422508|\n", + "| 275|422549|\n", + "| 274|422557|\n", + "| 273|422591|\n", + "| 272|422625|\n", + "| 270|422634|\n", + "| 269|422671|\n", + "| 268|422673|\n", + "| 271|422692|\n", + "| 267|422694|\n", + "| 265|422761|\n", + "| 262|422777|\n", + "| 260|422788|\n", + "| 263|422795|\n", + "| 266|422803|\n", + "| 264|422807|\n", + "| 258|422838|\n", + "| 259|422839|\n", + "| 261|422841|\n", + "| 257|422852|\n", + "| 256|422891|\n", + "| 252|422904|\n", + "| 255|422925|\n", + "| 254|422986|\n", + "| 253|423003|\n", + "| 250|423197|\n", + "| 251|423202|\n", + "| 248|423231|\n", + "| 249|423262|\n", + "| 246|423376|\n", + "| 247|423402|\n", + "| 245|423403|\n", + "| 244|423762|\n", + "| 43|457702|\n", + "| 443|569570|\n", + "| 442|570154|\n", + "| 441|570301|\n", + "| 440|570372|\n", + "| 439|570572|\n", + "| 438|570655|\n", + "| 436|570763|\n", + "| 437|570781|\n", + "| 434|570870|\n", + "| 435|570872|\n", + "| 433|570953|\n", + "| 432|570979|\n", + "| 431|571069|\n", + "| 429|571096|\n", + "| 430|571097|\n", + "| 428|571127|\n", + "| 427|571153|\n", + "| 426|571185|\n", + "| 425|571201|\n", + "| 424|571286|\n", + "| 423|571425|\n", + "| 422|571449|\n", + "| 417|571506|\n", + "| 420|571528|\n", + "| 421|571532|\n", + "| 419|571553|\n", + "| 418|571585|\n", + "| 416|571595|\n", + "| 414|571645|\n", + "| 415|571657|\n", + "| 413|571742|\n", + "| 412|571766|\n", + "| 411|571796|\n", + "| 409|571842|\n", + "| 410|571847|\n", + "| 407|571874|\n", + "| 408|571913|\n", + "| 406|571925|\n", + "| 405|571966|\n", + "| 404|571983|\n", + "| 402|571993|\n", + "| 403|572020|\n", + "| 401|572123|\n", + "| 397|572181|\n", + "| 400|572182|\n", + "| 399|572183|\n", + "| 398|572189|\n", + "| 396|572212|\n", + "| 395|572244|\n", + "| 393|572249|\n", + "| 394|572276|\n", + "| 392|572302|\n", + "| 391|572344|\n", + "| 390|572361|\n", + "| 389|572382|\n", + "| 388|572394|\n", + "| 387|572428|\n", + "| 386|572438|\n", + "| 385|572493|\n", + "| 383|572545|\n", + "| 384|572565|\n", + "| 382|572569|\n", + "| 381|572600|\n", + "| 380|572604|\n", + "| 379|572634|\n", + "| 378|572647|\n", + "| 375|572742|\n", + "| 376|572742|\n", + "| 377|572755|\n", + "| 374|572798|\n", + "| 372|572800|\n", + "| 373|572816|\n", + "| 371|572868|\n", + "| 370|572895|\n", + "| 369|572907|\n", + "| 368|572924|\n", + "| 367|572957|\n", + "| 366|573022|\n", + "| 364|573102|\n", + "| 365|573104|\n", + "| 362|573132|\n", + "| 363|573145|\n", + "| 361|573173|\n", + "| 360|573187|\n", + "| 358|573262|\n", + "| 359|573270|\n", + "| 357|573334|\n", + "| 356|573372|\n", + "| 355|573425|\n", + "| 354|573556|\n", + "| 353|573584|\n", + "| 352|573658|\n", + "| 351|573676|\n", + "| 350|573781|\n", + "| 349|573977|\n", + "| 347|574013|\n", + "| 348|574040|\n", + "| 346|574185|\n", + "| 345|574318|\n", + "| 344|574727|\n", "+-----------+------+\n", "\n" ] @@ -780,656 +790,16 @@ "execution_count": 6, "id": "abf8091a-9662-4378-8fe5-b2ece46a6a14", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Stage 34:=====================================================>(533 + 1) / 534]\r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Partitions: 600\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Stage 37:====================================================> (584 + 8) / 600]\r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-----------+------+\n", - "|partitionId| count|\n", - "+-----------+------+\n", - "| 263|362150|\n", - "| 258|362151|\n", - "| 265|362151|\n", - "| 256|362151|\n", - "| 259|362151|\n", - "| 255|362152|\n", - "| 267|362152|\n", - "| 266|362152|\n", - "| 257|362152|\n", - "| 262|362152|\n", - "| 260|362152|\n", - "| 264|362152|\n", - "| 254|362153|\n", - "| 261|362153|\n", - "| 181|362154|\n", - "| 268|362154|\n", - "| 179|362154|\n", - "| 188|362154|\n", - "| 180|362154|\n", - "| 250|362155|\n", - "| 183|362155|\n", - "| 269|362155|\n", - "| 272|362155|\n", - "| 189|362155|\n", - "| 251|362155|\n", - "| 186|362155|\n", - "| 253|362155|\n", - "| 252|362155|\n", - "| 172|362155|\n", - "| 173|362155|\n", - "| 178|362155|\n", - "| 182|362155|\n", - "| 273|362155|\n", - "| 184|362155|\n", - "| 187|362155|\n", - "| 277|362156|\n", - "| 285|362156|\n", - "| 232|362156|\n", - "| 245|362156|\n", - "| 190|362156|\n", - "| 219|362156|\n", - "| 554|362156|\n", - "| 287|362156|\n", - "| 177|362156|\n", - "| 286|362156|\n", - "| 270|362156|\n", - "| 276|362156|\n", - "| 175|362156|\n", - "| 557|362156|\n", - "| 274|362156|\n", - "| 275|362156|\n", - "| 174|362156|\n", - "| 241|362156|\n", - "| 243|362156|\n", - "| 278|362156|\n", - "| 223|362156|\n", - "| 290|362156|\n", - "| 238|362156|\n", - "| 185|362156|\n", - "| 291|362156|\n", - "| 239|362157|\n", - "| 222|362157|\n", - "| 559|362157|\n", - "| 249|362157|\n", - "| 288|362157|\n", - "| 233|362157|\n", - "| 237|362157|\n", - "| 240|362157|\n", - "| 289|362157|\n", - "| 246|362157|\n", - "| 558|362157|\n", - "| 224|362157|\n", - "| 221|362157|\n", - "| 556|362157|\n", - "| 11|362157|\n", - "| 271|362157|\n", - "| 231|362157|\n", - "| 248|362157|\n", - "| 280|362157|\n", - "| 555|362157|\n", - "| 247|362157|\n", - "| 162|362157|\n", - "| 292|362157|\n", - "| 176|362157|\n", - "| 565|362157|\n", - "| 279|362158|\n", - "| 163|362158|\n", - "| 532|362158|\n", - "| 166|362158|\n", - "| 283|362158|\n", - "| 293|362158|\n", - "| 225|362158|\n", - "| 167|362158|\n", - "| 191|362158|\n", - "| 560|362158|\n", - "| 553|362158|\n", - "| 599|362158|\n", - "| 226|362158|\n", - "| 165|362158|\n", - "| 534|362158|\n", - "| 564|362158|\n", - "| 561|362158|\n", - "| 242|362158|\n", - "| 531|362158|\n", - "| 566|362158|\n", - "| 218|362158|\n", - "| 281|362158|\n", - "| 236|362158|\n", - "| 282|362158|\n", - "| 0|362158|\n", - "| 168|362158|\n", - "| 220|362158|\n", - "| 161|362158|\n", - "| 244|362158|\n", - "| 234|362158|\n", - "| 164|362158|\n", - "| 535|362158|\n", - "| 550|362158|\n", - "| 171|362158|\n", - "| 230|362158|\n", - "| 563|362158|\n", - "| 533|362158|\n", - "| 157|362159|\n", - "| 597|362159|\n", - "| 598|362159|\n", - "| 1|362159|\n", - "| 551|362159|\n", - "| 549|362159|\n", - "| 209|362159|\n", - "| 210|362159|\n", - "| 569|362159|\n", - "| 216|362159|\n", - "| 235|362159|\n", - "| 5|362159|\n", - "| 160|362159|\n", - "| 490|362159|\n", - "| 227|362159|\n", - "| 488|362159|\n", - "| 228|362159|\n", - "| 489|362159|\n", - "| 536|362159|\n", - "| 204|362159|\n", - "| 192|362159|\n", - "| 159|362159|\n", - "| 567|362159|\n", - "| 552|362159|\n", - "| 284|362159|\n", - "| 2|362159|\n", - "| 13|362159|\n", - "| 294|362159|\n", - "| 211|362159|\n", - "| 215|362159|\n", - "| 562|362159|\n", - "| 6|362159|\n", - "| 12|362159|\n", - "| 530|362159|\n", - "| 568|362159|\n", - "| 7|362159|\n", - "| 3|362159|\n", - "| 229|362159|\n", - "| 537|362159|\n", - "| 169|362159|\n", - "| 217|362159|\n", - "| 208|362159|\n", - "| 205|362160|\n", - "| 538|362160|\n", - "| 158|362160|\n", - "| 197|362160|\n", - "| 8|362160|\n", - "| 194|362160|\n", - "| 10|362160|\n", - "| 491|362160|\n", - "| 591|362160|\n", - "| 156|362160|\n", - "| 526|362160|\n", - "| 539|362160|\n", - "| 214|362160|\n", - "| 492|362160|\n", - "| 207|362160|\n", - "| 523|362160|\n", - "| 170|362160|\n", - "| 206|362160|\n", - "| 525|362160|\n", - "| 212|362160|\n", - "| 571|362160|\n", - "| 196|362160|\n", - "| 193|362160|\n", - "| 572|362160|\n", - "| 4|362160|\n", - "| 296|362160|\n", - "| 547|362160|\n", - "| 295|362160|\n", - "| 570|362160|\n", - "| 486|362161|\n", - "| 155|362161|\n", - "| 589|362161|\n", - "| 403|362161|\n", - "| 404|362161|\n", - "| 510|362161|\n", - "| 545|362161|\n", - "| 25|362161|\n", - "| 203|362161|\n", - "| 515|362161|\n", - "| 590|362161|\n", - "| 300|362161|\n", - "| 213|362161|\n", - "| 512|362161|\n", - "| 387|362161|\n", - "| 405|362161|\n", - "| 529|362161|\n", - "| 514|362161|\n", - "| 307|362161|\n", - "| 493|362161|\n", - "| 543|362161|\n", - "| 509|362161|\n", - "| 527|362161|\n", - "| 544|362161|\n", - "| 596|362161|\n", - "| 511|362161|\n", - "| 15|362161|\n", - "| 593|362161|\n", - "| 595|362161|\n", - "| 151|362161|\n", - "| 195|362161|\n", - "| 592|362161|\n", - "| 487|362161|\n", - "| 383|362161|\n", - "| 152|362161|\n", - "| 298|362161|\n", - "| 105|362161|\n", - "| 508|362161|\n", - "| 202|362161|\n", - "| 297|362161|\n", - "| 9|362161|\n", - "| 522|362161|\n", - "| 199|362161|\n", - "| 540|362161|\n", - "| 506|362161|\n", - "| 594|362161|\n", - "| 573|362161|\n", - "| 153|362161|\n", - "| 407|362161|\n", - "| 517|362161|\n", - "| 513|362161|\n", - "| 441|362161|\n", - "| 198|362161|\n", - "| 14|362161|\n", - "| 548|362161|\n", - "| 101|362161|\n", - "| 306|362161|\n", - "| 516|362162|\n", - "| 312|362162|\n", - "| 582|362162|\n", - "| 576|362162|\n", - "| 497|362162|\n", - "| 299|362162|\n", - "| 390|362162|\n", - "| 26|362162|\n", - "| 580|362162|\n", - "| 581|362162|\n", - "| 435|362162|\n", - "| 541|362162|\n", - "| 110|362162|\n", - "| 106|362162|\n", - "| 100|362162|\n", - "| 301|362162|\n", - "| 500|362162|\n", - "| 442|362162|\n", - "| 308|362162|\n", - "| 546|362162|\n", - "| 389|362162|\n", - "| 431|362162|\n", - "| 432|362162|\n", - "| 438|362162|\n", - "| 129|362162|\n", - "| 507|362162|\n", - "| 408|362162|\n", - "| 528|362162|\n", - "| 112|362162|\n", - "| 111|362162|\n", - "| 519|362162|\n", - "| 27|362162|\n", - "| 433|362162|\n", - "| 64|362162|\n", - "| 496|362162|\n", - "| 382|362162|\n", - "| 200|362162|\n", - "| 109|362162|\n", - "| 378|362162|\n", - "| 574|362162|\n", - "| 406|362162|\n", - "| 107|362162|\n", - "| 17|362162|\n", - "| 385|362162|\n", - "| 201|362162|\n", - "| 440|362162|\n", - "| 583|362162|\n", - "| 16|362162|\n", - "| 495|362162|\n", - "| 114|362162|\n", - "| 104|362162|\n", - "| 384|362162|\n", - "| 388|362162|\n", - "| 439|362162|\n", - "| 434|362162|\n", - "| 309|362162|\n", - "| 313|362162|\n", - "| 302|362162|\n", - "| 524|362162|\n", - "| 108|362162|\n", - "| 386|362162|\n", - "| 394|362163|\n", - "| 24|362163|\n", - "| 398|362163|\n", - "| 85|362163|\n", - "| 314|362163|\n", - "| 400|362163|\n", - "| 28|362163|\n", - "| 102|362163|\n", - "| 436|362163|\n", - "| 372|362163|\n", - "| 113|362163|\n", - "| 542|362163|\n", - "| 504|362163|\n", - "| 412|362163|\n", - "| 65|362163|\n", - "| 505|362163|\n", - "| 373|362163|\n", - "| 499|362163|\n", - "| 503|362163|\n", - "| 131|362163|\n", - "| 381|362163|\n", - "| 498|362163|\n", - "| 86|362163|\n", - "| 399|362163|\n", - "| 494|362163|\n", - "| 18|362163|\n", - "| 521|362163|\n", - "| 401|362163|\n", - "| 89|362163|\n", - "| 375|362163|\n", - "| 311|362163|\n", - "| 586|362163|\n", - "| 585|362163|\n", - "| 154|362163|\n", - "| 402|362163|\n", - "| 94|362163|\n", - "| 128|362163|\n", - "| 395|362163|\n", - "| 518|362163|\n", - "| 70|362163|\n", - "| 579|362163|\n", - "| 501|362163|\n", - "| 304|362163|\n", - "| 575|362163|\n", - "| 502|362163|\n", - "| 127|362163|\n", - "| 71|362163|\n", - "| 379|362163|\n", - "| 587|362163|\n", - "| 103|362163|\n", - "| 437|362163|\n", - "| 584|362163|\n", - "| 130|362163|\n", - "| 305|362163|\n", - "| 115|362163|\n", - "| 588|362163|\n", - "| 520|362163|\n", - "| 409|362163|\n", - "| 377|362163|\n", - "| 96|362164|\n", - "| 391|362164|\n", - "| 359|362164|\n", - "| 397|362164|\n", - "| 145|362164|\n", - "| 149|362164|\n", - "| 410|362164|\n", - "| 310|362164|\n", - "| 87|362164|\n", - "| 133|362164|\n", - "| 22|362164|\n", - "| 77|362164|\n", - "| 19|362164|\n", - "| 360|362164|\n", - "| 471|362164|\n", - "| 411|362164|\n", - "| 29|362164|\n", - "| 371|362164|\n", - "| 90|362164|\n", - "| 392|362164|\n", - "| 376|362164|\n", - "| 444|362164|\n", - "| 132|362164|\n", - "| 374|362164|\n", - "| 83|362164|\n", - "| 72|362164|\n", - "| 88|362164|\n", - "| 363|362164|\n", - "| 466|362164|\n", - "| 66|362164|\n", - "| 67|362164|\n", - "| 449|362164|\n", - "| 144|362164|\n", - "| 20|362164|\n", - "| 353|362164|\n", - "| 443|362164|\n", - "| 315|362164|\n", - "| 393|362164|\n", - "| 362|362164|\n", - "| 578|362164|\n", - "| 69|362164|\n", - "| 467|362164|\n", - "| 63|362164|\n", - "| 396|362164|\n", - "| 143|362164|\n", - "| 577|362164|\n", - "| 303|362164|\n", - "| 116|362164|\n", - "| 23|362164|\n", - "| 380|362164|\n", - "| 93|362165|\n", - "| 413|362165|\n", - "| 30|362165|\n", - "| 448|362165|\n", - "| 361|362165|\n", - "| 364|362165|\n", - "| 464|362165|\n", - "| 455|362165|\n", - "| 473|362165|\n", - "| 316|362165|\n", - "| 451|362165|\n", - "| 146|362165|\n", - "| 150|362165|\n", - "| 68|362165|\n", - "| 454|362165|\n", - "| 123|362165|\n", - "| 348|362165|\n", - "| 124|362165|\n", - "| 99|362165|\n", - "| 134|362165|\n", - "| 126|362165|\n", - "| 429|362165|\n", - "| 84|362165|\n", - "| 91|362165|\n", - "| 97|362165|\n", - "| 82|362165|\n", - "| 76|362165|\n", - "| 92|362165|\n", - "| 21|362165|\n", - "| 140|362165|\n", - "| 75|362165|\n", - "| 61|362165|\n", - "| 354|362165|\n", - "| 370|362165|\n", - "| 147|362165|\n", - "| 79|362165|\n", - "| 118|362165|\n", - "| 475|362165|\n", - "| 468|362165|\n", - "| 142|362165|\n", - "| 136|362165|\n", - "| 120|362165|\n", - "| 430|362165|\n", - "| 125|362165|\n", - "| 95|362165|\n", - "| 352|362165|\n", - "| 135|362165|\n", - "| 445|362165|\n", - "| 78|362165|\n", - "| 73|362166|\n", - "| 416|362166|\n", - "| 81|362166|\n", - "| 369|362166|\n", - "| 456|362166|\n", - "| 55|362166|\n", - "| 350|362166|\n", - "| 470|362166|\n", - "| 428|362166|\n", - "| 137|362166|\n", - "| 427|362166|\n", - "| 80|362166|\n", - "| 148|362166|\n", - "| 32|362166|\n", - "| 117|362166|\n", - "| 141|362166|\n", - "| 414|362166|\n", - "| 469|362166|\n", - "| 347|362166|\n", - "| 56|362166|\n", - "| 366|362166|\n", - "| 358|362166|\n", - "| 368|362166|\n", - "| 365|362166|\n", - "| 450|362166|\n", - "| 474|362166|\n", - "| 138|362166|\n", - "| 74|362166|\n", - "| 59|362166|\n", - "| 317|362166|\n", - "| 446|362166|\n", - "| 355|362166|\n", - "| 465|362166|\n", - "| 459|362166|\n", - "| 62|362166|\n", - "| 351|362166|\n", - "| 477|362166|\n", - "| 476|362166|\n", - "| 457|362166|\n", - "| 447|362166|\n", - "| 426|362166|\n", - "| 461|362166|\n", - "| 460|362166|\n", - "| 472|362166|\n", - "| 462|362166|\n", - "| 31|362166|\n", - "| 122|362166|\n", - "| 453|362166|\n", - "| 57|362166|\n", - "| 121|362166|\n", - "| 98|362166|\n", - "| 33|362167|\n", - "| 483|362167|\n", - "| 54|362167|\n", - "| 479|362167|\n", - "| 452|362167|\n", - "| 356|362167|\n", - "| 463|362167|\n", - "| 53|362167|\n", - "| 60|362167|\n", - "| 484|362167|\n", - "| 478|362167|\n", - "| 58|362167|\n", - "| 52|362167|\n", - "| 119|362167|\n", - "| 357|362167|\n", - "| 422|362167|\n", - "| 420|362167|\n", - "| 318|362167|\n", - "| 139|362167|\n", - "| 419|362167|\n", - "| 367|362167|\n", - "| 415|362167|\n", - "| 458|362167|\n", - "| 425|362167|\n", - "| 319|362167|\n", - "| 349|362167|\n", - "| 418|362167|\n", - "| 328|362167|\n", - "| 485|362168|\n", - "| 480|362168|\n", - "| 421|362168|\n", - "| 329|362168|\n", - "| 423|362168|\n", - "| 326|362168|\n", - "| 481|362168|\n", - "| 424|362168|\n", - "| 417|362168|\n", - "| 51|362168|\n", - "| 331|362168|\n", - "| 482|362168|\n", - "| 322|362168|\n", - "| 320|362168|\n", - "| 324|362168|\n", - "| 321|362168|\n", - "| 323|362168|\n", - "| 37|362169|\n", - "| 330|362169|\n", - "| 50|362169|\n", - "| 346|362169|\n", - "| 38|362169|\n", - "| 325|362169|\n", - "| 35|362169|\n", - "| 34|362169|\n", - "| 327|362169|\n", - "| 332|362170|\n", - "| 336|362170|\n", - "| 345|362170|\n", - "| 334|362170|\n", - "| 40|362170|\n", - "| 48|362170|\n", - "| 39|362170|\n", - "| 333|362170|\n", - "| 47|362171|\n", - "| 44|362171|\n", - "| 49|362171|\n", - "| 43|362171|\n", - "| 338|362171|\n", - "| 337|362171|\n", - "| 41|362171|\n", - "| 335|362171|\n", - "| 42|362171|\n", - "| 46|362171|\n", - "| 36|362171|\n", - "| 343|362172|\n", - "| 339|362172|\n", - "| 45|362172|\n", - "| 340|362173|\n", - "| 341|362173|\n", - "| 342|362173|\n", - "| 344|362173|\n", - "+-----------+------+\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - } - ], + "outputs": [], "source": [ "# repartitioning to 600 partitions, seems to be balanced now. \n", "df_all = df_all.repartition(600)\n", - "displaypartitions(df_all)" + "#displaypartitions(df_all)" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "id": "9c7c7fa9-7a39-46eb-93fd-c7006d01c03e", "metadata": {}, "outputs": [], @@ -1504,7 +874,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "id": "540c7bff-6eac-40c2-a9fe-9b6843f7d546", "metadata": {}, "outputs": [], @@ -1524,7 +894,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 9, "id": "3ec0caae-bb26-42a5-86ae-93710741c4e8", "metadata": {}, "outputs": [], @@ -1535,7 +905,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 10, "id": "855586ae-d82a-4cc9-97f2-2d1b0a3d5c0d", "metadata": {}, "outputs": [ @@ -1566,7 +936,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 11, "id": "40183d3d-13da-4f01-aa6b-d6b47426b5e8", "metadata": {}, "outputs": [], @@ -1623,7 +993,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 12, "id": "14b0b19a-36ad-4e17-bf30-1e9fcdaea452", "metadata": {}, "outputs": [], @@ -1635,7 +1005,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 13, "id": "388446e2-2cd4-4bb8-bfa0-f204b1359427", "metadata": {}, "outputs": [], @@ -1648,7 +1018,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 14, "id": "3e028095-2ff1-4e5f-aa03-7dafe417168e", "metadata": {}, "outputs": [ @@ -1658,7 +1028,7 @@ "DataFrame[day: int, month: int, year: int, area: int, total_counts: bigint]" ] }, - "execution_count": 8, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1732,7 +1102,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 15, "id": "d57f2088-9ca6-4cc8-a8fe-0cca835fecf9", "metadata": {}, "outputs": [ @@ -1740,7 +1110,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "23/11/22 22:08:31 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n", + "23/11/24 16:13:18 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n", " \r" ] }, @@ -1748,30 +1118,30 @@ "name": "stdout", "output_type": "stream", "text": [ - "+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+\n", - "|day|month|year| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 11| 13| 14| 16| 19| 20| 21| 22| 23| 24| 25| 26| 27| 28| 29| 30| 31| 32| 33| 34| 35| 36| 38| 39| 41| 42| 43| 44| 46| 50| 52| 56| 59| 60| 61| 62| 63| 65| 66| 67| 68| 69| 70| 71| 72| 75| 76| 77|\n", - "+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+\n", - "| 2| 7|2021|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|\n", - "| 4| 12|2022|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|\n", - "| 5| 10|2021|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", - "| 3| 5|2023|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null| 1|null|null| 2|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", - "| 2| 12|2018|null|null|null|null|null|null|null|null|null|null|null|null| 1| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", - "| 5| 6|2023|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 2|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", - "| 5| 7|2019|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", - "| 6| 6|2021|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", - "| 3| 11|2022|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 2|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|\n", - "| 4| 2|2023|null|null|null|null|null|null|null| 2|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", - "| 7| 5|2022|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", - "| 6| 1|2021|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", - "| 5| 1|2023|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 2|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", - "| 1| 1|2021|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|\n", - "| 6| 4|2019|null|null|null|null|null|null| 2|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", - "| 5| 12|2021|null|null|null|null|null| 1| 1| 1|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", - "| 1| 12|2021|null|null|null|null|null|null| 1| 1|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", - "| 6| 2|2019|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", - "| 4| 4|2023|null|null|null|null|null|null|null| 2|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", - "| 2| 3|2019|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", - "+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+\n", + "+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+\n", + "|day|month|year| 1| 2| 3| 4| 5| 6| 7| 8| 11| 13| 14| 15| 16| 17| 19| 21| 22| 23| 24| 25| 26| 27| 28| 29| 30| 31| 32| 33| 34| 35| 36| 39| 40| 41| 42| 43| 44| 45| 46| 48| 49| 51| 53| 56| 57| 58| 60| 61| 62| 63| 64| 65| 66| 67| 68| 69| 70| 71| 72| 73| 75| 76| 77|\n", + "+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+\n", + "| 2| 7|2021|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 5| 10|2021|null|null| 1|null|null|null|null| 2|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|\n", + "| 4| 12|2022|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 2| 12|2018|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 3| 5|2023|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 3| 4|2021|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 5| 6|2023|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 5| 7|2019|null|null|null| 1|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|\n", + "| 7| 12|2022|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 6| 6|2021|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 5| 5|2021|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 2| 2|2021|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 3| 11|2022|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 4| 2|2023|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 6| 2|2021|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 7| 8|2021|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 5| 1|2023|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 6| 1|2021|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "| 7| 5|2022|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null| 1|null|\n", + "| 1| 1|2021|null|null|null|null|null|null|null|null| 1|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|\n", + "+---+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+\n", "only showing top 20 rows\n", "\n" ] @@ -1790,40 +1160,6 @@ "pivoted_df.show()" ] }, - { - "cell_type": "code", - "execution_count": 44, - "id": "3a65e426-7aef-4891-942d-538025cd845e", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "ename": "AnalysisException", - "evalue": "Cannot resolve column name \"count\" among (ID, start_timestamp, end_timestamp, seconds, miles, pickup_tract, dropoff_tract, pickup_area, dropoff_area, Fare, Tip, total, pickup_lat, pickup_lon, dropoff_lat, dropoff_lon, month, day_of_month, hour, day, year, Hyde_Park, Kenwood, Woodlawn)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAnalysisException\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[44], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# the output of the sample df above looks off. investigate\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# pivot so that each area is a column\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# should probably create a new variable that denotes in program rides, and figure out what combination of pickup or dropoff area we want to u\u001b[39;00m\n\u001b[0;32m----> 5\u001b[0m pivoted_df \u001b[38;5;241m=\u001b[39m \u001b[43msample_df\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroupBy\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mday\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmonth\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43myear\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpivot\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdropoff_area\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcount\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfirst\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/lib/spark/python/pyspark/sql/group.py:114\u001b[0m, in \u001b[0;36mGroupedData.agg\u001b[0;34m(self, *exprs)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m exprs, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mexprs should not be empty\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(exprs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(exprs[\u001b[38;5;241m0\u001b[39m], \u001b[38;5;28mdict\u001b[39m):\n\u001b[0;32m--> 114\u001b[0m jdf \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jgd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexprs\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 116\u001b[0m \u001b[38;5;66;03m# Columns\u001b[39;00m\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mall\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(c, Column) \u001b[38;5;28;01mfor\u001b[39;00m c \u001b[38;5;129;01min\u001b[39;00m exprs), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mall exprs should be Column\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", - "File \u001b[0;32m/usr/lib/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py:1304\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1298\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1299\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1300\u001b[0m args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1301\u001b[0m proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[1;32m 1303\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client\u001b[38;5;241m.\u001b[39msend_command(command)\n\u001b[0;32m-> 1304\u001b[0m return_value \u001b[38;5;241m=\u001b[39m \u001b[43mget_return_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1305\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgateway_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1307\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n\u001b[1;32m 1308\u001b[0m temp_arg\u001b[38;5;241m.\u001b[39m_detach()\n", - "File \u001b[0;32m/usr/lib/spark/python/pyspark/sql/utils.py:117\u001b[0m, in \u001b[0;36mcapture_sql_exception..deco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 113\u001b[0m converted \u001b[38;5;241m=\u001b[39m convert_exception(e\u001b[38;5;241m.\u001b[39mjava_exception)\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(converted, UnknownException):\n\u001b[1;32m 115\u001b[0m \u001b[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;66;03m# JVM exception message.\u001b[39;00m\n\u001b[0;32m--> 117\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m converted \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 119\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n", - "\u001b[0;31mAnalysisException\u001b[0m: Cannot resolve column name \"count\" among (ID, start_timestamp, end_timestamp, seconds, miles, pickup_tract, dropoff_tract, pickup_area, dropoff_area, Fare, Tip, total, pickup_lat, pickup_lon, dropoff_lat, dropoff_lon, month, day_of_month, hour, day, year, Hyde_Park, Kenwood, Woodlawn)" - ] - } - ], - "source": [ - "# the output of the sample df above looks off. investigate\n", - "\n", - "#pivoted_df = sample_df.groupBy(\"day\",\"month\",\"year\").pivot(\"dropoff_area\").agg({\"count\": \"first\"})" - ] - }, { "cell_type": "code", "execution_count": null, @@ -1836,18 +1172,10 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 16, "id": "b501c963-15c8-4341-b9c6-7d2f07cc5015", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - } - ], + "outputs": [], "source": [ "df_weather_1 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2018-01-01 to 2020-01-01.csv\", inferSchema=True, header=True)\n", "df_weather_2 = spark.read.csv(\"gs://msca-bdp-student-gcs/bdp-rideshare-project/weather/chicago 2020-01-01 to 2022-08-31.csv\", inferSchema=True, header=True)\n", @@ -1858,7 +1186,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 18, "id": "2858a6da-ef1b-4561-b8e2-f93073b8e803", "metadata": {}, "outputs": [ @@ -1868,35 +1196,210 @@ "text": [ "root\n", " |-- name: string (nullable = true)\n", - " |-- datetime: date (nullable = true)\n", + " |-- datetime: string (nullable = true)\n", " |-- temp: double (nullable = true)\n", " |-- precip: double (nullable = true)\n", " |-- snow: double (nullable = true)\n", " |-- snowdepth: double (nullable = true)\n", " |-- sunset: timestamp (nullable = true)\n", + "\n", + "root\n", + " |-- day: integer (nullable = true)\n", + " |-- month: integer (nullable = true)\n", + " |-- year: integer (nullable = true)\n", + " |-- 1: long (nullable = true)\n", + " |-- 2: long (nullable = true)\n", + " |-- 3: long (nullable = true)\n", + " |-- 4: long (nullable = true)\n", + " |-- 5: long (nullable = true)\n", + " |-- 6: long (nullable = true)\n", + " |-- 7: long (nullable = true)\n", + " |-- 8: long (nullable = true)\n", + " |-- 11: long (nullable = true)\n", + " |-- 13: long (nullable = true)\n", + " |-- 14: long (nullable = true)\n", + " |-- 15: long (nullable = true)\n", + " |-- 16: long (nullable = true)\n", + " |-- 17: long (nullable = true)\n", + " |-- 19: long (nullable = true)\n", + " |-- 21: long (nullable = true)\n", + " |-- 22: long (nullable = true)\n", + " |-- 23: long (nullable = true)\n", + " |-- 24: long (nullable = true)\n", + " |-- 25: long (nullable = true)\n", + " |-- 26: long (nullable = true)\n", + " |-- 27: long (nullable = true)\n", + " |-- 28: long (nullable = true)\n", + " |-- 29: long (nullable = true)\n", + " |-- 30: long (nullable = true)\n", + " |-- 31: long (nullable = true)\n", + " |-- 32: long (nullable = true)\n", + " |-- 33: long (nullable = true)\n", + " |-- 34: long (nullable = true)\n", + " |-- 35: long (nullable = true)\n", + " |-- 36: long (nullable = true)\n", + " |-- 39: long (nullable = true)\n", + " |-- 40: long (nullable = true)\n", + " |-- 41: long (nullable = true)\n", + " |-- 42: long (nullable = true)\n", + " |-- 43: long (nullable = true)\n", + " |-- 44: long (nullable = true)\n", + " |-- 45: long (nullable = true)\n", + " |-- 46: long (nullable = true)\n", + " |-- 48: long (nullable = true)\n", + " |-- 49: long (nullable = true)\n", + " |-- 51: long (nullable = true)\n", + " |-- 53: long (nullable = true)\n", + " |-- 56: long (nullable = true)\n", + " |-- 57: long (nullable = true)\n", + " |-- 58: long (nullable = true)\n", + " |-- 60: long (nullable = true)\n", + " |-- 61: long (nullable = true)\n", + " |-- 62: long (nullable = true)\n", + " |-- 63: long (nullable = true)\n", + " |-- 64: long (nullable = true)\n", + " |-- 65: long (nullable = true)\n", + " |-- 66: long (nullable = true)\n", + " |-- 67: long (nullable = true)\n", + " |-- 68: long (nullable = true)\n", + " |-- 69: long (nullable = true)\n", + " |-- 70: long (nullable = true)\n", + " |-- 71: long (nullable = true)\n", + " |-- 72: long (nullable = true)\n", + " |-- 73: long (nullable = true)\n", + " |-- 75: long (nullable = true)\n", + " |-- 76: long (nullable = true)\n", + " |-- 77: long (nullable = true)\n", "\n" ] } ], "source": [ "df_weather = df_weather.select('name', 'datetime', 'temp','precip','snow','snowdepth','sunset')\n", - "df_weather = df_weather.withColumn('datetime',F.to_date(df_weather['datetime'], \"yyyy-MM-dd\"))\n", + "#df_weather = df_weather.withColumn('datetime',F.to_date(df_weather['datetime'], \"yyyy-MM-dd\"))\n", "df_weather.printSchema()\n", + "pivoted_df.printSchema()\n", "# name, datetime, temp, precip, snow, snowdepth, sunset.\n", "# merge on datetime- keep datetime as part of the pivot. " ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 27, "id": "c2d5f67e-1122-4208-b82a-b7fc8ef7f04d", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- day: integer (nullable = true)\n", + " |-- month: integer (nullable = true)\n", + " |-- year: integer (nullable = true)\n", + " |-- 1: long (nullable = true)\n", + " |-- 2: long (nullable = true)\n", + " |-- 3: long (nullable = true)\n", + " |-- 4: long (nullable = true)\n", + " |-- 5: long (nullable = true)\n", + " |-- 6: long (nullable = true)\n", + " |-- 7: long (nullable = true)\n", + " |-- 8: long (nullable = true)\n", + " |-- 11: long (nullable = true)\n", + " |-- 13: long (nullable = true)\n", + " |-- 14: long (nullable = true)\n", + " |-- 15: long (nullable = true)\n", + " |-- 16: long (nullable = true)\n", + " |-- 17: long (nullable = true)\n", + " |-- 19: long (nullable = true)\n", + " |-- 21: long (nullable = true)\n", + " |-- 22: long (nullable = true)\n", + " |-- 23: long (nullable = true)\n", + " |-- 24: long (nullable = true)\n", + " |-- 25: long (nullable = true)\n", + " |-- 26: long (nullable = true)\n", + " |-- 27: long (nullable = true)\n", + " |-- 28: long (nullable = true)\n", + " |-- 29: long (nullable = true)\n", + " |-- 30: long (nullable = true)\n", + " |-- 31: long (nullable = true)\n", + " |-- 32: long (nullable = true)\n", + " |-- 33: long (nullable = true)\n", + " |-- 34: long (nullable = true)\n", + " |-- 35: long (nullable = true)\n", + " |-- 36: long (nullable = true)\n", + " |-- 39: long (nullable = true)\n", + " |-- 40: long (nullable = true)\n", + " |-- 41: long (nullable = true)\n", + " |-- 42: long (nullable = true)\n", + " |-- 43: long (nullable = true)\n", + " |-- 44: long (nullable = true)\n", + " |-- 45: long (nullable = true)\n", + " |-- 46: long (nullable = true)\n", + " |-- 48: long (nullable = true)\n", + " |-- 49: long (nullable = true)\n", + " |-- 51: long (nullable = true)\n", + " |-- 53: long (nullable = true)\n", + " |-- 56: long (nullable = true)\n", + " |-- 57: long (nullable = true)\n", + " |-- 58: long (nullable = true)\n", + " |-- 60: long (nullable = true)\n", + " |-- 61: long (nullable = true)\n", + " |-- 62: long (nullable = true)\n", + " |-- 63: long (nullable = true)\n", + " |-- 64: long (nullable = true)\n", + " |-- 65: long (nullable = true)\n", + " |-- 66: long (nullable = true)\n", + " |-- 67: long (nullable = true)\n", + " |-- 68: long (nullable = true)\n", + " |-- 69: long (nullable = true)\n", + " |-- 70: long (nullable = true)\n", + " |-- 71: long (nullable = true)\n", + " |-- 72: long (nullable = true)\n", + " |-- 73: long (nullable = true)\n", + " |-- 75: long (nullable = true)\n", + " |-- 76: long (nullable = true)\n", + " |-- 77: long (nullable = true)\n", + " |-- datetime: date (nullable = true)\n", + "\n" + ] + } + ], + "source": [ + "pivoted_df = pivoted_df.withColumn('datetime', F.to_date(\n", + " F.concat_ws('-', F.col('year'), F.col('month'), F.col('day')),\n", + " 'yyyy-MM-dd'\n", + "))\n", + "pivoted_df.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "4d80ab44-61f4-4ef0-9209-7c05710e9023", + "metadata": {}, + "outputs": [], + "source": [ + "# Drop the 'day', 'month', and 'year' columns since have datetime now\n", + "pivoted_df = pivoted_df.drop('day', 'month', 'year')\n", + "\n", + "# Reorder the columns- datetime, in program rides and then other community areas\n", + "column_order = ['datetime', '39', '41', '42'] + [col for col in pivoted_df.columns if col not in ['datetime', '39', '41', '42']]\n", + "\n", + "pivoted_df = pivoted_df.select(column_order)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "408996cf-b9ff-4f5c-b6c8-8a5211eb3a2c", + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "23/11/22 22:25:03 WARN org.apache.spark.scheduler.TaskSetManager: Lost task 0.0 in stage 79.0 (TID 4836) (hub-msca-bdp-dphub-students-harshpachisia-sw-jprz.c.msca-bdp-student-ap.internal executor 13): org.apache.spark.SparkUpgradeException: You may get a different result due to the upgrading of Spark 3.0: Fail to parse '2022-1-7' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.\n", + "23/11/24 16:34:38 WARN org.apache.spark.scheduler.TaskSetManager: Lost task 0.0 in stage 186.0 (TID 13842) (hub-msca-bdp-dphub-students-harshpachisia-w-1.c.msca-bdp-student-ap.internal executor 9): org.apache.spark.SparkUpgradeException: You may get a different result due to the upgrading of Spark 3.0: Fail to parse '2021-10-5' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.\n", "\tat org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:150)\n", "\tat org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:141)\n", "\tat scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)\n", @@ -1905,7 +1408,7 @@ "\tat scala.Option.getOrElse(Option.scala:189)\n", "\tat org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.parse(TimestampFormatter.scala:77)\n", "\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.GetTimestamp_0$(Unknown Source)\n", - "\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_29$(Unknown Source)\n", + "\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_32$(Unknown Source)\n", "\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)\n", "\tat org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateResultProjection$5(AggregationIterator.scala:259)\n", "\tat org.apache.spark.sql.execution.aggregate.TungstenAggregationIterator.next(TungstenAggregationIterator.scala:417)\n", @@ -1924,75 +1427,31 @@ "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n", "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n", "\tat java.lang.Thread.run(Thread.java:750)\n", - "Caused by: java.time.format.DateTimeParseException: Text '2022-1-7' could not be parsed at index 5\n", + "Caused by: java.time.format.DateTimeParseException: Text '2021-10-5' could not be parsed at index 8\n", "\tat java.time.format.DateTimeFormatter.parseResolved0(DateTimeFormatter.java:1949)\n", "\tat java.time.format.DateTimeFormatter.parse(DateTimeFormatter.java:1777)\n", "\tat org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.$anonfun$parse$1(TimestampFormatter.scala:78)\n", "\t... 23 more\n", "\n", - "23/11/22 22:25:04 WARN org.apache.spark.scheduler.TaskSetManager: Lost task 0.2 in stage 79.0 (TID 4838) (hub-msca-bdp-dphub-students-harshpachisia-w-1.c.msca-bdp-student-ap.internal executor 19): org.apache.spark.SparkUpgradeException: You may get a different result due to the upgrading of Spark 3.0: Fail to parse '2022-8-1' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.\n", - "\tat org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:150)\n", - "\tat org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:141)\n", - "\tat scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)\n", - "\tat org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.$anonfun$parse$1(TimestampFormatter.scala:86)\n", - "\tat scala.runtime.java8.JFunction0$mcJ$sp.apply(JFunction0$mcJ$sp.java:23)\n", - "\tat scala.Option.getOrElse(Option.scala:189)\n", - "\tat org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.parse(TimestampFormatter.scala:77)\n", - "\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.GetTimestamp_0$(Unknown Source)\n", - "\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_29$(Unknown Source)\n", - "\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)\n", - "\tat org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateResultProjection$5(AggregationIterator.scala:259)\n", - "\tat org.apache.spark.sql.execution.aggregate.TungstenAggregationIterator.next(TungstenAggregationIterator.scala:417)\n", - "\tat org.apache.spark.sql.execution.aggregate.TungstenAggregationIterator.next(TungstenAggregationIterator.scala:82)\n", - "\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:346)\n", - "\tat org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)\n", - "\tat org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)\n", - "\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\n", - "\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)\n", - "\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:337)\n", - "\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)\n", - "\tat org.apache.spark.scheduler.Task.run(Task.scala:131)\n", - "\tat org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:505)\n", - "\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)\n", - "\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:508)\n", - "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n", - "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n", - "\tat java.lang.Thread.run(Thread.java:750)\n", - "Caused by: java.time.format.DateTimeParseException: Text '2022-8-1' could not be parsed at index 5\n", - "\tat java.time.format.DateTimeFormatter.parseResolved0(DateTimeFormatter.java:1949)\n", - "\tat java.time.format.DateTimeFormatter.parse(DateTimeFormatter.java:1777)\n", - "\tat org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.$anonfun$parse$1(TimestampFormatter.scala:78)\n", - "\t... 23 more\n", - "\n", - "23/11/22 22:25:05 ERROR org.apache.spark.scheduler.TaskSetManager: Task 0 in stage 79.0 failed 10 times; aborting job\n" - ] - }, - { - "ename": "Py4JJavaError", - "evalue": "An error occurred while calling o503.showString.\n: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 79.0 failed 10 times, most recent failure: Lost task 0.9 in stage 79.0 (TID 4845) (hub-msca-bdp-dphub-students-harshpachisia-w-1.c.msca-bdp-student-ap.internal executor 19): org.apache.spark.SparkUpgradeException: You may get a different result due to the upgrading of Spark 3.0: Fail to parse '2022-8-1' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.\n\tat org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:150)\n\tat org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:141)\n\tat scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)\n\tat org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.$anonfun$parse$1(TimestampFormatter.scala:86)\n\tat scala.runtime.java8.JFunction0$mcJ$sp.apply(JFunction0$mcJ$sp.java:23)\n\tat scala.Option.getOrElse(Option.scala:189)\n\tat org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.parse(TimestampFormatter.scala:77)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.GetTimestamp_0$(Unknown Source)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_29$(Unknown Source)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)\n\tat org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateResultProjection$5(AggregationIterator.scala:259)\n\tat org.apache.spark.sql.execution.aggregate.TungstenAggregationIterator.next(TungstenAggregationIterator.scala:417)\n\tat org.apache.spark.sql.execution.aggregate.TungstenAggregationIterator.next(TungstenAggregationIterator.scala:82)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:346)\n\tat org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)\n\tat org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:337)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:131)\n\tat org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:505)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:508)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\tat java.lang.Thread.run(Thread.java:750)\nCaused by: java.time.format.DateTimeParseException: Text '2022-8-1' could not be parsed at index 5\n\tat java.time.format.DateTimeFormatter.parseResolved0(DateTimeFormatter.java:1949)\n\tat java.time.format.DateTimeFormatter.parse(DateTimeFormatter.java:1777)\n\tat org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.$anonfun$parse$1(TimestampFormatter.scala:78)\n\t... 23 more\n\nDriver stacktrace:\n\tat org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2304)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2253)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2252)\n\tat scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)\n\tat scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)\n\tat org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2252)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1124)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1124)\n\tat scala.Option.foreach(Option.scala:407)\n\tat org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1124)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2491)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2433)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2422)\n\tat org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)\n\tat org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:902)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2204)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2225)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2244)\n\tat org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:472)\n\tat org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:425)\n\tat org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)\n\tat org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:278)\n\tat org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3709)\n\tat org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2735)\n\tat org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3700)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)\n\tat org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)\n\tat org.apache.spark.sql.Dataset.withAction(Dataset.scala:3698)\n\tat org.apache.spark.sql.Dataset.head(Dataset.scala:2735)\n\tat org.apache.spark.sql.Dataset.take(Dataset.scala:2942)\n\tat org.apache.spark.sql.Dataset.getRows(Dataset.scala:302)\n\tat org.apache.spark.sql.Dataset.showString(Dataset.scala:339)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:750)\nCaused by: org.apache.spark.SparkUpgradeException: You may get a different result due to the upgrading of Spark 3.0: Fail to parse '2022-8-1' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.\n\tat org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:150)\n\tat org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:141)\n\tat scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)\n\tat org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.$anonfun$parse$1(TimestampFormatter.scala:86)\n\tat scala.runtime.java8.JFunction0$mcJ$sp.apply(JFunction0$mcJ$sp.java:23)\n\tat scala.Option.getOrElse(Option.scala:189)\n\tat org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.parse(TimestampFormatter.scala:77)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.GetTimestamp_0$(Unknown Source)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_29$(Unknown Source)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)\n\tat org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateResultProjection$5(AggregationIterator.scala:259)\n\tat org.apache.spark.sql.execution.aggregate.TungstenAggregationIterator.next(TungstenAggregationIterator.scala:417)\n\tat org.apache.spark.sql.execution.aggregate.TungstenAggregationIterator.next(TungstenAggregationIterator.scala:82)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:346)\n\tat org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)\n\tat org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:337)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:131)\n\tat org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:505)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:508)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\t... 1 more\nCaused by: java.time.format.DateTimeParseException: Text '2022-8-1' could not be parsed at index 5\n\tat java.time.format.DateTimeFormatter.parseResolved0(DateTimeFormatter.java:1949)\n\tat java.time.format.DateTimeFormatter.parse(DateTimeFormatter.java:1777)\n\tat org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.$anonfun$parse$1(TimestampFormatter.scala:78)\n\t... 23 more\n", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mPy4JJavaError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[20], line 11\u001b[0m\n\u001b[1;32m 5\u001b[0m pivoted_df_with_date \u001b[38;5;241m=\u001b[39m pivoted_df\u001b[38;5;241m.\u001b[39mwithColumn(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdate_of_rides\u001b[39m\u001b[38;5;124m'\u001b[39m, to_date(\n\u001b[1;32m 6\u001b[0m concat_ws(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m-\u001b[39m\u001b[38;5;124m'\u001b[39m, col(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124myear\u001b[39m\u001b[38;5;124m'\u001b[39m), col(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmonth\u001b[39m\u001b[38;5;124m'\u001b[39m), col(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mday\u001b[39m\u001b[38;5;124m'\u001b[39m)),\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124myyyy-MM-dd\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 8\u001b[0m ))\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m# Show the DataFrame with the new date column\u001b[39;00m\n\u001b[0;32m---> 11\u001b[0m \u001b[43mpivoted_df_with_date\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshow\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/lib/spark/python/pyspark/sql/dataframe.py:484\u001b[0m, in \u001b[0;36mDataFrame.show\u001b[0;34m(self, n, truncate, vertical)\u001b[0m\n\u001b[1;32m 441\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Prints the first ``n`` rows to the console.\u001b[39;00m\n\u001b[1;32m 442\u001b[0m \n\u001b[1;32m 443\u001b[0m \u001b[38;5;124;03m.. versionadded:: 1.3.0\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 481\u001b[0m \u001b[38;5;124;03m name | Bob\u001b[39;00m\n\u001b[1;32m 482\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 483\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(truncate, \u001b[38;5;28mbool\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m truncate:\n\u001b[0;32m--> 484\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshowString\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m20\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvertical\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 485\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jdf\u001b[38;5;241m.\u001b[39mshowString(n, \u001b[38;5;28mint\u001b[39m(truncate), vertical))\n", - "File \u001b[0;32m/usr/lib/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py:1304\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1298\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1299\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1300\u001b[0m args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1301\u001b[0m proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[1;32m 1303\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client\u001b[38;5;241m.\u001b[39msend_command(command)\n\u001b[0;32m-> 1304\u001b[0m return_value \u001b[38;5;241m=\u001b[39m \u001b[43mget_return_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1305\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgateway_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1307\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n\u001b[1;32m 1308\u001b[0m temp_arg\u001b[38;5;241m.\u001b[39m_detach()\n", - "File \u001b[0;32m/usr/lib/spark/python/pyspark/sql/utils.py:111\u001b[0m, in \u001b[0;36mcapture_sql_exception..deco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 109\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdeco\u001b[39m(\u001b[38;5;241m*\u001b[39ma, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkw):\n\u001b[1;32m 110\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 111\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43ma\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkw\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m py4j\u001b[38;5;241m.\u001b[39mprotocol\u001b[38;5;241m.\u001b[39mPy4JJavaError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 113\u001b[0m converted \u001b[38;5;241m=\u001b[39m convert_exception(e\u001b[38;5;241m.\u001b[39mjava_exception)\n", - "File \u001b[0;32m/usr/lib/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py:326\u001b[0m, in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 324\u001b[0m value \u001b[38;5;241m=\u001b[39m OUTPUT_CONVERTER[\u001b[38;5;28mtype\u001b[39m](answer[\u001b[38;5;241m2\u001b[39m:], gateway_client)\n\u001b[1;32m 325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m answer[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m==\u001b[39m REFERENCE_TYPE:\n\u001b[0;32m--> 326\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m Py4JJavaError(\n\u001b[1;32m 327\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAn error occurred while calling \u001b[39m\u001b[38;5;132;01m{0}\u001b[39;00m\u001b[38;5;132;01m{1}\u001b[39;00m\u001b[38;5;132;01m{2}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39m\n\u001b[1;32m 328\u001b[0m \u001b[38;5;28mformat\u001b[39m(target_id, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m, name), value)\n\u001b[1;32m 329\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 330\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m Py4JError(\n\u001b[1;32m 331\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAn error occurred while calling \u001b[39m\u001b[38;5;132;01m{0}\u001b[39;00m\u001b[38;5;132;01m{1}\u001b[39;00m\u001b[38;5;132;01m{2}\u001b[39;00m\u001b[38;5;124m. Trace:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{3}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39m\n\u001b[1;32m 332\u001b[0m \u001b[38;5;28mformat\u001b[39m(target_id, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m, name, value))\n", - "\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling o503.showString.\n: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 79.0 failed 10 times, most recent failure: Lost task 0.9 in stage 79.0 (TID 4845) (hub-msca-bdp-dphub-students-harshpachisia-w-1.c.msca-bdp-student-ap.internal executor 19): org.apache.spark.SparkUpgradeException: You may get a different result due to the upgrading of Spark 3.0: Fail to parse '2022-8-1' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.\n\tat org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:150)\n\tat org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:141)\n\tat scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)\n\tat org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.$anonfun$parse$1(TimestampFormatter.scala:86)\n\tat scala.runtime.java8.JFunction0$mcJ$sp.apply(JFunction0$mcJ$sp.java:23)\n\tat scala.Option.getOrElse(Option.scala:189)\n\tat org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.parse(TimestampFormatter.scala:77)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.GetTimestamp_0$(Unknown Source)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_29$(Unknown Source)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)\n\tat org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateResultProjection$5(AggregationIterator.scala:259)\n\tat org.apache.spark.sql.execution.aggregate.TungstenAggregationIterator.next(TungstenAggregationIterator.scala:417)\n\tat org.apache.spark.sql.execution.aggregate.TungstenAggregationIterator.next(TungstenAggregationIterator.scala:82)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:346)\n\tat org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)\n\tat org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:337)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:131)\n\tat org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:505)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:508)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\tat java.lang.Thread.run(Thread.java:750)\nCaused by: java.time.format.DateTimeParseException: Text '2022-8-1' could not be parsed at index 5\n\tat java.time.format.DateTimeFormatter.parseResolved0(DateTimeFormatter.java:1949)\n\tat java.time.format.DateTimeFormatter.parse(DateTimeFormatter.java:1777)\n\tat org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.$anonfun$parse$1(TimestampFormatter.scala:78)\n\t... 23 more\n\nDriver stacktrace:\n\tat org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2304)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2253)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2252)\n\tat scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)\n\tat scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)\n\tat org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2252)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1124)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1124)\n\tat scala.Option.foreach(Option.scala:407)\n\tat org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1124)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2491)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2433)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2422)\n\tat org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)\n\tat org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:902)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2204)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2225)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2244)\n\tat org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:472)\n\tat org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:425)\n\tat org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)\n\tat org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:278)\n\tat org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3709)\n\tat org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2735)\n\tat org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3700)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)\n\tat org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)\n\tat org.apache.spark.sql.Dataset.withAction(Dataset.scala:3698)\n\tat org.apache.spark.sql.Dataset.head(Dataset.scala:2735)\n\tat org.apache.spark.sql.Dataset.take(Dataset.scala:2942)\n\tat org.apache.spark.sql.Dataset.getRows(Dataset.scala:302)\n\tat org.apache.spark.sql.Dataset.showString(Dataset.scala:339)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:750)\nCaused by: org.apache.spark.SparkUpgradeException: You may get a different result due to the upgrading of Spark 3.0: Fail to parse '2022-8-1' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.\n\tat org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:150)\n\tat org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkParsedDiff$1.applyOrElse(DateTimeFormatterHelper.scala:141)\n\tat scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)\n\tat org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.$anonfun$parse$1(TimestampFormatter.scala:86)\n\tat scala.runtime.java8.JFunction0$mcJ$sp.apply(JFunction0$mcJ$sp.java:23)\n\tat scala.Option.getOrElse(Option.scala:189)\n\tat org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.parse(TimestampFormatter.scala:77)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.GetTimestamp_0$(Unknown Source)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_29$(Unknown Source)\n\tat org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)\n\tat org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateResultProjection$5(AggregationIterator.scala:259)\n\tat org.apache.spark.sql.execution.aggregate.TungstenAggregationIterator.next(TungstenAggregationIterator.scala:417)\n\tat org.apache.spark.sql.execution.aggregate.TungstenAggregationIterator.next(TungstenAggregationIterator.scala:82)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:346)\n\tat org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)\n\tat org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:337)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:131)\n\tat org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:505)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:508)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\t... 1 more\nCaused by: java.time.format.DateTimeParseException: Text '2022-8-1' could not be parsed at index 5\n\tat java.time.format.DateTimeFormatter.parseResolved0(DateTimeFormatter.java:1949)\n\tat java.time.format.DateTimeFormatter.parse(DateTimeFormatter.java:1777)\n\tat org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.$anonfun$parse$1(TimestampFormatter.scala:78)\n\t... 23 more\n" + "23/11/24 16:34:38 ERROR org.apache.spark.scheduler.TaskSetManager: Task 0 in stage 186.0 failed 10 times; aborting job\n" ] } ], "source": [ - "# FIX THIS AND THEN MERGE\n", - "# Create a new column 'date_of_rides' by combining day, month, and year columns\n", - "pivoted_df_with_date = pivoted_df.withColumn('date_of_rides', F.to_date(\n", - " F.concat_ws('-', F.col('year'), F.col('month'), F.col('day')),\n", - " 'yyyy-MM-dd'\n", - "))\n", + "df_weather = df_weather.withColumn('datetime', F.to_date('datetime', 'yyyy-MM-dd'))\n", "\n", - "# Show the DataFrame with the new date column\n", - "pivoted_df_with_date.show()" + "# Perform the left join with weather data\n", + "merged_df = pivoted_df.join(df_weather, on='datetime', how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "e325a6ba-0463-43db-8275-5708fb3817bc", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df.drop('datetime') #since no longer needed. " ] }, {