From dda6743cd759adccbe33b5431dff317a88e9c8b4 Mon Sep 17 00:00:00 2001 From: ridhi96 Date: Wed, 29 Nov 2023 19:23:18 +0000 Subject: [PATCH] deleted geospatial_eda notebook --- geospatial_eda.ipynb | 768 ------------------------------------------- 1 file changed, 768 deletions(-) delete mode 100644 geospatial_eda.ipynb diff --git a/geospatial_eda.ipynb b/geospatial_eda.ipynb deleted file mode 100644 index 60d544c..0000000 --- a/geospatial_eda.ipynb +++ /dev/null @@ -1,768 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "fb040c78-f159-4850-8946-27e19dc6729f", - "metadata": {}, - "outputs": [], - "source": [ - "from pyspark.sql import SparkSession\n", - "from pyspark.sql import functions as F\n", - "\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "6dd5c56f-f351-4e2c-bdd3-653a48e50868", - "metadata": {}, - "outputs": [], - "source": [ - "from google.cloud import bigquery\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "6a1c7c18-c769-47f7-9ee1-88a17d929624", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('spark.stage.maxConsecutiveAttempts', '10'),\n", - " ('spark.dynamicAllocation.minExecutors', '1'),\n", - " ('spark.eventLog.enabled', 'true'),\n", - " ('spark.submit.pyFiles',\n", - " '/root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,/root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,/root/.ivy2/jars/com.typesafe_config-1.4.2.jar,/root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,/root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,/root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,/root/.ivy2/jars/com.navigamez_greex-1.0.jar,/root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,/root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,/root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,/root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,/root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,/root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,/root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,/root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,/root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,/root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,/root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,/root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,/root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,/root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,/root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,/root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,/root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,/root/.ivy2/jars/com.google.api_gax-2.20.1.jar,/root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,/root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,/root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,/root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,/root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,/root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,/root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,/root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,/root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,/root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,/root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,/root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,/root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,/root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,/root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,/root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,/root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,/root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,/root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,/root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,/root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,/root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", - " ('spark.dataproc.sql.joinConditionReorder.enabled', 'true'),\n", - " ('spark.executor.memory', '5739m'),\n", - " ('spark.kryoserializer.buffer.max', '2000M'),\n", - " ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),\n", - " ('spark.dataproc.sql.local.rank.pushdown.enabled', 'true'),\n", - " ('spark.driver.maxResultSize', '0'),\n", - " ('spark.yarn.unmanagedAM.enabled', 'true'),\n", - " ('spark.sql.autoBroadcastJoinThreshold', '43m'),\n", - " ('spark.history.fs.logDirectory',\n", - " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/479dade3-720d-4118-bb28-da417fb33080/spark-job-history'),\n", - " ('spark.ui.filters',\n", - " 'org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter'),\n", - " ('spark.metrics.namespace',\n", - " 'app_name:${spark.app.name}.app_id:${spark.app.id}'),\n", - " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',\n", - " 'hub-msca-bdp-dphub-students-test-ridhi-m'),\n", - " ('spark.dataproc.sql.optimizer.leftsemijoin.conversion.enabled', 'true'),\n", - " ('spark.hadoop.hive.execution.engine', 'mr'),\n", - " ('spark.executor.id', 'driver'),\n", - " ('spark.driver.host',\n", - " 'hub-msca-bdp-dphub-students-test-ridhi-m.c.msca-bdp-student-ap.internal'),\n", - " ('spark.app.name', 'PySparkShell'),\n", - " ('spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version', '2'),\n", - " ('spark.dynamicAllocation.maxExecutors', '10000'),\n", - " ('spark.yarn.dist.pyFiles',\n", - " 'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", - " ('spark.yarn.am.attemptFailuresValidityInterval', '1h'),\n", - " ('spark.sql.catalogImplementation', 'hive'),\n", - " ('spark.executorEnv.OPENBLAS_NUM_THREADS', '1'),\n", - " ('spark.yarn.secondary.jars',\n", - " 'com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,com.typesafe_config-1.4.2.jar,org.rocksdb_rocksdbjni-6.29.5.jar,com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,com.github.universal-automata_liblevenshtein-3.0.0.jar,com.google.cloud_google-cloud-storage-2.16.0.jar,com.navigamez_greex-1.0.jar,com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,it.unimi.dsi_fastutil-7.0.12.jar,org.projectlombok_lombok-1.16.8.jar,com.google.guava_guava-31.1-jre.jar,com.google.guava_failureaccess-1.0.1.jar,com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,com.google.errorprone_error_prone_annotations-2.16.jar,com.google.j2objc_j2objc-annotations-1.3.jar,com.google.http-client_google-http-client-1.42.3.jar,io.opencensus_opencensus-contrib-http-util-0.31.1.jar,com.google.http-client_google-http-client-jackson2-1.42.3.jar,com.google.http-client_google-http-client-gson-1.42.3.jar,com.google.api-client_google-api-client-2.1.1.jar,commons-codec_commons-codec-1.15.jar,com.google.oauth-client_google-oauth-client-1.34.1.jar,com.google.http-client_google-http-client-apache-v2-1.42.3.jar,com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,com.google.code.gson_gson-2.10.jar,com.google.cloud_google-cloud-core-2.9.0.jar,com.google.auto.value_auto-value-annotations-1.10.1.jar,com.google.cloud_google-cloud-core-http-2.9.0.jar,com.google.http-client_google-http-client-appengine-1.42.3.jar,com.google.api_gax-httpjson-0.105.1.jar,com.google.cloud_google-cloud-core-grpc-2.9.0.jar,io.grpc_grpc-core-1.51.0.jar,com.google.api_gax-2.20.1.jar,com.google.api_gax-grpc-2.20.1.jar,io.grpc_grpc-alts-1.51.0.jar,io.grpc_grpc-grpclb-1.51.0.jar,org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,io.grpc_grpc-protobuf-1.51.0.jar,com.google.auth_google-auth-library-credentials-1.13.0.jar,com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,com.google.api_api-common-2.2.2.jar,javax.annotation_javax.annotation-api-1.3.2.jar,io.opencensus_opencensus-api-0.31.1.jar,io.grpc_grpc-context-1.51.0.jar,com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,com.google.protobuf_protobuf-java-3.21.10.jar,com.google.protobuf_protobuf-java-util-3.21.10.jar,com.google.api.grpc_proto-google-common-protos-2.11.0.jar,org.threeten_threetenbp-1.6.4.jar,com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,com.fasterxml.jackson.core_jackson-core-2.14.1.jar,com.google.code.findbugs_jsr305-3.0.2.jar,io.grpc_grpc-api-1.51.0.jar,io.grpc_grpc-auth-1.51.0.jar,io.grpc_grpc-stub-1.51.0.jar,org.checkerframework_checker-qual-3.28.0.jar,com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,io.grpc_grpc-protobuf-lite-1.51.0.jar,com.google.android_annotations-4.1.1.4.jar,org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,io.grpc_grpc-netty-shaded-1.51.0.jar,io.perfmark_perfmark-api-0.26.0.jar,io.grpc_grpc-googleapis-1.51.0.jar,io.grpc_grpc-xds-1.51.0.jar,io.opencensus_opencensus-proto-0.2.0.jar,io.grpc_grpc-services-1.51.0.jar,com.google.re2j_re2j-1.6.jar,dk.brics.automaton_automaton-1.11-8.jar,org.slf4j_slf4j-api-1.7.16.jar'),\n", - " ('spark.repl.local.jars',\n", - " 'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", - " ('spark.sql.cbo.enabled', 'true'),\n", - " ('spark.executorEnv.PYTHONPATH',\n", - " '/usr/lib/spark/python/lib/py4j-0.10.9-src.zip:/usr/lib/spark/python/:{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip{{PWD}}/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar{{PWD}}/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar{{PWD}}/com.typesafe_config-1.4.2.jar{{PWD}}/org.rocksdb_rocksdbjni-6.29.5.jar{{PWD}}/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar{{PWD}}/com.github.universal-automata_liblevenshtein-3.0.0.jar{{PWD}}/com.google.cloud_google-cloud-storage-2.16.0.jar{{PWD}}/com.navigamez_greex-1.0.jar{{PWD}}/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar{{PWD}}/it.unimi.dsi_fastutil-7.0.12.jar{{PWD}}/org.projectlombok_lombok-1.16.8.jar{{PWD}}/com.google.guava_guava-31.1-jre.jar{{PWD}}/com.google.guava_failureaccess-1.0.1.jar{{PWD}}/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar{{PWD}}/com.google.errorprone_error_prone_annotations-2.16.jar{{PWD}}/com.google.j2objc_j2objc-annotations-1.3.jar{{PWD}}/com.google.http-client_google-http-client-1.42.3.jar{{PWD}}/io.opencensus_opencensus-contrib-http-util-0.31.1.jar{{PWD}}/com.google.http-client_google-http-client-jackson2-1.42.3.jar{{PWD}}/com.google.http-client_google-http-client-gson-1.42.3.jar{{PWD}}/com.google.api-client_google-api-client-2.1.1.jar{{PWD}}/commons-codec_commons-codec-1.15.jar{{PWD}}/com.google.oauth-client_google-oauth-client-1.34.1.jar{{PWD}}/com.google.http-client_google-http-client-apache-v2-1.42.3.jar{{PWD}}/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar{{PWD}}/com.google.code.gson_gson-2.10.jar{{PWD}}/com.google.cloud_google-cloud-core-2.9.0.jar{{PWD}}/com.google.auto.value_auto-value-annotations-1.10.1.jar{{PWD}}/com.google.cloud_google-cloud-core-http-2.9.0.jar{{PWD}}/com.google.http-client_google-http-client-appengine-1.42.3.jar{{PWD}}/com.google.api_gax-httpjson-0.105.1.jar{{PWD}}/com.google.cloud_google-cloud-core-grpc-2.9.0.jar{{PWD}}/io.grpc_grpc-core-1.51.0.jar{{PWD}}/com.google.api_gax-2.20.1.jar{{PWD}}/com.google.api_gax-grpc-2.20.1.jar{{PWD}}/io.grpc_grpc-alts-1.51.0.jar{{PWD}}/io.grpc_grpc-grpclb-1.51.0.jar{{PWD}}/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar{{PWD}}/io.grpc_grpc-protobuf-1.51.0.jar{{PWD}}/com.google.auth_google-auth-library-credentials-1.13.0.jar{{PWD}}/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar{{PWD}}/com.google.api_api-common-2.2.2.jar{{PWD}}/javax.annotation_javax.annotation-api-1.3.2.jar{{PWD}}/io.opencensus_opencensus-api-0.31.1.jar{{PWD}}/io.grpc_grpc-context-1.51.0.jar{{PWD}}/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar{{PWD}}/com.google.protobuf_protobuf-java-3.21.10.jar{{PWD}}/com.google.protobuf_protobuf-java-util-3.21.10.jar{{PWD}}/com.google.api.grpc_proto-google-common-protos-2.11.0.jar{{PWD}}/org.threeten_threetenbp-1.6.4.jar{{PWD}}/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar{{PWD}}/com.fasterxml.jackson.core_jackson-core-2.14.1.jar{{PWD}}/com.google.code.findbugs_jsr305-3.0.2.jar{{PWD}}/io.grpc_grpc-api-1.51.0.jar{{PWD}}/io.grpc_grpc-auth-1.51.0.jar{{PWD}}/io.grpc_grpc-stub-1.51.0.jar{{PWD}}/org.checkerframework_checker-qual-3.28.0.jar{{PWD}}/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar{{PWD}}/io.grpc_grpc-protobuf-lite-1.51.0.jar{{PWD}}/com.google.android_annotations-4.1.1.4.jar{{PWD}}/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar{{PWD}}/io.grpc_grpc-netty-shaded-1.51.0.jar{{PWD}}/io.perfmark_perfmark-api-0.26.0.jar{{PWD}}/io.grpc_grpc-googleapis-1.51.0.jar{{PWD}}/io.grpc_grpc-xds-1.51.0.jar{{PWD}}/io.opencensus_opencensus-proto-0.2.0.jar{{PWD}}/io.grpc_grpc-services-1.51.0.jar{{PWD}}/com.google.re2j_re2j-1.6.jar{{PWD}}/dk.brics.automaton_automaton-1.11-8.jar{{PWD}}/org.slf4j_slf4j-api-1.7.16.jar'),\n", - " ('spark.yarn.dist.jars',\n", - " 'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", - " ('spark.dataproc.sql.parquet.enableFooterCache', 'true'),\n", - " ('spark.sql.warehouse.dir', 'file:/spark-warehouse'),\n", - " ('spark.yarn.executor.failuresValidityInterval', '1h'),\n", - " ('spark.eventLog.dir',\n", - " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/479dade3-720d-4118-bb28-da417fb33080/spark-job-history'),\n", - " ('spark.dataproc.metrics.listener.metrics.collector.hostname',\n", - " 'hub-msca-bdp-dphub-students-test-ridhi-m'),\n", - " ('spark.driver.port', '43057'),\n", - " ('spark.yarn.am.memory', '640m'),\n", - " ('spark.jars.packages',\n", - " 'com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.0,graphframes:graphframes:0.8.2-spark3.1-s_2.12'),\n", - " ('spark.executor.instances', '2'),\n", - " ('spark.dataproc.listeners',\n", - " 'com.google.cloud.spark.performance.DataprocMetricsListener'),\n", - " ('spark.serializer.objectStreamReset', '100'),\n", - " ('spark.app.id', 'application_1700243096505_0005'),\n", - " ('spark.submit.deployMode', 'client'),\n", - " ('spark.app.startTime', '1700261586992'),\n", - " ('spark.sql.cbo.joinReorder.enabled', 'true'),\n", - " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',\n", - " 'http://hub-msca-bdp-dphub-students-test-ridhi-m:8088/proxy/application_1700243096505_0005'),\n", - " ('spark.shuffle.service.enabled', 'true'),\n", - " ('spark.driver.appUIAddress',\n", - " 'http://hub-msca-bdp-dphub-students-test-ridhi-m.c.msca-bdp-student-ap.internal:44547'),\n", - " ('spark.scheduler.mode', 'FAIR'),\n", - " ('spark.sql.adaptive.enabled', 'true'),\n", - " ('spark.yarn.jars', 'local:/usr/lib/spark/jars/*'),\n", - " ('spark.scheduler.minRegisteredResourcesRatio', '0.0'),\n", - " ('spark.executor.cores', '2'),\n", - " ('spark.master', 'yarn'),\n", - " ('spark.ui.port', '0'),\n", - " ('spark.rpc.message.maxSize', '512'),\n", - " ('spark.rdd.compress', 'True'),\n", - " ('spark.task.maxFailures', '10'),\n", - " ('spark.driver.memory', '3840m'),\n", - " ('spark.dynamicAllocation.enabled', 'true'),\n", - " ('spark.yarn.isPython', 'true'),\n", - " ('spark.ui.proxyBase', '/proxy/application_1700243096505_0005'),\n", - " ('spark.yarn.historyServer.address',\n", - " 'hub-msca-bdp-dphub-students-test-ridhi-m:18080'),\n", - " ('spark.ui.showConsoleProgress', 'true')]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark = SparkSession.builder.appName('geoEDA').getOrCreate()\n", - "spark.sparkContext.getConf().getAll()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "e267adf2-b7a9-44b6-93ac-cf06344f07be", - "metadata": {}, - "outputs": [], - "source": [ - "bq_client = bigquery.Client(project='msca-bdp-student-ap')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "4136539d-0a5b-4acc-97ba-6d0d43a5d5d2", - "metadata": {}, - "outputs": [ - { - "ename": "AnalysisException", - "evalue": "Undefined function: 'ST_Contains'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 0", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAnalysisException\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[8], line 7\u001b[0m\n\u001b[1;32m 4\u001b[0m community_areas \u001b[38;5;241m=\u001b[39m spark\u001b[38;5;241m.\u001b[39mread\u001b[38;5;241m.\u001b[39mformat(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbigquery\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39moption(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtable\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mchicago_rideshare.community_areas\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mload()\n\u001b[1;32m 5\u001b[0m ride_locations \u001b[38;5;241m=\u001b[39m spark\u001b[38;5;241m.\u001b[39mread\u001b[38;5;241m.\u001b[39mformat(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbigquery\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39moption(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtable\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mchicago_rideshare.geo_rides_2018\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mload()\n\u001b[0;32m----> 7\u001b[0m result_df \u001b[38;5;241m=\u001b[39m \u001b[43mcommunity_areas\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mride_locations\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mexpr\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mST_Contains(ST_GeomFromWKT(community_areas.geometry), ST_GeomFromWKT(ride_locations.pickup_geo))\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mgroupBy(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcommunity_areas.community\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39magg(count(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m*\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39malias(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_rides\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 12\u001b[0m result_df \u001b[38;5;241m=\u001b[39m result_df\u001b[38;5;241m.\u001b[39morderBy(desc(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_rides\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 14\u001b[0m result_df\u001b[38;5;241m.\u001b[39mshow()\n", - "File \u001b[0;32m/usr/lib/spark/python/pyspark/sql/dataframe.py:1339\u001b[0m, in \u001b[0;36mDataFrame.join\u001b[0;34m(self, other, on, how)\u001b[0m\n\u001b[1;32m 1337\u001b[0m on \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jseq([])\n\u001b[1;32m 1338\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(how, \u001b[38;5;28mstr\u001b[39m), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhow should be a string\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1339\u001b[0m jdf \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43mother\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mon\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhow\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1340\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DataFrame(jdf, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msql_ctx)\n", - "File \u001b[0;32m/usr/lib/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py:1304\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1298\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1299\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1300\u001b[0m args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1301\u001b[0m proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[1;32m 1303\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client\u001b[38;5;241m.\u001b[39msend_command(command)\n\u001b[0;32m-> 1304\u001b[0m return_value \u001b[38;5;241m=\u001b[39m \u001b[43mget_return_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1305\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgateway_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1307\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n\u001b[1;32m 1308\u001b[0m temp_arg\u001b[38;5;241m.\u001b[39m_detach()\n", - "File \u001b[0;32m/usr/lib/spark/python/pyspark/sql/utils.py:117\u001b[0m, in \u001b[0;36mcapture_sql_exception..deco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 113\u001b[0m converted \u001b[38;5;241m=\u001b[39m convert_exception(e\u001b[38;5;241m.\u001b[39mjava_exception)\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(converted, UnknownException):\n\u001b[1;32m 115\u001b[0m \u001b[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;66;03m# JVM exception message.\u001b[39;00m\n\u001b[0;32m--> 117\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m converted \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 119\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n", - "\u001b[0;31mAnalysisException\u001b[0m: Undefined function: 'ST_Contains'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 0" - ] - } - ], - "source": [ - "from pyspark.sql import SparkSession\n", - "from pyspark.sql.functions import count, desc, expr\n", - "\n", - "community_areas = spark.read.format(\"bigquery\").option(\"table\", \"chicago_rideshare.community_areas\").load()\n", - "ride_locations = spark.read.format(\"bigquery\").option(\"table\", \"chicago_rideshare.geo_rides_2018\").load()\n", - "\n", - "result_df = community_areas.join(\n", - " ride_locations,\n", - " expr(\"ST_Contains(ST_GeomFromWKT(community_areas.geometry), ST_GeomFromWKT(ride_locations.pickup_geo))\")\n", - ").groupBy(\"community_areas.community\").agg(count(\"*\").alias(\"num_rides\"))\n", - "\n", - "result_df = result_df.orderBy(desc(\"num_rides\"))\n", - "\n", - "result_df.show()\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "253cad98-b468-41c3-82fb-8014779380f3", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Stage 0:> (0 + 1) / 1]\r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-------------+-------+----------+----------+----------+---------+----+--------------------+---------------+---------------+\n", - "| shape_len|comarea|comarea_id|area_numbe|area_num_1|perimeter|area| geometry| shape_area| community|\n", - "+-------------+-------+----------+----------+----------+---------+----+--------------------+---------------+---------------+\n", - "|31027.0545098| 0| 0| 35| 35| 0| 0|POLYGON((-87.6093...|4.60046211581E7| DOUGLAS|\n", - "|19565.5061533| 0| 0| 36| 36| 0| 0|POLYGON((-87.5921...|1.69139610408E7| OAKLAND|\n", - "|25339.0897503| 0| 0| 37| 37| 0| 0|POLYGON((-87.6288...|1.99167048692E7| FULLER PARK|\n", - "|28196.8371573| 0| 0| 38| 38| 0| 0|POLYGON((-87.6067...|4.84925031554E7|GRAND BOULEVARD|\n", - "|23325.1679062| 0| 0| 39| 39| 0| 0|POLYGON((-87.5923...|2.90717419283E7| KENWOOD|\n", - "+-------------+-------+----------+----------+----------+---------+----+--------------------+---------------+---------------+\n", - "\n", - "root\n", - " |-- shape_len: double (nullable = true)\n", - " |-- comarea: long (nullable = true)\n", - " |-- comarea_id: long (nullable = true)\n", - " |-- area_numbe: long (nullable = true)\n", - " |-- area_num_1: long (nullable = true)\n", - " |-- perimeter: long (nullable = true)\n", - " |-- area: long (nullable = true)\n", - " |-- geometry: string (nullable = true)\n", - " |-- shape_area: double (nullable = true)\n", - " |-- community: string (nullable = true)\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - } - ], - "source": [ - "bucket = \"msca-bdp-student-gcs\"\n", - "spark.conf.set('temporaryGcsBucket', bucket)\n", - "\n", - "# Load data from BigQuery.\n", - "df = spark.read.format('bigquery') \\\n", - " .option('table', 'msca-bdp-student-ap.chicago_rideshare.community_areas') \\\n", - " .load()\n", - "df.createOrReplaceTempView('community')\n", - "\n", - "df2 = spark.sql(\n", - " 'SELECT * FROM community LIMIT 5')\n", - "df2.show()\n", - "df2.printSchema()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "93b5dd39-66c4-46a6-b61b-323186c13e68", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c79654d6ff4d43f0ad26722c526ac6f9", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Query is running: 0%| |" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%%bigquery df\n", - "\n", - "CREATE OR REPLACE TABLE chicago_rideshare.temp2 AS\n", - "\n", - "WITH joined_data AS (\n", - " SELECT \n", - " COUNT(*) AS num_rides,\n", - " community_areas.community,\n", - " community_areas.area_num_1\n", - " FROM `chicago_rideshare.community_areas` AS community_areas\n", - " INNER JOIN `chicago_rideshare.geo_rides_2018` AS ride_locations \n", - " ON ST_CONTAINS(community_areas.geometry, ride_locations.pickup_geo)\n", - " GROUP BY community_areas.community, community_areas.area_num_1\n", - " )\n", - "\n", - "SELECT \n", - " j.num_rides,\n", - " j.community,\n", - " j.area_num_1,\n", - " c.geometry \n", - "FROM joined_data j\n", - " INNER JOIN `chicago_rideshare.community_areas` c\n", - "ON j.community = c.community\n", - " ORDER BY j.num_rides DESC" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "2efd32f7-498d-408d-bfb5-da3fde6ff81c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head(20)" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "c200551b-4931-4c1d-aa84-a6630247683f", - "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "'DataFrame' object has no attribute 'write'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipykernel_20818/773907677.py\u001b[0m in \u001b[0;36m?\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"bigquery\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'overwrite'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0moption\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"table\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"chicago_rideshare.temp2\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/miniconda3/lib/python3.8/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 5461\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5462\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5463\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5464\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5465\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'write'" - ] - } - ], - "source": [ - "df.write.format(\"bigquery\")\\\n", - " .mode('overwrite')\\\n", - " .option(\"table\",\"chicago_rideshare.temp2\")\\\n", - " .save()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "4b00739a-8090-4672-9ff6-ba14b5a2ac03", - "metadata": {}, - "outputs": [], - "source": [ - "query = bq_client.query(\"\"\"\n", - " \n", - " WITH joined_data AS (\n", - " SELECT \n", - " COUNT(*) AS num_rides,\n", - " community_areas.community,\n", - " community_areas.area_num_1\n", - " FROM `chicago_rideshare.community_areas` AS community_areas\n", - " INNER JOIN `chicago_rideshare.geo_rides_2018` AS ride_locations \n", - " ON ST_CONTAINS(community_areas.geometry, ride_locations.pickup_geo)\n", - " GROUP BY community_areas.community, community_areas.area_num_1\n", - " )\n", - " \n", - " SELECT \n", - " j.num_rides,\n", - " j.community,\n", - " j.area_num_1,\n", - " c.geometry \n", - " FROM joined_data j\n", - " INNER JOIN `chicago_rideshare.community_areas` c\n", - " ON j.community = c.community\n", - " ORDER BY j.num_rides DESC\n", - "\"\"\")\n", - "\n", - "\n", - "results = query.result()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "92d6fae4-3e53-4538-9b3b-fb4b0646953a", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used\n" - ] - }, - { - "ename": "AnalysisException", - "evalue": "Undefined function: 'ST_AsGeoJSON'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 0", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAnalysisException\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[19], line 8\u001b[0m\n\u001b[1;32m 4\u001b[0m result_df \u001b[38;5;241m=\u001b[39m spark\u001b[38;5;241m.\u001b[39mcreateDataFrame(query\u001b[38;5;241m.\u001b[39mto_dataframe()) \n\u001b[1;32m 6\u001b[0m community_areas_df \u001b[38;5;241m=\u001b[39m spark\u001b[38;5;241m.\u001b[39mread\u001b[38;5;241m.\u001b[39mformat(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbigquery\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39moption(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtable\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mchicago_rideshare.community_areas\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mload()\n\u001b[0;32m----> 8\u001b[0m community_areas_df \u001b[38;5;241m=\u001b[39m \u001b[43mcommunity_areas_df\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwithColumn\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgeometry_geojson\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43mexpr\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mST_AsGeoJSON(geometry)\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mselect(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcommunity\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgeometry_geojson\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 14\u001b[0m community_areas_df \u001b[38;5;241m=\u001b[39m community_areas_df\u001b[38;5;241m.\u001b[39mselect(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcommunity\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mgeometry\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 16\u001b[0m final_result_df \u001b[38;5;241m=\u001b[39m result_df\u001b[38;5;241m.\u001b[39mjoin(\n\u001b[1;32m 17\u001b[0m community_areas_df,\n\u001b[1;32m 18\u001b[0m on\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcommunity\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 19\u001b[0m how\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minner\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 20\u001b[0m )\u001b[38;5;241m.\u001b[39morderBy(col(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_rides\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mdesc())\n", - "File \u001b[0;32m/usr/lib/spark/python/pyspark/sql/dataframe.py:2455\u001b[0m, in \u001b[0;36mDataFrame.withColumn\u001b[0;34m(self, colName, col)\u001b[0m\n\u001b[1;32m 2425\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 2426\u001b[0m \u001b[38;5;124;03mReturns a new :class:`DataFrame` by adding a column or replacing the\u001b[39;00m\n\u001b[1;32m 2427\u001b[0m \u001b[38;5;124;03mexisting column that has the same name.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2452\u001b[0m \n\u001b[1;32m 2453\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 2454\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(col, Column), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcol should be Column\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 2455\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DataFrame(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwithColumn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcolName\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcol\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jc\u001b[49m\u001b[43m)\u001b[49m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msql_ctx)\n", - "File \u001b[0;32m/usr/lib/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py:1304\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1298\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1299\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1300\u001b[0m args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1301\u001b[0m proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[1;32m 1303\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client\u001b[38;5;241m.\u001b[39msend_command(command)\n\u001b[0;32m-> 1304\u001b[0m return_value \u001b[38;5;241m=\u001b[39m \u001b[43mget_return_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1305\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgateway_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1307\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n\u001b[1;32m 1308\u001b[0m temp_arg\u001b[38;5;241m.\u001b[39m_detach()\n", - "File \u001b[0;32m/usr/lib/spark/python/pyspark/sql/utils.py:117\u001b[0m, in \u001b[0;36mcapture_sql_exception..deco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 113\u001b[0m converted \u001b[38;5;241m=\u001b[39m convert_exception(e\u001b[38;5;241m.\u001b[39mjava_exception)\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(converted, UnknownException):\n\u001b[1;32m 115\u001b[0m \u001b[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;66;03m# JVM exception message.\u001b[39;00m\n\u001b[0;32m--> 117\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m converted \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 119\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n", - "\u001b[0;31mAnalysisException\u001b[0m: Undefined function: 'ST_AsGeoJSON'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 0" - ] - } - ], - "source": [ - "\n", - "from pyspark.sql.functions import col, expr\n", - "\n", - "\n", - "result_df = spark.createDataFrame(query.to_dataframe()) \n", - "\n", - "community_areas_df = spark.read.format(\"bigquery\").option(\"table\", \"chicago_rideshare.community_areas\").load()\n", - "\n", - "community_areas_df = community_areas_df.withColumn(\n", - " \"geometry_geojson\",\n", - " expr(\"ST_AsGeoJSON(geometry)\")\n", - ").select(\"community\", \"geometry_geojson\")\n", - "\n", - "\n", - "community_areas_df = community_areas_df.select('community', 'geometry')\n", - "\n", - "final_result_df = result_df.join(\n", - " community_areas_df,\n", - " on=\"community\",\n", - " how=\"inner\"\n", - ").orderBy(col(\"num_rides\").desc())\n", - "\n", - "\n", - "final_result_df.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "e942162b-a384-4df6-83e7-7a3d05434fb0", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d2b4475441ca4680bf0b5a520c47458f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Query is running: 0%| |" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%%bigquery df2\n", - "\n", - "CREATE OR REPLACE TABLE chicago_rideshare.temp AS\n", - "SELECT\n", - " rides.*,\n", - " areas.community, areas.area_num_1, areas.geometry\n", - "FROM\n", - " chicago_rideshare.geo_program_area_2020 rides\n", - "LEFT JOIN\n", - " chicago_rideshare.community_areas areas\n", - "ON\n", - " ST_WITHIN(ST_GEOGPOINT(rides.pickup_lon, rides.pickup_lat), areas.geometry) OR\n", - " ST_WITHIN(ST_GEOGPOINT(rides.dropoff_lon, rides.dropoff_lat), areas.geometry)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "fb07a8d6-08f7-4488-b1f5-29d2e0dcfbbf", - "metadata": {}, - "outputs": [], - "source": [ - "sparkDF=spark.createDataFrame(query.to_dataframe()) \n" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "70512bb3-8fc5-467f-a56a-48beb9ec0049", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sparkDF.rdd.getNumPartitions()" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "24992ad1-d14d-4759-8c7b-4a10ef24e0b9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Partitions: 2\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "23/11/17 23:33:27 WARN org.apache.spark.scheduler.TaskSetManager: Stage 19 contains a task of very large size (1030 KiB). The maximum recommended task size is 1000 KiB.\n", - "[Stage 19:> (0 + 2) / 2]\r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-----------+-----+\n", - "|partitionId|count|\n", - "+-----------+-----+\n", - "| 0| 38|\n", - "| 1| 39|\n", - "+-----------+-----+\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - } - ], - "source": [ - "def displaypartitions(df):\n", - " #number of records by partition\n", - " num = df.rdd.getNumPartitions()\n", - " print(\"Partitions:\", num)\n", - " df.withColumn(\"partitionId\", F.spark_partition_id())\\\n", - " .groupBy(\"partitionId\")\\\n", - " .count()\\\n", - " .orderBy(F.asc(\"count\"))\\\n", - " .show(num)\n", - "\n", - "sparkDF.rdd.getNumPartitions()\n", - "displaypartitions(sparkDF)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "966bc4df-a43d-47d0-8a2f-1b2fefad6cc9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---------+---------------+----------+--------------------+\n", - "|num_rides| community|area_num_1| geometry|\n", - "+---------+---------------+----------+--------------------+\n", - "| 2829646|NEAR NORTH SIDE| 8|POLYGON((-87.6247...|\n", - "| 1478619| LOOP| 32|POLYGON((-87.6095...|\n", - "| 1473545| NEAR WEST SIDE| 28|POLYGON((-87.6383...|\n", - "| 1144197| LAKE VIEW| 6|POLYGON((-87.6439...|\n", - "| 1092641| WEST TOWN| 24|POLYGON((-87.6569...|\n", - "| 952820| LINCOLN PARK| 7|POLYGON((-87.6319...|\n", - "| 646320| LOGAN SQUARE| 22|POLYGON((-87.6828...|\n", - "| 366273| UPTOWN| 3|POLYGON((-87.6410...|\n", - "| 322491|NEAR SOUTH SIDE| 33|POLYGON((-87.6091...|\n", - "| 298187| OHARE| 76|MULTIPOLYGON(((-8...|\n", - "| 237934| HYDE PARK| 41|POLYGON((-87.5806...|\n", - "| 232537| EDGEWATER| 77|POLYGON((-87.6557...|\n", - "| 209190| NORTH CENTER| 5|POLYGON((-87.6733...|\n", - "| 205838|LOWER WEST SIDE| 31|POLYGON((-87.6360...|\n", - "| 187756| AVONDALE| 21|POLYGON((-87.6879...|\n", - "| 174799| LINCOLN SQUARE| 4|POLYGON((-87.6744...|\n", - "| 161243| IRVING PARK| 16|POLYGON((-87.6952...|\n", - "| 158621| ROGERS PARK| 1|POLYGON((-87.6545...|\n", - "| 148353| GARFIELD RIDGE| 56|POLYGON((-87.7385...|\n", - "| 135942| AUSTIN| 25|POLYGON((-87.7915...|\n", - "+---------+---------------+----------+--------------------+\n", - "only showing top 20 rows\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "23/11/17 23:24:29 WARN org.apache.spark.scheduler.TaskSetManager: Stage 9 contains a task of very large size (1030 KiB). The maximum recommended task size is 1000 KiB.\n" - ] - } - ], - "source": [ - "sparkDF.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "121a5d26-b6d4-4080-a862-a7a0240306fe", - "metadata": {}, - "outputs": [], - "source": [ - "sparkDF=sparkDF.repartition(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "fab2d926-cc5b-4e58-9f78-e950e720d487", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "23/11/17 23:34:10 WARN org.apache.spark.scheduler.TaskSetManager: Stage 29 contains a task of very large size (1030 KiB). The maximum recommended task size is 1000 KiB.\n", - " \r" - ] - } - ], - "source": [ - "sparkDF.write.format(\"bigquery\")\\\n", - " .mode('overwrite')\\\n", - " .option(\"table\",\"chicago_rideshare.temp2\")\\\n", - " .option('temporaryGcsBucket', \"msca-bdp-student-gcs\" )\\\n", - " .save()" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "104fb59b-fbb0-446f-86e0-5fc1081d9358", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "23/11/17 23:34:00 WARN org.apache.spark.scheduler.TaskSetManager: Stage 22 contains a task of very large size (1030 KiB). The maximum recommended task size is 1000 KiB.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Partitions: 10\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "23/11/17 23:34:00 WARN org.apache.spark.scheduler.TaskSetManager: Stage 23 contains a task of very large size (1030 KiB). The maximum recommended task size is 1000 KiB.\n", - "[Stage 28:===============================================> (177 + 3) / 200]\r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-----------+-----+\n", - "|partitionId|count|\n", - "+-----------+-----+\n", - "| 9| 7|\n", - "| 5| 7|\n", - "| 0| 7|\n", - "| 1| 8|\n", - "| 6| 8|\n", - "| 3| 8|\n", - "| 7| 8|\n", - "| 4| 8|\n", - "| 8| 8|\n", - "| 2| 8|\n", - "+-----------+-----+\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - } - ], - "source": [ - "displaypartitions(sparkDF)" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "308f7aa9-0b01-4f01-bff3-c6b4ca19671a", - "metadata": {}, - "outputs": [], - "source": [ - "spark.stop()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7a0d32ea-f2c9-45b5-be57-56e29d4327fd", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "PySpark", - "language": "python", - "name": "pyspark" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.15" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}