From f32409ea26172fe8d37d81f44f91b9463a5d69c9 Mon Sep 17 00:00:00 2001 From: Ridhi Purohit Date: Fri, 17 Nov 2023 22:52:11 +0000 Subject: [PATCH] testing geospatial visualization --- geospatial_eda.ipynb | 380 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 313 insertions(+), 67 deletions(-) diff --git a/geospatial_eda.ipynb b/geospatial_eda.ipynb index 2ef39bb..de0cc75 100644 --- a/geospatial_eda.ipynb +++ b/geospatial_eda.ipynb @@ -38,22 +38,20 @@ " ('spark.eventLog.enabled', 'true'),\n", " ('spark.submit.pyFiles',\n", " '/root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,/root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,/root/.ivy2/jars/com.typesafe_config-1.4.2.jar,/root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,/root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,/root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,/root/.ivy2/jars/com.navigamez_greex-1.0.jar,/root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,/root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,/root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,/root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,/root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,/root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,/root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,/root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,/root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,/root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,/root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,/root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,/root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,/root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,/root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,/root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,/root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,/root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,/root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,/root/.ivy2/jars/com.google.api_gax-2.20.1.jar,/root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,/root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,/root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,/root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,/root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,/root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,/root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,/root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,/root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,/root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,/root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,/root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,/root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,/root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,/root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,/root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,/root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,/root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,/root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,/root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,/root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,/root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,/root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,/root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,/root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,/root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,/root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,/root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", - " ('spark.ui.proxyBase', '/proxy/application_1700203141719_0001'),\n", " ('spark.dataproc.sql.joinConditionReorder.enabled', 'true'),\n", " ('spark.executor.memory', '5739m'),\n", " ('spark.kryoserializer.buffer.max', '2000M'),\n", - " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',\n", - " 'http://hub-msca-bdp-dphub-students-test-ridhi-m:8088/proxy/application_1700203141719_0001'),\n", " ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),\n", + " ('spark.driver.appUIAddress',\n", + " 'http://hub-msca-bdp-dphub-students-test-ridhi-m.c.msca-bdp-student-ap.internal:35239'),\n", " ('spark.dataproc.sql.local.rank.pushdown.enabled', 'true'),\n", - " ('spark.history.fs.logDirectory',\n", - " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/f39e2775-dd3e-42ba-a318-5662cb036b81/spark-job-history'),\n", " ('spark.driver.maxResultSize', '0'),\n", " ('spark.yarn.unmanagedAM.enabled', 'true'),\n", " ('spark.sql.autoBroadcastJoinThreshold', '43m'),\n", + " ('spark.history.fs.logDirectory',\n", + " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/479dade3-720d-4118-bb28-da417fb33080/spark-job-history'),\n", " ('spark.ui.filters',\n", " 'org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter'),\n", - " ('spark.app.startTime', '1700203413492'),\n", " ('spark.metrics.namespace',\n", " 'app_name:${spark.app.name}.app_id:${spark.app.id}'),\n", " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',\n", @@ -63,6 +61,7 @@ " ('spark.executor.id', 'driver'),\n", " ('spark.driver.host',\n", " 'hub-msca-bdp-dphub-students-test-ridhi-m.c.msca-bdp-student-ap.internal'),\n", + " ('spark.ui.proxyBase', '/proxy/application_1700243096505_0004'),\n", " ('spark.app.name', 'PySparkShell'),\n", " ('spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version', '2'),\n", " ('spark.dynamicAllocation.maxExecutors', '10000'),\n", @@ -81,25 +80,26 @@ " ('spark.yarn.dist.jars',\n", " 'file:///root/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-4.4.0.jar,file:///root/.ivy2/jars/graphframes_graphframes-0.8.2-spark3.1-s_2.12.jar,file:///root/.ivy2/jars/com.typesafe_config-1.4.2.jar,file:///root/.ivy2/jars/org.rocksdb_rocksdbjni-6.29.5.jar,file:///root/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.11.828.jar,file:///root/.ivy2/jars/com.github.universal-automata_liblevenshtein-3.0.0.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-storage-2.16.0.jar,file:///root/.ivy2/jars/com.navigamez_greex-1.0.jar,file:///root/.ivy2/jars/com.johnsnowlabs.nlp_tensorflow-cpu_2.12-0.4.4.jar,file:///root/.ivy2/jars/it.unimi.dsi_fastutil-7.0.12.jar,file:///root/.ivy2/jars/org.projectlombok_lombok-1.16.8.jar,file:///root/.ivy2/jars/com.google.guava_guava-31.1-jre.jar,file:///root/.ivy2/jars/com.google.guava_failureaccess-1.0.1.jar,file:///root/.ivy2/jars/com.google.guava_listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar,file:///root/.ivy2/jars/com.google.errorprone_error_prone_annotations-2.16.jar,file:///root/.ivy2/jars/com.google.j2objc_j2objc-annotations-1.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-1.42.3.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-contrib-http-util-0.31.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-jackson2-1.42.3.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-gson-1.42.3.jar,file:///root/.ivy2/jars/com.google.api-client_google-api-client-2.1.1.jar,file:///root/.ivy2/jars/commons-codec_commons-codec-1.15.jar,file:///root/.ivy2/jars/com.google.oauth-client_google-oauth-client-1.34.1.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-apache-v2-1.42.3.jar,file:///root/.ivy2/jars/com.google.apis_google-api-services-storage-v1-rev20220705-2.0.0.jar,file:///root/.ivy2/jars/com.google.code.gson_gson-2.10.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-2.9.0.jar,file:///root/.ivy2/jars/com.google.auto.value_auto-value-annotations-1.10.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-http-2.9.0.jar,file:///root/.ivy2/jars/com.google.http-client_google-http-client-appengine-1.42.3.jar,file:///root/.ivy2/jars/com.google.api_gax-httpjson-0.105.1.jar,file:///root/.ivy2/jars/com.google.cloud_google-cloud-core-grpc-2.9.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-core-1.51.0.jar,file:///root/.ivy2/jars/com.google.api_gax-2.20.1.jar,file:///root/.ivy2/jars/com.google.api_gax-grpc-2.20.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-alts-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-grpclb-1.51.0.jar,file:///root/.ivy2/jars/org.conscrypt_conscrypt-openjdk-uber-2.5.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-1.51.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-credentials-1.13.0.jar,file:///root/.ivy2/jars/com.google.auth_google-auth-library-oauth2-http-1.13.0.jar,file:///root/.ivy2/jars/com.google.api_api-common-2.2.2.jar,file:///root/.ivy2/jars/javax.annotation_javax.annotation-api-1.3.2.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-api-0.31.1.jar,file:///root/.ivy2/jars/io.grpc_grpc-context-1.51.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-3.21.10.jar,file:///root/.ivy2/jars/com.google.protobuf_protobuf-java-util-3.21.10.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-common-protos-2.11.0.jar,file:///root/.ivy2/jars/org.threeten_threetenbp-1.6.4.jar,file:///root/.ivy2/jars/com.google.api.grpc_proto-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.google.api.grpc_gapic-google-cloud-storage-v2-2.16.0-alpha.jar,file:///root/.ivy2/jars/com.fasterxml.jackson.core_jackson-core-2.14.1.jar,file:///root/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.2.jar,file:///root/.ivy2/jars/io.grpc_grpc-api-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-auth-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-stub-1.51.0.jar,file:///root/.ivy2/jars/org.checkerframework_checker-qual-3.28.0.jar,file:///root/.ivy2/jars/com.google.api.grpc_grpc-google-iam-v1-1.6.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-protobuf-lite-1.51.0.jar,file:///root/.ivy2/jars/com.google.android_annotations-4.1.1.4.jar,file:///root/.ivy2/jars/org.codehaus.mojo_animal-sniffer-annotations-1.22.jar,file:///root/.ivy2/jars/io.grpc_grpc-netty-shaded-1.51.0.jar,file:///root/.ivy2/jars/io.perfmark_perfmark-api-0.26.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-googleapis-1.51.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-xds-1.51.0.jar,file:///root/.ivy2/jars/io.opencensus_opencensus-proto-0.2.0.jar,file:///root/.ivy2/jars/io.grpc_grpc-services-1.51.0.jar,file:///root/.ivy2/jars/com.google.re2j_re2j-1.6.jar,file:///root/.ivy2/jars/dk.brics.automaton_automaton-1.11-8.jar,file:///root/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),\n", " ('spark.dataproc.sql.parquet.enableFooterCache', 'true'),\n", + " ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',\n", + " 'http://hub-msca-bdp-dphub-students-test-ridhi-m:8088/proxy/application_1700243096505_0004'),\n", " ('spark.sql.warehouse.dir', 'file:/spark-warehouse'),\n", " ('spark.yarn.executor.failuresValidityInterval', '1h'),\n", + " ('spark.eventLog.dir',\n", + " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/479dade3-720d-4118-bb28-da417fb33080/spark-job-history'),\n", " ('spark.dataproc.metrics.listener.metrics.collector.hostname',\n", " 'hub-msca-bdp-dphub-students-test-ridhi-m'),\n", " ('spark.yarn.am.memory', '640m'),\n", - " ('spark.eventLog.dir',\n", - " 'gs://dataproc-temp-us-central1-635155370842-uzamlpgc/f39e2775-dd3e-42ba-a318-5662cb036b81/spark-job-history'),\n", - " ('spark.app.id', 'application_1700203141719_0001'),\n", " ('spark.jars.packages',\n", " 'com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.0,graphframes:graphframes:0.8.2-spark3.1-s_2.12'),\n", " ('spark.executor.instances', '2'),\n", " ('spark.dataproc.listeners',\n", " 'com.google.cloud.spark.performance.DataprocMetricsListener'),\n", " ('spark.serializer.objectStreamReset', '100'),\n", + " ('spark.driver.port', '45143'),\n", " ('spark.submit.deployMode', 'client'),\n", + " ('spark.app.id', 'application_1700243096505_0004'),\n", " ('spark.sql.cbo.joinReorder.enabled', 'true'),\n", - " ('spark.driver.port', '35287'),\n", - " ('spark.driver.appUIAddress',\n", - " 'http://hub-msca-bdp-dphub-students-test-ridhi-m.c.msca-bdp-student-ap.internal:33815'),\n", + " ('spark.app.startTime', '1700261333050'),\n", " ('spark.shuffle.service.enabled', 'true'),\n", " ('spark.scheduler.mode', 'FAIR'),\n", " ('spark.sql.adaptive.enabled', 'true'),\n", @@ -137,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "e267adf2-b7a9-44b6-93ac-cf06344f07be", "metadata": {}, "outputs": [], @@ -145,6 +145,45 @@ "bq_client = bigquery.Client(project='msca-bdp-student-ap')\n" ] }, + { + "cell_type": "code", + "execution_count": 8, + "id": "4136539d-0a5b-4acc-97ba-6d0d43a5d5d2", + "metadata": {}, + "outputs": [ + { + "ename": "AnalysisException", + "evalue": "Undefined function: 'ST_Contains'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 0", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAnalysisException\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[8], line 7\u001b[0m\n\u001b[1;32m 4\u001b[0m community_areas \u001b[38;5;241m=\u001b[39m spark\u001b[38;5;241m.\u001b[39mread\u001b[38;5;241m.\u001b[39mformat(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbigquery\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39moption(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtable\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mchicago_rideshare.community_areas\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mload()\n\u001b[1;32m 5\u001b[0m ride_locations \u001b[38;5;241m=\u001b[39m spark\u001b[38;5;241m.\u001b[39mread\u001b[38;5;241m.\u001b[39mformat(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbigquery\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39moption(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtable\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mchicago_rideshare.geo_rides_2018\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mload()\n\u001b[0;32m----> 7\u001b[0m result_df \u001b[38;5;241m=\u001b[39m \u001b[43mcommunity_areas\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mride_locations\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mexpr\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mST_Contains(ST_GeomFromWKT(community_areas.geometry), ST_GeomFromWKT(ride_locations.pickup_geo))\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mgroupBy(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcommunity_areas.community\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39magg(count(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m*\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39malias(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_rides\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 12\u001b[0m result_df \u001b[38;5;241m=\u001b[39m result_df\u001b[38;5;241m.\u001b[39morderBy(desc(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_rides\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 14\u001b[0m result_df\u001b[38;5;241m.\u001b[39mshow()\n", + "File \u001b[0;32m/usr/lib/spark/python/pyspark/sql/dataframe.py:1339\u001b[0m, in \u001b[0;36mDataFrame.join\u001b[0;34m(self, other, on, how)\u001b[0m\n\u001b[1;32m 1337\u001b[0m on \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jseq([])\n\u001b[1;32m 1338\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(how, \u001b[38;5;28mstr\u001b[39m), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhow should be a string\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1339\u001b[0m jdf \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43mother\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mon\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhow\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1340\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DataFrame(jdf, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msql_ctx)\n", + "File \u001b[0;32m/usr/lib/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py:1304\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1298\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1299\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1300\u001b[0m args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1301\u001b[0m proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[1;32m 1303\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client\u001b[38;5;241m.\u001b[39msend_command(command)\n\u001b[0;32m-> 1304\u001b[0m return_value \u001b[38;5;241m=\u001b[39m \u001b[43mget_return_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1305\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgateway_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1307\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n\u001b[1;32m 1308\u001b[0m temp_arg\u001b[38;5;241m.\u001b[39m_detach()\n", + "File \u001b[0;32m/usr/lib/spark/python/pyspark/sql/utils.py:117\u001b[0m, in \u001b[0;36mcapture_sql_exception..deco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 113\u001b[0m converted \u001b[38;5;241m=\u001b[39m convert_exception(e\u001b[38;5;241m.\u001b[39mjava_exception)\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(converted, UnknownException):\n\u001b[1;32m 115\u001b[0m \u001b[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;66;03m# JVM exception message.\u001b[39;00m\n\u001b[0;32m--> 117\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m converted \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 119\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n", + "\u001b[0;31mAnalysisException\u001b[0m: Undefined function: 'ST_Contains'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 0" + ] + } + ], + "source": [ + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import count, desc, expr\n", + "\n", + "community_areas = spark.read.format(\"bigquery\").option(\"table\", \"chicago_rideshare.community_areas\").load()\n", + "ride_locations = spark.read.format(\"bigquery\").option(\"table\", \"chicago_rideshare.geo_rides_2018\").load()\n", + "\n", + "result_df = community_areas.join(\n", + " ride_locations,\n", + " expr(\"ST_Contains(ST_GeomFromWKT(community_areas.geometry), ST_GeomFromWKT(ride_locations.pickup_geo))\")\n", + ").groupBy(\"community_areas.community\").agg(count(\"*\").alias(\"num_rides\"))\n", + "\n", + "result_df = result_df.orderBy(desc(\"num_rides\"))\n", + "\n", + "result_df.show()\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": 6, @@ -212,14 +251,14 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "id": "93b5dd39-66c4-46a6-b61b-323186c13e68", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "04676ab6377b444d8bf4a894aafdb4a5", + "model_id": "e578fe2aa17e43d9b2e8317cc6746f9f", "version_major": 2, "version_minor": 0 }, @@ -233,7 +272,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b9456b629eea486eaea0bc273b857609", + "model_id": "ca5da0ef5d274f01ab2c7937d47b3941", "version_major": 2, "version_minor": 0 }, @@ -252,7 +291,7 @@ " count(*) as num_rides,\n", " community_areas.community\n", " FROM `chicago_rideshare.community_areas` as community_areas\n", - " INNER JOIN `chicago_rideshare.geo_program_area_2020` as ride_locations \n", + " INNER JOIN `chicago_rideshare.geo_rides_2018` as ride_locations \n", " ON (ST_CONTAINS(community_areas.geometry, ride_locations.pickup_geo))\n", " GROUP BY community_areas.community\n", " ORDER BY num_rides desc" @@ -260,7 +299,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 5, "id": "2efd32f7-498d-408d-bfb5-da3fde6ff81c", "metadata": {}, "outputs": [ @@ -292,31 +331,133 @@ " \n", " \n", " 0\n", - " 221531\n", - " HYDE PARK\n", + " 2829646\n", + " NEAR NORTH SIDE\n", " \n", " \n", " 1\n", - " 42102\n", - " WOODLAWN\n", + " 1478619\n", + " LOOP\n", " \n", " \n", " 2\n", - " 32421\n", - " KENWOOD\n", + " 1473545\n", + " NEAR WEST SIDE\n", + " \n", + " \n", + " 3\n", + " 1144197\n", + " LAKE VIEW\n", + " \n", + " \n", + " 4\n", + " 1092641\n", + " WEST TOWN\n", + " \n", + " \n", + " 5\n", + " 952820\n", + " LINCOLN PARK\n", + " \n", + " \n", + " 6\n", + " 646320\n", + " LOGAN SQUARE\n", + " \n", + " \n", + " 7\n", + " 366273\n", + " UPTOWN\n", + " \n", + " \n", + " 8\n", + " 322491\n", + " NEAR SOUTH SIDE\n", + " \n", + " \n", + " 9\n", + " 298187\n", + " OHARE\n", + " \n", + " \n", + " 10\n", + " 237934\n", + " HYDE PARK\n", + " \n", + " \n", + " 11\n", + " 232537\n", + " EDGEWATER\n", + " \n", + " \n", + " 12\n", + " 209190\n", + " NORTH CENTER\n", + " \n", + " \n", + " 13\n", + " 205838\n", + " LOWER WEST SIDE\n", + " \n", + " \n", + " 14\n", + " 187756\n", + " AVONDALE\n", + " \n", + " \n", + " 15\n", + " 174799\n", + " LINCOLN SQUARE\n", + " \n", + " \n", + " 16\n", + " 161243\n", + " IRVING PARK\n", + " \n", + " \n", + " 17\n", + " 158621\n", + " ROGERS PARK\n", + " \n", + " \n", + " 18\n", + " 148353\n", + " GARFIELD RIDGE\n", + " \n", + " \n", + " 19\n", + " 135942\n", + " AUSTIN\n", " \n", " \n", "\n", "" ], "text/plain": [ - " num_rides community\n", - "0 221531 HYDE PARK\n", - "1 42102 WOODLAWN\n", - "2 32421 KENWOOD" + " num_rides community\n", + "0 2829646 NEAR NORTH SIDE\n", + "1 1478619 LOOP\n", + "2 1473545 NEAR WEST SIDE\n", + "3 1144197 LAKE VIEW\n", + "4 1092641 WEST TOWN\n", + "5 952820 LINCOLN PARK\n", + "6 646320 LOGAN SQUARE\n", + "7 366273 UPTOWN\n", + "8 322491 NEAR SOUTH SIDE\n", + "9 298187 OHARE\n", + "10 237934 HYDE PARK\n", + "11 232537 EDGEWATER\n", + "12 209190 NORTH CENTER\n", + "13 205838 LOWER WEST SIDE\n", + "14 187756 AVONDALE\n", + "15 174799 LINCOLN SQUARE\n", + "16 161243 IRVING PARK\n", + "17 158621 ROGERS PARK\n", + "18 148353 GARFIELD RIDGE\n", + "19 135942 AUSTIN" ] }, - "execution_count": 12, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -325,6 +466,71 @@ "df.head(20)" ] }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c200551b-4931-4c1d-aa84-a6630247683f", + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'DataFrame' object has no attribute 'write'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipykernel_22958/773907677.py\u001b[0m in \u001b[0;36m?\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"bigquery\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'overwrite'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0moption\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"table\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"chicago_rideshare.temp2\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/miniconda3/lib/python3.8/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 5461\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5462\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5463\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5464\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5465\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'write'" + ] + } + ], + "source": [ + "df.write.format(\"bigquery\")\\\n", + " .mode('overwrite')\\\n", + " .option(\"table\",\"chicago_rideshare.temp2\")\\\n", + " .save()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4b00739a-8090-4672-9ff6-ba14b5a2ac03", + "metadata": {}, + "outputs": [ + { + "ename": "BadRequest", + "evalue": "400 SELECT list expression references community_areas.geometry which is neither grouped nor aggregated at [4:32]\n\nLocation: us-central1\nJob ID: 9f3e2ffd-2d79-4991-af19-9ebec1646310\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mBadRequest\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[7], line 12\u001b[0m\n\u001b[1;32m 1\u001b[0m query \u001b[38;5;241m=\u001b[39m bq_client\u001b[38;5;241m.\u001b[39mquery(\u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124m SELECT \u001b[39m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;124m count(*) as num_rides,\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;124m ORDER BY num_rides desc\u001b[39m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m)\n\u001b[0;32m---> 12\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mquery\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/miniconda3/lib/python3.8/site-packages/google/cloud/bigquery/job/query.py:1580\u001b[0m, in \u001b[0;36mQueryJob.result\u001b[0;34m(self, page_size, max_results, retry, timeout, start_index, job_retry)\u001b[0m\n\u001b[1;32m 1577\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m retry_do_query \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m job_retry \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1578\u001b[0m do_get_result \u001b[38;5;241m=\u001b[39m job_retry(do_get_result)\n\u001b[0;32m-> 1580\u001b[0m \u001b[43mdo_get_result\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1582\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m exceptions\u001b[38;5;241m.\u001b[39mGoogleAPICallError \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 1583\u001b[0m exc\u001b[38;5;241m.\u001b[39mmessage \u001b[38;5;241m=\u001b[39m _EXCEPTION_FOOTER_TEMPLATE\u001b[38;5;241m.\u001b[39mformat(\n\u001b[1;32m 1584\u001b[0m message\u001b[38;5;241m=\u001b[39mexc\u001b[38;5;241m.\u001b[39mmessage, location\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlocation, job_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mjob_id\n\u001b[1;32m 1585\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/miniconda3/lib/python3.8/site-packages/google/api_core/retry.py:349\u001b[0m, in \u001b[0;36mRetry.__call__..retry_wrapped_func\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 345\u001b[0m target \u001b[38;5;241m=\u001b[39m functools\u001b[38;5;241m.\u001b[39mpartial(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 346\u001b[0m sleep_generator \u001b[38;5;241m=\u001b[39m exponential_sleep_generator(\n\u001b[1;32m 347\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_initial, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_maximum, multiplier\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_multiplier\n\u001b[1;32m 348\u001b[0m )\n\u001b[0;32m--> 349\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mretry_target\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 350\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 351\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_predicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 352\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep_generator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 353\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 354\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_error\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 355\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/miniconda3/lib/python3.8/site-packages/google/api_core/retry.py:191\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, **kwargs)\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sleep \u001b[38;5;129;01min\u001b[39;00m sleep_generator:\n\u001b[1;32m 190\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 191\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtarget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 193\u001b[0m \u001b[38;5;66;03m# pylint: disable=broad-except\u001b[39;00m\n\u001b[1;32m 194\u001b[0m \u001b[38;5;66;03m# This function explicitly must deal with broad exceptions.\u001b[39;00m\n\u001b[1;32m 195\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n", + "File \u001b[0;32m/opt/conda/miniconda3/lib/python3.8/site-packages/google/cloud/bigquery/job/query.py:1570\u001b[0m, in \u001b[0;36mQueryJob.result..do_get_result\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1567\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_retry_do_query \u001b[38;5;241m=\u001b[39m retry_do_query\n\u001b[1;32m 1568\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_job_retry \u001b[38;5;241m=\u001b[39m job_retry\n\u001b[0;32m-> 1570\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mQueryJob\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43mretry\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mretry\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1572\u001b[0m \u001b[38;5;66;03m# Since the job could already be \"done\" (e.g. got a finished job\u001b[39;00m\n\u001b[1;32m 1573\u001b[0m \u001b[38;5;66;03m# via client.get_job), the superclass call to done() might not\u001b[39;00m\n\u001b[1;32m 1574\u001b[0m \u001b[38;5;66;03m# set the self._query_results cache.\u001b[39;00m\n\u001b[1;32m 1575\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reload_query_results(retry\u001b[38;5;241m=\u001b[39mretry, timeout\u001b[38;5;241m=\u001b[39mtimeout)\n", + "File \u001b[0;32m/opt/conda/miniconda3/lib/python3.8/site-packages/google/cloud/bigquery/job/base.py:922\u001b[0m, in \u001b[0;36m_AsyncJob.result\u001b[0;34m(self, retry, timeout)\u001b[0m\n\u001b[1;32m 919\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_begin(retry\u001b[38;5;241m=\u001b[39mretry, timeout\u001b[38;5;241m=\u001b[39mtimeout)\n\u001b[1;32m 921\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m {} \u001b[38;5;28;01mif\u001b[39;00m retry \u001b[38;5;129;01mis\u001b[39;00m DEFAULT_RETRY \u001b[38;5;28;01melse\u001b[39;00m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mretry\u001b[39m\u001b[38;5;124m\"\u001b[39m: retry}\n\u001b[0;32m--> 922\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m_AsyncJob\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/miniconda3/lib/python3.8/site-packages/google/api_core/future/polling.py:261\u001b[0m, in \u001b[0;36mPollingFuture.result\u001b[0;34m(self, timeout, retry, polling)\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_blocking_poll(timeout\u001b[38;5;241m=\u001b[39mtimeout, retry\u001b[38;5;241m=\u001b[39mretry, polling\u001b[38;5;241m=\u001b[39mpolling)\n\u001b[1;32m 258\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 259\u001b[0m \u001b[38;5;66;03m# pylint: disable=raising-bad-type\u001b[39;00m\n\u001b[1;32m 260\u001b[0m \u001b[38;5;66;03m# Pylint doesn't recognize that this is valid in this case.\u001b[39;00m\n\u001b[0;32m--> 261\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception\n\u001b[1;32m 263\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_result\n", + "\u001b[0;31mBadRequest\u001b[0m: 400 SELECT list expression references community_areas.geometry which is neither grouped nor aggregated at [4:32]\n\nLocation: us-central1\nJob ID: 9f3e2ffd-2d79-4991-af19-9ebec1646310\n" + ] + } + ], + "source": [ + "query = bq_client.query(\"\"\"\n", + " SELECT \n", + " count(*) as num_rides,\n", + " community_areas.community, community_areas.geometry, ride_locations.pickup_geo\n", + " FROM `chicago_rideshare.community_areas` as community_areas\n", + " INNER JOIN `chicago_rideshare.geo_rides_2018` as ride_locations \n", + " ON (ST_CONTAINS(community_areas.geometry, ride_locations.pickup_geo))\n", + " GROUP BY community_areas.community\n", + " ORDER BY num_rides desc\n", + " \"\"\")\n", + "\n", + "results = query.result()\n" + ] + }, { "cell_type": "code", "execution_count": 21, @@ -362,6 +568,73 @@ " ST_WITHIN(ST_GEOGPOINT(rides.dropoff_lon, rides.dropoff_lat), areas.geometry)\n" ] }, + { + "cell_type": "code", + "execution_count": 16, + "id": "fb07a8d6-08f7-4488-b1f5-29d2e0dcfbbf", + "metadata": {}, + "outputs": [], + "source": [ + "sparkDF=spark.createDataFrame(query.to_dataframe()) \n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "966bc4df-a43d-47d0-8a2f-1b2fefad6cc9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 0:> (0 + 1) / 1]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------+---------------+\n", + "|num_rides| community|\n", + "+---------+---------------+\n", + "| 2829646|NEAR NORTH SIDE|\n", + "| 1478619| LOOP|\n", + "| 1473545| NEAR WEST SIDE|\n", + "| 1144197| LAKE VIEW|\n", + "| 1092641| WEST TOWN|\n", + "| 952820| LINCOLN PARK|\n", + "| 646320| LOGAN SQUARE|\n", + "| 366273| UPTOWN|\n", + "| 322491|NEAR SOUTH SIDE|\n", + "| 298187| OHARE|\n", + "| 237934| HYDE PARK|\n", + "| 232537| EDGEWATER|\n", + "| 209190| NORTH CENTER|\n", + "| 205838|LOWER WEST SIDE|\n", + "| 187756| AVONDALE|\n", + "| 174799| LINCOLN SQUARE|\n", + "| 161243| IRVING PARK|\n", + "| 158621| ROGERS PARK|\n", + "| 148353| GARFIELD RIDGE|\n", + "| 135942| AUSTIN|\n", + "+---------+---------------+\n", + "only showing top 20 rows\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "sparkDF.show()" + ] + }, { "cell_type": "code", "execution_count": 20, @@ -369,52 +642,25 @@ "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] } ], "source": [ - "df2.head(20)" + "sparkDF.write.format(\"bigquery\")\\\n", + " .mode('overwrite')\\\n", + " .option(\"table\",\"chicago_rideshare.temp2\")\\\n", + " .option('temporaryGcsBucket', \"msca-bdp-student-gcs\" )\\\n", + " .save()" ] }, { "cell_type": "code", "execution_count": null, - "id": "c200551b-4931-4c1d-aa84-a6630247683f", + "id": "308f7aa9-0b01-4f01-bff3-c6b4ca19671a", "metadata": {}, "outputs": [], "source": []