From aab7d4d204f97b89193319c66b69ddd0b5d11a67 Mon Sep 17 00:00:00 2001 From: Joseph Xu Date: Thu, 2 Feb 2023 23:50:49 -0800 Subject: [PATCH] Add functions to automatically search for images covering buildings. PiperOrigin-RevId: 506823539 --- src/generate_examples_main.py | 28 ++++-- src/skai/earth_engine.py | 172 +++++++++++++++++++++++++++++++++- src/skai/generate_examples.py | 10 +- 3 files changed, 198 insertions(+), 12 deletions(-) diff --git a/src/generate_examples_main.py b/src/generate_examples_main.py index 8cf8f87b..34ff8f7f 100644 --- a/src/generate_examples_main.py +++ b/src/generate_examples_main.py @@ -45,6 +45,7 @@ from absl import logging import shapely.geometry from skai import buildings +from skai import earth_engine from skai import generate_examples from skai import read_raster import tensorflow as tf @@ -200,12 +201,6 @@ def main(args): ) gdal_env = generate_examples.parse_gdal_env(config.gdal_env) - if config.before_image_patterns: - before_image_patterns = config.before_image_patterns - elif config.before_image_config: - before_image_patterns = _read_image_config(config.before_image_config) - else: - before_image_patterns = [] if config.after_image_patterns: after_image_patterns = config.after_image_patterns @@ -248,6 +243,27 @@ def main(args): else: labeled_coordinates = [] + if config.before_image_patterns: + before_image_patterns = config.before_image_patterns + elif config.before_image_config: + before_image_patterns = _read_image_config(config.before_image_config) + elif config.auto_select_before_images: + # TODO(jzxu): Remove potential multiple Earth Engine initialization calls. + earth_engine.initialize(config.earth_engine_service_account, + config.earth_engine_private_key) + image_ids = earth_engine.find_covering_images( + building_centroids, + config.before_image_collection_id, + config.before_image_project_ids, + config.before_image_start_date, + config.before_image_end_date, + config.max_before_images, + ) + before_image_patterns = [f'EEDAI:{i}' for i in image_ids] + logging.info('Chosen before images: %s', before_image_patterns) + else: + before_image_patterns = [] + generate_examples.generate_examples_pipeline( before_image_patterns, after_image_patterns, diff --git a/src/skai/earth_engine.py b/src/skai/earth_engine.py index 43fbc6ab..e4212c6a 100644 --- a/src/skai/earth_engine.py +++ b/src/skai/earth_engine.py @@ -22,12 +22,18 @@ from absl import logging import ee # pytype: disable=import-error # mapping-is-not-sequence import geopandas as gpd +import numpy as np import pandas as pd import shapely.geometry import tensorflow as tf ShapelyGeometry = shapely.geometry.base.BaseGeometry +# When auto-selecting before images, this is the maximum number of images to +# choose from. If there are more images available in the date range and AOI +# specified, only the most recent images will be considered. +_MAX_BEFORE_IMAGE_POOL = 100 + def _shapely_to_ee_feature(shapely_geometry: ShapelyGeometry) -> ee.Feature: """Converts shapely geometry into Earth Engine Feature.""" @@ -43,6 +49,24 @@ def _get_open_building_feature_centroid(feature: ee.Feature) -> ee.Feature: def _download_feature_collection( + collection: ee.FeatureCollection) -> gpd.GeoDataFrame: + """Downloads a FeatureCollection from Earth Engine as a GeoDataFrame. + + Args: + collection: EE FeatureCollection to download. + + Returns: + FeatureCollection data as a GeoDataFrame. + """ + url = collection.getDownloadURL('CSV') + with urllib.request.urlopen(url) as url_file: + df = pd.read_csv(url_file) + geometry = df['.geo'].apply(json.loads).apply(shapely.geometry.shape) + properties = df.drop(columns=['.geo']) + return gpd.GeoDataFrame(properties, geometry=geometry, crs=4326) + + +def _download_feature_collection_to_file( collection: ee.FeatureCollection, properties: List[str], output_path: str) -> gpd.GeoDataFrame: """Downloads a FeatureCollection from Earth Engine as a GeoDataFrame. @@ -55,7 +79,7 @@ def _download_feature_collection( Returns: FeatureCollection data as a GeoDataFrame. """ - url = collection.getDownloadURL('csv', properties) + url = collection.getDownloadURL('CSV', properties) with urllib.request.urlopen(url) as url_file, tf.io.gfile.GFile( output_path, 'wb') as output: shutil.copyfileobj(url_file, output) @@ -75,7 +99,65 @@ def _download_feature_collection( geometry = None properties = None - return gpd.GeoDataFrame(properties, geometry=geometry) + return gpd.GeoDataFrame(properties, geometry=geometry, crs=4326) + + +def _image_to_feature(image_object: ee.ComputedObject) -> ee.Feature: + """Converts an image into a feature. + + The feature's geometry will be an approximate outline of the non-blank pixels + of the image. The feature will have important properties of the image. + + Args: + image_object: The image. + + Returns: + A feature representing the image. + """ + image = ee.Image(image_object) + feature = image.select(0).multiply(0).reduceToVectors(scale=100).first() + feature = feature.set({ + 'image_id': image.get('system:id'), + 'resolution': image.projection().nominalScale(), + 'time': image.get('system:time_start'), + 'provider': image.get('provider_id'), + 'name': image.get('name'), + 'label': None, + 'count': None + }) + return feature + + +def _get_image_collection_gdf( + image_collection: ee.ImageCollection, + max_images: int) -> gpd.GeoDataFrame: + """Returns a GeoDataFrame for the images in a collection. + + Each row of the GeoDataFrame corresponds to one image in the collection. + The row's geometry will be the image's geometry. The GeoDataFrame will have + the following properties: + + image_id - EE id of the image. + resolution - Resolution of the image in meters. + time - Timestamp for when the image was taken. + provider - Image provider code. + name - The image name in EE. + + Args: + image_collection: The image collection. + max_images: Maximum number of rows to return. + + Returns: + GeoDataFrame. + """ + fc = ee.FeatureCollection( + image_collection.limit(max_images, 'system:time_start', False) + .toList(max_images) + .map(_image_to_feature) + ) + gdf = _download_feature_collection(fc) + gdf['time'] = pd.to_datetime(gdf['time'], unit='ms') + return gdf def get_open_buildings(regions: List[ShapelyGeometry], @@ -99,10 +181,13 @@ def get_open_buildings(regions: List[ShapelyGeometry], aoi_buildings = open_buildings.filterBounds(bounds) if as_centroids: centroids = aoi_buildings.map(_get_open_building_feature_centroid) - return _download_feature_collection(centroids, ['longitude', 'latitude'], - output_path) + return _download_feature_collection_to_file( + centroids, ['longitude', 'latitude'], output_path + ) else: - return _download_feature_collection(aoi_buildings, ['.geo'], output_path) + return _download_feature_collection_to_file( + aoi_buildings, ['.geo'], output_path + ) def get_open_buildings_centroids( @@ -149,3 +234,80 @@ def initialize(service_account: str, private_key: Optional[str]) -> bool: logging.error('Error initializing Earth Engine: %s', e) return False return True + + +def find_covering_images( + building_centroids: List[Tuple[float, float]], + image_collection_id: str, + project_ids: List[int], + start_date: str, + end_date: str, + max_images: int) -> List[str]: + """Finds a set of images that best covers a set of points. + + Args: + building_centroids: Buildings to cover. + image_collection_id: EE image collection to search from. + project_ids: Project ids for images. + start_date: Starting date, e.g. 2021-01-01. + end_date: Ending date, e.g. 2023-12-31. + max_images: Maximum number of images to return. + + Returns: + A list of image ids that best covers the points. + """ + points = gpd.GeoDataFrame( + geometry=[shapely.geometry.Point(c) for c in building_centroids], + crs=4326 + ) + aoi = _shapely_to_ee_feature(points.unary_union.convex_hull).geometry() + image_collection = ee.ImageCollection(image_collection_id) + if project_ids: + project_filters = [ + ee.Filter.listContains('project_membership', pid) for pid in project_ids + ] + image_collection = image_collection.filter(ee.Filter.Or(*project_filters)) + + images = image_collection.filterBounds(aoi).filterDate(start_date, end_date) + images_gdf = _get_image_collection_gdf(images, _MAX_BEFORE_IMAGE_POOL) + + # Create a MxN matrix where M is the number of points and N is the number of + # images. The element (i, j) is True if and only if point i intersects image + # j. + images_gdf_for_join = gpd.GeoDataFrame(geometry=images_gdf.geometry, crs=4326) + joined = gpd.sjoin(images_gdf_for_join, points) + intersections = np.zeros((len(points), len(images_gdf)), dtype=bool) + intersections[joined.index_right.values, joined.index.values] = True + + # Use a greedy algorithm to choose images. Always choose the image that covers + # the most number of remaining points. + uncovered_points = np.ones(len(points), dtype=bool) + unused_images = np.arange(len(images_gdf)) + chosen_image_ids = [] + while ( + np.any(uncovered_points) + and len(unused_images) + and len(chosen_image_ids) < max_images + ): + # num_points_covered is a K-length array, where K is the number of unused + # images. num_points_covered[i] contains the number of remaining points + # covered by the ith unused image. + num_points_covered = np.sum( + intersections[np.ix_(uncovered_points, unused_images)], axis=0) + assert len(num_points_covered) == len(unused_images) + best_index = np.argmax(num_points_covered) + if num_points_covered[best_index] == 0: + # Remaining images don't cover any more points, so terminate. + break + best_image = unused_images[best_index] + chosen_image_ids.append(images_gdf.iloc[best_image]['image_id']) + unused_images = np.delete(unused_images, [best_index]) + uncovered_points &= ~intersections[:, best_image] + num_points_left = np.sum(uncovered_points) + print( + f'Chose image {best_image}, ' + f'covers {num_points_covered[best_index]} additional points, ' + f'{num_points_left} points left.' + ) + + return chosen_image_ids diff --git a/src/skai/generate_examples.py b/src/skai/generate_examples.py index 1ecc3457..89a9119d 100644 --- a/src/skai/generate_examples.py +++ b/src/skai/generate_examples.py @@ -97,6 +97,14 @@ class ExamplesGenerationConfig: num_keep_labeled_examples: int = 1000 configuration_path: Optional[str] = None + # Parameters for auto-selecting before images. + auto_select_before_images: bool = False + before_image_collection_id: str = '' + before_image_project_ids: List[int] = dataclasses.field(default_factory=list) + before_image_start_date: str = '' + before_image_end_date: str = '' + max_before_images: int = 100 + # TODO(mohammedelfatihsalah): Add a type for flagvalues argument in init_from_flags. @staticmethod def init_from_flags(flagvalues): @@ -162,7 +170,7 @@ def init_from_json_path(json_path: str): ' will be it %s will be used.' ), field.name, - val + field.default ) return config