better gpkg support #4

Merged: 1 commit, Dec 18, 2023
32 changes: 32 additions & 0 deletions environment-dev.yml
@@ -0,0 +1,32 @@
name: geo2ml-dev
channels:
- conda-forge
- fastchan
- pytorch
- nvidia
dependencies:
- conda-forge::pip
- conda-forge::ipykernel
- fastchan::fastai==2.7.12
- pytorch::pytorch==2.0.1
- pytorch::torchvision
- pytorch::pytorch-cuda=11.7
- pytorch::torchaudio
- conda-forge::geopandas
- conda-forge::rasterio
- conda-forge::openpyxl
- conda-forge::seaborn
- conda-forge::xarray
- conda-forge::rioxarray
- conda-forge::dask
- conda-forge::dask-ml
- conda-forge::ipywidgets
- pip:
- black
- pycocotools
- "git+https://github.com/waspinator/pycococreator.git"
- rasterstats
- scikit-image
- ultralytics
- "git+https://github.com/facebookresearch/detectron2.git"
- nbdev
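
The PR itself does not document an install command; with standard conda tooling these files would typically be consumed as `conda env create -f environment-dev.yml` followed by `conda activate geo2ml-dev` (and likewise `environment.yaml` below creates the `geo2ml` environment). This is the usual conda workflow rather than anything project-specific.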
62 changes: 62 additions & 0 deletions environment.yaml
@@ -0,0 +1,62 @@
name: geo2ml
channels:
- conda-forge
- defaults
dependencies:
- python

# DL

# Geo
- rasterio
- geopandas
- imgaug
- xarray
- rioxarray

# AutoML
- tpot
- xgboost

# DS
- numpy
- scipy
- scikit-learn>=0.22
- scikit-image
- pandas>=1.0.0
- opencv
- pillow>=6.0.0
- matplotlib
- seaborn
- plotly

- tqdm
- numba
- dask
- dask-ml
- joblib

# ipython + notebooks
- ipython
- notebook
- jupyterlab
- ipywidgets

# Testing + linting
- pylint
- pytest
- ruff

# Others
- openpyxl
- cupy
- pkg-config

- pip
- pip:
- black
- albumentations
- pyinstaller
- pycocotools
- "git+git://github.com/waspinator/pycococreator.git"
- rasterstats
103 changes: 72 additions & 31 deletions geo2ml/data/cv.py
@@ -21,6 +21,7 @@
from pycocotools.mask import frPyObjects
from shapely.geometry import MultiPolygon, Polygon
import math
import fiona

# %% ../../nbs/13_data.cv.ipynb 7
def calc_bearing(point1, point2):
@@ -127,25 +128,38 @@ def shp_to_coco(
rotated_bbox: bool = False,
dataset_name: str = None,
):
"Create a COCO style dataset from images in `raster_path` and corresponding polygons in `shp_path`, save annotations to `outpath`"
"""Create a COCO style dataset from images in `raster_path` and corresponding polygons in `shp_path`, save annotations to `outpath`.
`shp_path` can be either geopackage containing layers so that each layer corresponds to an image,
or a directory containing multiple shp or geojson files, each corresponding to an image
"""

coco_dict = {"images": [], "annotations": [], "categories": coco_categories}
if coco_info:
coco_dict["info"] = coco_info
if coco_licenses:
coco_dict["licenses"] = coco_licenses
categories = {c["name"]: c["id"] for c in coco_dict["categories"]}

if os.path.isdir(shp_path):
    vector_tiles = [
        f for f in os.listdir(shp_path) if f.endswith((".shp", ".geojson"))
    ]
    # Map file stems to file names so raster/vector pairs match regardless of listing order
    vector_stems = {v.split(".")[0]: v for v in vector_tiles}
    raster_tiles = [
        f for f in os.listdir(raster_path) if f.split(".")[0] in vector_stems
    ]
elif shp_path.suffix == ".gpkg":
    # Assume shp_path is a geopackage whose layers are named after the images
    layers = fiona.listlayers(shp_path)
    raster_tiles = [f for f in os.listdir(raster_path) if f.split(".")[0] in layers]
ann_id = 1
for i, r in tqdm(enumerate(raster_tiles)):
tile_anns = []
if os.path.isdir(shp_path):
    gdf = gpd.read_file(shp_path / vector_stems[r.split(".")[0]])
elif shp_path.suffix == ".gpkg":
    gdf = gpd.read_file(shp_path, layer=r.split(".")[0])
tfmd_gdf = gdf_to_px(gdf, raster_path / r, precision=3)
for row in tfmd_gdf.itertuples():
category_id = categories[getattr(row, label_col)]
@@ -168,7 +182,7 @@ def shp_to_coco(
json.dump(coco_dict, f)
return
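
To illustrate the new geopackage mode, a hypothetical call sketch follows; the keyword names are taken from the function body above, while the paths, layer naming, and category list are illustrative assumptions rather than part of the PR:

# Hypothetical usage sketch -- paths, layer naming, and categories are assumptions
from pathlib import Path
from geo2ml.data.cv import shp_to_coco

shp_to_coco(
    raster_path=Path("tiles/images"),         # e.g. tile_001.tif, tile_002.tif, ...
    shp_path=Path("tiles/annotations.gpkg"),  # layers named tile_001, tile_002, ...
    label_col="label",
    coco_categories=[{"id": 1, "name": "tree", "supercategory": "object"}],
    outpath=Path("coco_dataset"),
    dataset_name="trees",
)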

# %% ../../nbs/13_data.cv.ipynb 19
def coco_to_shp(
coco_data: Path | str, outpath: Path, raster_path: Path, downsample_factor: int = 1
):
@@ -279,7 +293,7 @@ def coco_to_shp(
tfmd_gdf.to_file(outpath / f'{i["file_name"][:-4]}.geojson', driver="GeoJSON")
return

# %% ../../nbs/13_data.cv.ipynb 23
def shp_to_coco_results(
prediction_path: Path,
raster_path: Path,
@@ -288,19 +302,29 @@ def shp_to_coco_results(
label_col: str = "label_id",
rotated_bbox: bool = False,
):
"Convert vector predictions into coco result format to be fed into COCO evaluator"
"""Convert vector predictions into coco result format to be fed into COCO evaluator

`prediction_path` can be either geopackage containing layers so that each layer corresponds to an image,
or a directory containing multiple shp or geojson files, each corresponding to an image
"""

with open(coco_dict) as f:
coco_dict = json.load(f)

if os.path.isdir(prediction_path):
    vector_tiles = [
        f for f in os.listdir(prediction_path) if f.endswith((".shp", ".geojson"))
    ]
    # Map file stems to file names so raster/vector pairs match regardless of listing order
    vector_stems = {v.split(".")[0]: v for v in vector_tiles}
    raster_tiles = [
        f for f in os.listdir(raster_path) if f.split(".")[0] in vector_stems
    ]
elif prediction_path.suffix == ".gpkg":
    # Assume prediction_path is a geopackage whose layers are named after the images
    layers = fiona.listlayers(prediction_path)
    raster_tiles = [f for f in os.listdir(raster_path) if f.split(".")[0] in layers]
results = []
for i in tqdm(range_of(raster_tiles)):
for im_id, im in enumerate(coco_dict["images"]):
@@ -309,7 +333,10 @@
image_id = coco_dict["images"][im_id]["id"]
h = coco_dict["images"][im_id]["height"]
w = coco_dict["images"][im_id]["width"]
gdf = gpd.read_file(f"{prediction_path}/{vector_tiles[i]}")
if os.path.isdir(prediction_path):
gdf = gpd.read_file(prediction_path / vector_tiles[i])
elif shp_path.suffix == ".gpkg":
gdf = gpd.read_file(prediction_path, layer=layers[i])
tfmd_gdf = gdf_to_px(gdf, raster_path / raster_tiles[i], precision=3)
for row in tfmd_gdf.itertuples():
res = {
@@ -333,7 +360,7 @@
json.dump(results, f)
return
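
Both dataset builders rely on the same convention: GeoPackage layer names must equal the raster file stems. A minimal sketch of that assumption (file and layer names are illustrative):

# Minimal sketch of the layer-naming convention the .gpkg branches assume
import fiona

layers = fiona.listlayers("predictions.gpkg")  # e.g. ["tile_001", "tile_002"]
# The rasters must then be named tile_001.tif, tile_002.tif, ... so that
# f.split(".")[0] matches a layer name for every image to be processed.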

# %% ../../nbs/13_data.cv.ipynb 28
def shp_to_yolo(
raster_path: Path,
shp_path: Path,
@@ -344,18 +371,32 @@
min_bbox_area: int = 0,
dataset_name: str = None,
):
"Convert shapefiles in `shp_path` to YOLO style dataset. Creates a folder `labels` and `dataset_name.yaml` to `outpath`"
vector_tiles = [f for f in os.listdir(shp_path) if f.endswith((".shp", ".geojson"))]
raster_tiles = [
f
for f in os.listdir(raster_path)
if f.split(".")[0] in [v.split(".")[0] for v in vector_tiles]
]
"""Convert shapefiles in `shp_path` to YOLO style dataset. Creates a folder `labels` and `dataset_name.yaml` to `outpath`
`shp_path` can be either geopackage containing layers so that each layer corresponds to an image,
or a directory containing multiple shp or geojson files, each corresponding to a single image.
"""
if os.path.isdir(shp_path):
vector_tiles = [
f for f in os.listdir(shp_path) if f.endswith((".shp", ".geojson"))
]
raster_tiles = [
f
for f in os.listdir(raster_path)
if f.split(".")[0] in [v.split(".")[0] for v in vector_tiles]
]
elif shp_path.suffix == ".gpkg":
layers = fiona.listlayers(
shp_path
) # Assume that shp_path contains a geopackage with layers named after images
raster_tiles = [f for f in os.listdir(raster_path) if f.split(".")[0] in layers]
ann_path = outpath / "labels"
os.makedirs(ann_path, exist_ok=True)
names = {n: i for i, n in enumerate(names)}
for i, r in tqdm(enumerate(raster_tiles)):
if os.path.isdir(shp_path):
    gdf = gpd.read_file(shp_path / vector_stems[r.split(".")[0]])
elif shp_path.suffix == ".gpkg":
    gdf = gpd.read_file(shp_path, layer=r.split(".")[0])
if ann_format == "rotated box":
gdf["geometry"] = gdf.geometry.apply(
lambda row: row.minimum_rotated_rectangle
@@ -407,7 +448,7 @@ def shp_to_yolo(
for n in names.keys():
dest.write(f" {names[n]}: {n}\n")
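
As with `shp_to_coco`, a hypothetical invocation of the YOLO exporter under the same layer-naming assumption (all values are illustrative):

# Hypothetical usage sketch -- paths, class names, and dataset name are assumptions
from pathlib import Path
from geo2ml.data.cv import shp_to_yolo

shp_to_yolo(
    raster_path=Path("tiles/images"),
    shp_path=Path("tiles/annotations.gpkg"),  # or a directory of .shp/.geojson files
    outpath=Path("yolo_dataset"),             # labels/ and <dataset_name>.yaml go here
    names=["tree"],                           # class names, mapped to ids by position
    dataset_name="trees",
)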

# %% ../../nbs/13_data.cv.ipynb 34
def yolo_to_shp(
prediction_path: Path,
raster_path: Path,
68 changes: 47 additions & 21 deletions geo2ml/data/tiling.py
@@ -17,6 +17,7 @@
import rasterio as rio
import rasterio.mask as rio_mask
import rasterio.windows as rio_windows
import fiona
from rasterio.merge import merge as rio_merge
from sklearn.preprocessing import LabelEncoder
from .postproc import *
@@ -108,10 +109,15 @@ def tile_raster(
return

def tile_vector(
self,
path_to_vector: str,
min_area_pct: float = 0.0,
gpkg_layer: str = None,
output_format: str = "geojson",
) -> None:
"""
Tiles a vector data file into smaller tiles. Converts all multipolygons to regular polygons.
`min_area_pct` specifies the minimum area for partial masks to keep; the default value 0.0 keeps all masks.
`output_format` selects between per-cell geojson files and a single `vectors.gpkg` with one layer per cell.
"""
if self.grid is None:
raise Exception(
@@ -123,8 +129,13 @@ def tile_vector(
"`sampling_locations` is .gpkg but no `gpkg_layer` specified"
)

if output_format == "geojson":
    if not os.path.exists(self.vector_path):
        os.makedirs(self.vector_path)
elif output_format == "gpkg":
    outfile = self.outpath / "vectors.gpkg"
else:
    raise Exception("Unknown output format, must be either `geojson` or `gpkg`")

vector = gpd.read_file(path_to_vector, layer=gpkg_layer)

@@ -149,10 +160,12 @@ def tile_vector(
else shapely.geometry.Polygon(row.geometry.exterior),
axis=1,
)

if output_format == "geojson":
    tempvector.to_file(
        f"{self.vector_path}/{row.cell}.geojson", driver="GeoJSON"
    )
elif output_format == "gpkg":
    tempvector.to_file(outfile, layer=row.cell)
return
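
A sketch of the new output mode; the class owning `tile_vector` is outside this diff, so `tiler` below stands in for an instance of it, and the input path and layer name are illustrative:

# Hypothetical usage sketch -- `tiler` is an instance of the (unshown) tiler class
tiler.tile_vector(
    "field_polygons.gpkg",
    gpkg_layer="labels",
    output_format="gpkg",  # writes a single vectors.gpkg with one layer per grid
)                          # cell, instead of a directory of per-cell .geojson files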

def tile_and_rasterize_vector(
@@ -163,7 +176,10 @@ def tile_and_rasterize_vector(
gpkg_layer: str = None,
keep_bg_only: bool = False,
) -> None:
"""Rasterizes vectors based on tiled rasters. Saves label map to `self.outpath`. By default only keeps the patches that contain polygon data, by specifying `keep_bg_only=True` saves also masks for empty patches."""
"""
Rasterizes vectors based on tiled rasters. Saves label map to `self.outpath`.
By default only keeps the patches that contain polygon data, by specifying `keep_bg_only=True` saves also masks for empty patches.
"""

if self.grid is None:
raise Exception(
@@ -210,7 +226,7 @@ def tile_and_rasterize_vector(
dest.write_band(1, burned)
return

# %% ../../nbs/12_data.tiling.ipynb 30
def untile_raster(path_to_targets: str, outfile: str, method: str = "first"):
"""Merge multiple patches from `path_to_targets` into a single raster`"""

@@ -258,17 +274,27 @@ def untile_vector(
non_max_suppression_thresh: float = 0.0,
nms_criterion: str = "score",
):
"Create single shapefile from a directory of predicted .shp or .geojson files"
pred_files = [
f for f in os.listdir(path_to_targets) if f.endswith((".shp", ".geojson"))
]
gdf = None
for p in tqdm(pred_files):
temp_gdf = gpd.read_file(f"{path_to_targets}/{p}")
if gdf is None:
gdf = temp_gdf
else:
gdf = pd.concat((gdf, temp_gdf))
"Create single GIS-filie from a directory of predicted .shp or .geojson files"
if os.path.isdir(path_to_targets): # directory
pred_files = [
f for f in os.listdir(path_to_targets) if f.endswith((".shp", ".geojson"))
]
gdf = None
for p in tqdm(pred_files):
temp_gdf = gpd.read_file(f"{path_to_targets}/{p}")
if gdf is None:
gdf = temp_gdf
else:
gdf = pd.concat((gdf, temp_gdf))
elif path_to_targets.endswith("gpkg"): # geopackage
layers = fiona.listlayers(path_to_targets)
gdf = None
for l in tqdm(layers):
temp_gdf = gpd.read_file(path_to_targets, layer=l)
if gdf is None:
gdf = temp_gdf
else:
gdf = pd.concat((gdf, temp_gdf))
print(f"{len(gdf)} polygons before non-max suppression")
if non_max_suppression_thresh != 0:
np_bounding_boxes = np.array([b.bounds for b in gdf.geometry])
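
Finally, a hypothetical sketch of merging tiled predictions back out of a geopackage; the remaining parameters of `untile_vector` (such as the output path) fall outside the visible hunk and are elided here:

# Hypothetical usage sketch -- remaining required parameters are elided
untile_vector(
    "tiles/predictions.gpkg",        # per-tile layers are concatenated, then
    non_max_suppression_thresh=0.7,  # overlapping detections are pruned by NMS
    nms_criterion="score",
)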