diff --git a/doc/release_notes.rst b/doc/release_notes.rst index 65e933960..cf5b93895 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -20,6 +20,14 @@ This part of documentation collects descriptive release notes to capture the mai * Add a list of PyPSA-Earth applications in academic and industrial projects `PR #1255 `__ +* Computational improvements of build_osm_network `PR #845 `__ + +* Boost computational performances of set_lines_ids with cKDTree by scipy `PR #806 `__ + +* Boost computational performances of set_substation_ids using DBSCAN `PR #799 `__ + +* Boost computational performances of fix_overpassing_line `PR #807 `__ + **Minor Changes and bug-fixing** * Added electricity bus to Fischer-Tropsch in prepare_sector_network.py `PR #1226 `__ diff --git a/scripts/build_osm_network.py b/scripts/build_osm_network.py index 867262abc..b4e7e6f4c 100644 --- a/scripts/build_osm_network.py +++ b/scripts/build_osm_network.py @@ -17,8 +17,10 @@ read_osm_config, to_csv_nafix, ) -from shapely.geometry import LineString, Point -from shapely.ops import linemerge, split +from scipy.spatial import cKDTree +from shapely.geometry import LineString, MultiLineString, Point +from shapely.ops import linemerge, nearest_points, split +from sklearn.cluster import DBSCAN from tqdm import tqdm logger = create_logger(__name__) @@ -41,76 +43,32 @@ def line_endings_to_bus_conversion(lines): return lines -# tol in m -def set_substations_ids(buses, distance_crs, tol=2000): - """ - Function to set substations ids to buses, accounting for location - tolerance. - - The algorithm is as follows: - - 1. initialize all substation ids to -1 - 2. if the current substation has been already visited [substation_id < 0], then skip the calculation - 3. otherwise: - 1. identify the substations within the specified tolerance (tol) - 2. when all the substations in tolerance have substation_id < 0, then specify a new substation_id - 3. otherwise, if one of the substation in tolerance has a substation_id >= 0, then set that substation_id to all the others; - in case of multiple substations with substation_ids >= 0, the first value is picked for all +def set_substations_ids(buses, distance_crs, tol=5000): """ + Assigns station IDs to buses based on their proximity. - buses["station_id"] = -1 + Parameters: + - buses: GeoDataFrame object representing the buses data. + - distance_crs: Coordinate reference system (CRS) to convert the geometry to. + - tol: Tolerance distance in chosen CRS to define cluster proximity. - # create temporary series to execute distance calculations using m as reference distances - temp_bus_geom = buses.geometry.to_crs(distance_crs) + Returns: + - None. Modifies the 'station_id' column in the 'buses' GeoDataFrame. - # set tqdm options for substation ids - tqdm_kwargs_substation_ids = dict( - ascii=False, - unit=" buses", - total=buses.shape[0], - desc="Set substation ids ", - ) + Example: + set_substations_ids(buses_data, 'EPSG:3857', tol=5000) + """ - station_id = 0 - for i, row in tqdm(buses.iterrows(), **tqdm_kwargs_substation_ids): - if buses.loc[i, "station_id"] >= 0: - continue + # Convert the geometry to EPSG:3857 + tmp_geometry = buses.geometry.to_crs(distance_crs) - # get substations within tolerance - close_nodes = np.flatnonzero( - temp_bus_geom.distance(temp_bus_geom.loc[i]) <= tol - ) + coords = tmp_geometry.apply(lambda geom: np.array(geom.coords[0])).to_list() - if len(close_nodes) == 1: - # if only one substation is in tolerance, then the substation is the current one iƬ - # Note that the node cannot be with substation_id >= 0, given the preliminary check - # at the beginning of the for loop - buses.loc[buses.index[i], "station_id"] = station_id - # update station id - station_id += 1 - else: - # several substations in tolerance - # get their ids - subset_substation_ids = buses.loc[buses.index[close_nodes], "station_id"] - # check if all substation_ids are negative (<0) - all_neg = subset_substation_ids.max() < 0 - # check if at least a substation_id is negative (<0) - some_neg = subset_substation_ids.min() < 0 - - if all_neg: - # when all substation_ids are negative, then this is a new substation id - # set the current station_id and increment the counter - buses.loc[buses.index[close_nodes], "station_id"] = station_id - station_id += 1 - elif some_neg: - # otherwise, when at least a substation_id is non-negative, then pick the first value - # and set it to all the other substations within tolerance - sub_id = -1 - for substation_id in subset_substation_ids: - if substation_id >= 0: - sub_id = substation_id - break - buses.loc[buses.index[close_nodes], "station_id"] = sub_id + # Perform DBSCAN on the coordinates + db = DBSCAN(eps=tol, min_samples=1).fit(coords) + + # Add the cluster labels to the GeoDataFrame + buses["station_id"] = db.labels_ def set_lines_ids(lines, buses, distance_crs): @@ -118,69 +76,66 @@ def set_lines_ids(lines, buses, distance_crs): Function to set line buses ids to the closest bus in the list. """ # set tqdm options for set lines ids - tqdm_kwargs_line_ids = dict( - ascii=False, - unit=" lines", - total=lines.shape[0], - desc="Set line bus ids ", - ) + lines_d = lines.to_crs(distance_crs) + buses_d = buses.to_crs(distance_crs) # initialization lines["bus0"] = -1 lines["bus1"] = -1 - busesepsg = buses.to_crs(distance_crs) - linesepsg = lines.to_crs(distance_crs) - - for i, row in tqdm(linesepsg.iterrows(), **tqdm_kwargs_line_ids): - # select buses having the voltage level of the current line - buses_sel = busesepsg[ - (buses["voltage"] == row["voltage"]) & (buses["dc"] == row["dc"]) - ] + for key, lines_sel in lines_d.groupby(["voltage", "dc"]): + buses_sel = buses_d.query(f"voltage == {key[0]} and dc == {key[1]}") # find the closest node of the bus0 of the line - bus0_id = buses_sel.geometry.distance(row.geometry.boundary.geoms[0]).idxmin() - lines.loc[i, "bus0"] = buses.loc[bus0_id, "bus_id"] - - # check if the line starts exactly in the node, otherwise modify the linestring - distance_bus0 = busesepsg.geometry.loc[bus0_id].distance( - row.geometry.boundary.geoms[0] + bus0_points = np.array( + list( + lines_sel.geometry.boundary.apply( + lambda x: (x.geoms[0].x, x.geoms[0].y) + ) + ) ) - if distance_bus0 > 0.0: - # the line does not start in the node, thus modify the linestring - lines.loc[i, "geometry"] = linemerge( - [ - LineString( - [ - buses.loc[bus0_id, "geometry"], - lines.loc[i, "geometry"].boundary.geoms[0], - ] - ), - lines.loc[i, "geometry"], - ] + bus1_points = np.array( + list( + lines_sel.geometry.boundary.apply( + lambda x: (x.geoms[1].x, x.geoms[1].y) + ) ) - - # find the closest node of the bus1 of the line - bus1_id = buses_sel.geometry.distance(row.geometry.boundary.geoms[1]).idxmin() - lines.loc[i, "bus1"] = buses.loc[bus1_id, "bus_id"] - - # check if the line ends exactly in the node, otherwise modify the linestring - distance_bus1 = busesepsg.geometry.loc[bus1_id].distance( - row.geometry.boundary.geoms[1] ) - if distance_bus1 > 0.0: - # the line does not end in the node, thus modify the linestring - lines.loc[i, "geometry"] = linemerge( - [ - lines.loc[i, "geometry"], - LineString( - [ - lines.loc[i, "geometry"].boundary.geoms[1], - buses.loc[bus1_id, "geometry"], - ] - ), - ] + points_buses = np.array(list(buses_sel.geometry.apply(lambda x: (x.x, x.y)))) + + btree = cKDTree(points_buses) + dist0, idx0 = btree.query(bus0_points, k=1) # find closest points of bus0 + dist1, idx1 = btree.query(bus1_points, k=1) # find closest points of bus1 + + # set bus0 and bus1 + lines.loc[lines_sel.index, "bus0"] = buses_sel.bus_id.iloc[idx0].values + lines.loc[lines_sel.index, "bus1"] = buses_sel.bus_id.iloc[idx1].values + + # check if the line starts exactly in the bus0, otherwise modify the linestring + bus0_linestring = ( + lines.loc[lines_sel.index] + .apply( + lambda x: LineString([buses.geometry.loc[x["bus0"]], x["bus_0_coors"]]), + axis=1, ) + .set_crs(crs=lines.crs) + ) + bus1_linestring = ( + lines.loc[lines_sel.index] + .apply( + lambda x: LineString([x["bus_1_coors"], buses.geometry.loc[x["bus1"]]]), + axis=1, + ) + .set_crs(crs=lines.crs) + ) + + # update geometry with left and right linestrings to match bus0 and bus1 + lines.loc[lines_sel.index, "geometry"] = ( + lines.loc[lines_sel.index] + .union(bus0_linestring) + .union(bus1_linestring) + .apply(linemerge) + ) return lines, buses @@ -503,14 +458,6 @@ def set_lv_substations(buses): return buses -# Note tolerance = 0.01 means around 700m -# TODO: the current tolerance is high to avoid an issue in the Nigeria case where line 565939360-1 -# seems to be interconnected to both ends, but at the eastern one, the node is actually not connected -# another line seems to be exactly touching the node, but from the data point of view it only fly over it. -# There may be the need to split a line in several segments in the case the line is within tolerance with -# respect to a node - - def merge_stations_lines_by_station_id_and_voltage( lines, buses, geo_crs, distance_crs, tol=2000 ): @@ -520,19 +467,19 @@ def merge_stations_lines_by_station_id_and_voltage( """ logger.info( - "Stage 3a/4: Set substation ids with tolerance of %.2f km" % (tol / 1000) + "Stage 4a/5: Set substation ids with tolerance of %.2f km" % (tol / 1000) ) # set substation ids set_substations_ids(buses, distance_crs, tol=tol) - logger.info("Stage 3b/4: Merge substations with the same id") + logger.info("Stage 4b/5: Merge substations with the same id") # merge buses with same station id and voltage if not buses.empty: buses = merge_stations_same_station_id(buses) - logger.info("Stage 3c/4: Specify the bus ids of the line endings") + logger.info("Stage 4c/5: Specify the bus ids of the line endings") # set the bus ids to the line dataset lines, buses = set_lines_ids(lines, buses, distance_crs) @@ -545,7 +492,7 @@ def merge_stations_lines_by_station_id_and_voltage( # set substation_lv set_lv_substations(buses) - logger.info("Stage 3d/4: Add converters to lines") + logger.info("Stage 4d/5: Add converters to lines") # append fake converters # lines = pd.concat([lines, converters], ignore_index=True) @@ -558,171 +505,131 @@ def merge_stations_lines_by_station_id_and_voltage( return lines, buses -def create_station_at_equal_bus_locations( - lines, buses, geo_crs, distance_crs, tol=2000 -): - # V1. Create station_id at same bus location - # - We saw that buses are not connected exactly at one point, they are - # usually connected to a substation "area" (analysed on maps) - # - Create station_id at exactly the same location might therefore be not - # always correct - # - Though as you can see below, it might be still sometime the case. - # Examples are **station 4** (2 lines with the same voltage connect at the - # same point) and **station 23** (4 lines with two different voltages connect - # at the same point) - # TODO: Filter out the generator lines - defined as going from generator to - # the next station which is connected to a load. Excluding generator - # lines make probably sense because they are not transmission expansion - # relevant. For now we simplify and include generator lines. - - # If same location/geometry make station - bus_all = buses - - # set substation ids - set_substations_ids(buses, distance_crs, tol=tol) - - # set the bus ids to the line dataset - lines, buses = set_lines_ids(lines, buses, distance_crs) - - # update line endings - lines = line_endings_to_bus_conversion(lines) - - # For each station number with multiple buses make lowest voltage `substation_lv = TRUE` - set_lv_substations(bus_all) - - # TRY: Keep only buses that are not duplicated & lv_substation = True - # TODO: Check if this is necessary. What effect do duplicates have? - bus_all = bus_all[bus_all["substation_lv"] == True] - - lines = connect_stations_same_station_id(lines, buses) - - return lines, buses - - -def _split_linestring_by_point(linestring, points): +def fix_overpassing_lines(lines, buses, distance_crs, tol=1): """ - Function to split a linestring geometry by multiple inner points. + Snap buses to lines that are within a certain tolerance. It does this by + first buffering the buses by the tolerance distance, and then performing a + spatial join to find all lines that intersect with the buffers. For each + group of lines that intersect with a buffer, the function identifies the + points that overpass the line (i.e., are not snapped to the line), and then + snaps those points to the nearest point on the line. The line is then split + at each snapped point, resulting in a new set of lines that are snapped to + the buses. The function returns a GeoDataFrame containing the snapped + lines, and the original GeoDataFrame containing the buses. Parameters ---------- - lstring : LineString - Linestring of the line to be split - points : list - List of points to split the linestring - - Return - ------ - list_lines : list - List of linestring to split the line + lines : GeoDataFrame + GeoDataFrame containing the lines + buses : GeoDataFrame + GeoDataFrame containing the buses + distance_crs : str + Coordinate reference system to use for distance calculations + tol : float + Tolerance in meters to snap the buses to the lines + + Returns + ------- + lines : GeoDataFrame + GeoDataFrame containing the lines """ + if lines.empty: + return lines, buses - list_linestrings = [linestring] + df_l = lines.copy() # can use lines directly without copying + # drop all columns except id and geometry for buses + df_p = buses.copy() - for p in points: - # execute split to all lines and store results - temp_list = [split(l, p) for l in list_linestrings] - # nest all geometries - list_linestrings = [lstring for tval in temp_list for lstring in tval.geoms] + line_id_str = "line_id" + bus_id_str = "bus_id" - return list_linestrings + # change crs to distance based + df_l = df_l.to_crs(distance_crs) + df_p = df_p.to_crs(distance_crs) + # set index to bus_id + df_p.set_index(bus_id_str, inplace=True) -def fix_overpassing_lines(lines, buses, distance_crs, tol=1): - """ - Function to avoid buses overpassing lines with no connection when the bus - is within a given tolerance from the line. + # Buffer points to create areas for spatial join + buffer_df = df_p.buffer(tol).to_frame() - Parameters - ---------- - lines : GeoDataFrame - Geodataframe of lines - buses : GeoDataFrame - Geodataframe of substations - tol : float - Tolerance in meters of the distance between the substation and the line - below which the line will be split - """ + # Spatial join to find lines intersecting point buffers + joined = gpd.sjoin(df_l, buffer_df, how="inner", op="intersects") - lines_to_add = [] # list of lines to be added - lines_to_split = [] # list of lines that have been split + # group lines by their index + group_lines = joined.groupby(level=0) - lines_epsgmod = lines.to_crs(distance_crs) - buses_epsgmod = buses.to_crs(distance_crs) + # iterate over the groups, TODO: change to apply + for i, group in group_lines: + line_id = df_l.loc[i, line_id_str] # pick the line id that represents the group + line_geom = df_l.loc[i, "geometry"] - # set tqdm options for substation ids - tqdm_kwargs_substation_ids = dict( - ascii=False, - unit=" lines", - total=lines.shape[0], - desc="Verify lines overpassing nodes ", - ) + # number of points that intersect with the line + num_points = len(group) - for l in tqdm(lines.index, **tqdm_kwargs_substation_ids): - # bus indices being within tolerance from the line - bus_in_tol_epsg = buses_epsgmod[ - buses_epsgmod.geometry.distance(lines_epsgmod.geometry.loc[l]) <= tol - ] + # get the indices of the points that intersect with the line + points_indexes = group["index_right"].tolist() - # exclude endings of the lines - bus_in_tol_epsg = bus_in_tol_epsg[ - ( - ( - bus_in_tol_epsg.geometry.distance( - lines_epsgmod.geometry.loc[l].boundary.geoms[0] - ) - > tol - ) - | ( - bus_in_tol_epsg.geometry.distance( - lines_epsgmod.geometry.loc[l].boundary.geoms[1] - ) - > tol - ) - ) - ] + # get the geometries of the points that intersect with the line + all_points = df_p.loc[points_indexes, "geometry"] - if not bus_in_tol_epsg.empty: - # add index of line to split - lines_to_split.append(l) + # discard points related to the extrema points (the buses) of each line + distance_from_buses = all_points.distance(line_geom.boundary) + overpassing_points = list(all_points[distance_from_buses > tol]) - buses_locs = buses.geometry.loc[bus_in_tol_epsg.index] + # if no overpassing points are identified, skip iteration + if len(overpassing_points) == 0: + continue - # get new line geometries - new_geometries = _split_linestring_by_point(lines.geometry[l], buses_locs) - n_geoms = len(new_geometries) + # find all the nearest points on the line to the points that intersect with the line + nearest_points_list = [ + nearest_points(line_geom, point)[0] for point in overpassing_points + ] - # create temporary copies of the line - df_append = gpd.GeoDataFrame([lines.loc[l]] * n_geoms) - # update geometries - df_append["geometry"] = new_geometries - # update name of the line - df_append["line_id"] = [ - str(df_append["line_id"].iloc[0]) + f"_{id}" for id in range(n_geoms) - ] + # sort the nearest points based on their distance from the start point of the line + nearest_points_list.sort(key=lambda point: line_geom.project(point)) - lines_to_add.append(df_append) + # split the line at each nearest point using the split function + split_line = [line_geom] + for point in nearest_points_list: + # Split the line at the current point + # The split function returns a GeometryCollection, so we need to convert it to a list + split_lines = split(split_line[-1], point) + split_line = split_line[:-1] + list(split_lines.geoms) - if not lines_to_add: - return lines, buses + # convert the split line to a multilinestring + split_line = MultiLineString(split_line) - df_to_add = gpd.GeoDataFrame(pd.concat(lines_to_add, ignore_index=True)) - df_to_add.set_crs(lines.crs, inplace=True) - df_to_add.set_index(lines.index[-1] + df_to_add.index, inplace=True) + # replace the line with the split line in lines df + df_l.loc[i, "geometry"] = split_line - # update length - df_to_add["length"] = df_to_add.to_crs(distance_crs).geometry.length + # explode the multilinestrings (not recommended, but included for completion) + # exploding the df should be done at the last step + # if an operation requires separate lines, it should be done using df.explode().apply(your_function) + # which is a lot more memory efficient + df_l = df_l.explode(index_parts=True).reset_index() - # update line endings - df_to_add = line_endings_to_bus_conversion(df_to_add) + # revise line_id to account for part index + df_l[line_id_str] = ( + df_l[line_id_str].astype(str) + "_" + df_l["level_1"].astype(str) + ) + df_l.drop(columns=["level_0", "level_1"], inplace=True) + + # update line endings (included for completion, the scope of the function should be limited to fixing overpassing lines) + # commented out due to errors in the bus conversion function + # df_l = line_endings_to_bus_conversion(df_l) - # remove original lines - lines.drop(lines_to_split, inplace=True) + # update length + df_l["length"] = df_l.to_crs(distance_crs).geometry.length - lines = df_to_add if lines.empty else pd.concat([lines, df_to_add]) + # return to original crs + df_l = df_l.to_crs(lines.crs) - lines = gpd.GeoDataFrame(lines.reset_index(drop=True), crs=lines.crs) + # remove lines that are rings (included for completion), TODO: this should be a separate function + df_l = df_l[~df_l.geometry.is_ring].reset_index(drop=True) - return lines, buses + # buses should not be returned as they are not changed, but included for completion + return df_l, buses def force_ac_lines(df, col="tag_frequency"):