Improve data

This commit is contained in:
Andras Schmelczer 2026-06-10 07:54:25 +01:00
parent b4d66a28c1
commit 85da1941aa
31 changed files with 901 additions and 319 deletions

View file

@ -10,6 +10,26 @@ EARTH_RADIUS_KM = 6371.0088
KM_PER_DEGREE_LAT = 111.32
DEFAULT_GRID_SIZE_DEGREES = 0.02
# Generous GB/UK bounding box. The ArcGIS postcode source stores grid-less
# postcodes with a placeholder coordinate (lat=99.999999, lon=0.0); these are
# finite, so an isfinite() check alone lets them through and produces absurd
# ~5,000 km "nearest amenity" distances. Reject anything outside this box so
# such postcodes get NaN distance / zero counts instead of a fabricated value.
UK_LAT_MIN, UK_LAT_MAX = 49.0, 61.5
UK_LON_MIN, UK_LON_MAX = -9.0, 2.5


def valid_uk_coords_mask(lats: np.ndarray, lons: np.ndarray) -> np.ndarray:
    """Return a boolean mask of coordinates that are finite AND within the UK bbox.

    Args:
        lats: Latitudes in degrees; an ndarray or any array-like coercible
            to float (a scalar is accepted too).
        lons: Longitudes in degrees, broadcast-compatible with ``lats``.

    Returns:
        Element-wise boolean mask, True where both values are finite and lie
        inside [UK_LAT_MIN, UK_LAT_MAX] x [UK_LON_MIN, UK_LON_MAX]. The bounds
        are inclusive.
    """
    # Coerce up front: np.isfinite raises TypeError on object-dtype input
    # (e.g. values containing None). As float, such entries become NaN and
    # are masked out like any other non-finite coordinate.
    lats = np.asarray(lats, dtype=float)
    lons = np.asarray(lons, dtype=float)
    return (
        np.isfinite(lats)
        & np.isfinite(lons)
        & (lats >= UK_LAT_MIN)
        & (lats <= UK_LAT_MAX)
        & (lons >= UK_LON_MIN)
        & (lons <= UK_LON_MAX)
    )
def _build_poi_grid(
pois: pl.DataFrame, grid_size: float = 0.05
@ -43,7 +63,12 @@ def _get_nearby_indices(
grid_size: float = DEFAULT_GRID_SIZE_DEGREES,
) -> np.ndarray | None:
"""Get POI indices from all grid cells intersecting the radius bounding box."""
if not np.isfinite(pc_lat) or not np.isfinite(pc_lon):
if (
not np.isfinite(pc_lat)
or not np.isfinite(pc_lon)
or not (UK_LAT_MIN <= pc_lat <= UK_LAT_MAX)
or not (UK_LON_MIN <= pc_lon <= UK_LON_MAX)
):
return None
lat_delta = radius_km / KM_PER_DEGREE_LAT
@ -182,7 +207,7 @@ def min_distance_per_postcode(
pc_lats = postcodes_df["lat"].to_numpy()
pc_lons = postcodes_df["lon"].to_numpy()
pc_codes = postcodes_df["postcode"].to_list()
valid_pc_mask = np.isfinite(pc_lats) & np.isfinite(pc_lons)
valid_pc_mask = valid_uk_coords_mask(pc_lats, pc_lons)
valid_pc_indices = np.flatnonzero(valid_pc_mask)
result_min_dist = {