idgf
This commit is contained in:
parent
fbfebc651c
commit
aab85fe32e
33 changed files with 2016 additions and 283 deletions
|
|
@ -84,6 +84,38 @@ LONDON_COUNTY_CODES = {"E13000001", "E13000002"}
|
|||
DISPLAY_CITY_NEAREST_POSTCODE_MAX_M = 3_000
|
||||
WGS84_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
|
||||
|
||||
# England British National Grid (EPSG:27700) bounding box, with margin. ONS NSPL marks
# postcodes that have no grid reference with the out-of-range sentinel lat=99.999999,
# long=0.000000 (not a real location), whose paired easting/northing collapse to the
# grid origin (0, 0) (or inf). Requiring coordinates inside this box drops the sentinel
# from every index, so an active postcode lacking a grid ref can never become a false
# nearest neighbour.
# _valid_bng_expr() requires BOTH axes to be in range, so the (0, 0) origin is already
# rejected by the easting floor even though the northing floor is 0.0.
ENGLAND_BNG_MIN_EAST = 50_000.0
ENGLAND_BNG_MAX_EAST = 660_000.0
ENGLAND_BNG_MIN_NORTH = 0.0
ENGLAND_BNG_MAX_NORTH = 660_000.0
|
||||
|
||||
|
||||
def _valid_wgs84_expr() -> pl.Expr:
    """Build the row predicate for usable WGS84 coordinates.

    A row passes only when both ``lat`` and ``long`` are present and fall within the
    ENGLAND_BBOX_* latitude/longitude bounds. This drops nulls and the ONS
    lat=99.999999, long=0.0 no-grid-reference sentinel, so neither ever enters a
    coordinate index.
    """
    lat = pl.col("lat")
    lon = pl.col("long")
    # Explicit presence checks come first, then the bounding-box range checks.
    present = lat.is_not_null() & lon.is_not_null()
    in_box = lat.is_between(ENGLAND_BBOX_SOUTH, ENGLAND_BBOX_NORTH) & lon.is_between(
        ENGLAND_BBOX_WEST, ENGLAND_BBOX_EAST
    )
    return present & in_box
|
||||
|
||||
|
||||
def _valid_bng_expr() -> pl.Expr:
    """Build the row predicate for usable British National Grid coordinates.

    A row passes only when both ``east1m`` and ``north1m`` are present and fall within
    the ENGLAND_BNG_* easting/northing bounds. This drops nulls and the (0, 0)
    grid-origin / inf values paired with the ONS no-grid-reference sentinel.
    """
    easting = pl.col("east1m")
    northing = pl.col("north1m")
    # Explicit presence checks come first, then the bounding-box range checks.
    present = easting.is_not_null() & northing.is_not_null()
    in_box = easting.is_between(
        ENGLAND_BNG_MIN_EAST, ENGLAND_BNG_MAX_EAST
    ) & northing.is_between(ENGLAND_BNG_MIN_NORTH, ENGLAND_BNG_MAX_NORTH)
    return present & in_box
|
||||
|
||||
# Suffixes to strip from raw station names before appending the typed suffix.
|
||||
_STATION_STRIP = (
|
||||
" tube station",
|
||||
|
|
@ -303,7 +335,7 @@ def _outcode_tree(postcodes_path: Path) -> tuple[cKDTree, list[str]]:
|
|||
postcodes_path, columns=["pcds", "lat", "long", "ctry25cd", "doterm"]
|
||||
)
|
||||
.filter((pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null())
|
||||
.filter(pl.col("lat").is_not_null() & pl.col("long").is_not_null())
|
||||
.filter(_valid_wgs84_expr())
|
||||
)
|
||||
coords = np.column_stack(
|
||||
[df["lat"].to_numpy().astype(np.float64), df["long"].to_numpy().astype(np.float64)]
|
||||
|
|
@ -359,12 +391,22 @@ def _build_street_places(
|
|||
return sorted(places, key=lambda place: place["name"].lower())
|
||||
|
||||
|
||||
def _poi_dedup_key(
    name: str, place_type: str, lat: float, lon: float
) -> tuple[str, str, float, float]:
    """Return the geographic de-dup key for a named POI.

    The key is ``(lower-cased name, place_type, lat, lon)`` with both coordinates
    rounded to 2 decimal places: round(., 2) is ~1.1km lat / ~0.7km UK lon.

    Coarse enough to collapse the SAME physical POI mapped twice a few metres
    apart, fine enough to keep genuinely distinct same-named POIs in different
    towns (e.g. "Victoria Park" in London vs Bristol).

    NOTE: this snaps to a grid rather than measuring distance, so two points only
    a few metres apart that fall either side of a rounding boundary still get
    different keys.
    """
    return (name.lower(), place_type, round(lat, 2), round(lon, 2))
|
||||
|
||||
|
||||
def _pois_to_places(pois: pl.DataFrame) -> list[dict]:
|
||||
"""Map high-value named POIs onto gazetteer place rows (M), de-duplicated by (name, type)."""
|
||||
"""Map high-value named POIs onto gazetteer place rows (M), de-duplicated by (name, type, coords)."""
|
||||
if pois.is_empty():
|
||||
return []
|
||||
|
||||
seen: set[tuple[str, str]] = set()
|
||||
seen: set[tuple] = set()
|
||||
places: list[dict] = []
|
||||
for row in pois.iter_rows(named=True):
|
||||
place_type = HIGH_VALUE_POI_CATEGORIES.get(str(row.get("category", "")))
|
||||
|
|
@ -373,7 +415,9 @@ def _pois_to_places(pois: pl.DataFrame) -> list[dict]:
|
|||
name = str(row.get("name") or "").strip()
|
||||
if len(name) < 3:
|
||||
continue
|
||||
key = (name.lower(), place_type)
|
||||
lat = float(row["lat"])
|
||||
lon = float(row["lng"])
|
||||
key = _poi_dedup_key(name, place_type, lat, lon)
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
|
|
@ -381,8 +425,8 @@ def _pois_to_places(pois: pl.DataFrame) -> list[dict]:
|
|||
{
|
||||
"name": name,
|
||||
"place_type": place_type,
|
||||
"lat": float(row["lat"]),
|
||||
"lon": float(row["lng"]),
|
||||
"lat": lat,
|
||||
"lon": lon,
|
||||
"population": 0,
|
||||
"travel_destination": False,
|
||||
"display_city": None,
|
||||
|
|
@ -395,11 +439,16 @@ def _append_high_value_pois(places: list[dict], pois_path: Path) -> int:
|
|||
pois = pl.read_parquet(pois_path, columns=["name", "category", "lat", "lng"])
|
||||
new_places = _pois_to_places(pois)
|
||||
existing = {
|
||||
(str(place["name"]).lower(), place["place_type"]) for place in places
|
||||
_poi_dedup_key(
|
||||
str(place["name"]), place["place_type"], place["lat"], place["lon"]
|
||||
)
|
||||
for place in places
|
||||
}
|
||||
added = 0
|
||||
for place in new_places:
|
||||
key = (place["name"].lower(), place["place_type"])
|
||||
key = _poi_dedup_key(
|
||||
place["name"], place["place_type"], place["lat"], place["lon"]
|
||||
)
|
||||
if key in existing:
|
||||
continue
|
||||
places.append(place)
|
||||
|
|
@ -409,10 +458,14 @@ def _append_high_value_pois(places: list[dict], pois_path: Path) -> int:
|
|||
|
||||
|
||||
def _postcode_lookup(postcodes_path: Path) -> dict[str, tuple[float, float]]:
|
||||
df = pl.read_parquet(
|
||||
postcodes_path,
|
||||
columns=["pcds", "lat", "long", "ctry25cd", "doterm"],
|
||||
).filter((pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null())
|
||||
df = (
|
||||
pl.read_parquet(
|
||||
postcodes_path,
|
||||
columns=["pcds", "lat", "long", "ctry25cd", "doterm"],
|
||||
)
|
||||
.filter((pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null())
|
||||
.filter(_valid_wgs84_expr())
|
||||
)
|
||||
return {
|
||||
_normalize_postcode(postcode): (float(lat), float(lon))
|
||||
for postcode, lat, lon in df.select(["pcds", "lat", "long"]).iter_rows()
|
||||
|
|
@ -470,7 +523,7 @@ def _london_postcode_tree(postcodes_path: Path) -> tuple[cKDTree, np.ndarray]:
|
|||
.filter(
|
||||
(pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null()
|
||||
)
|
||||
.filter(pl.col("east1m").is_not_null() & pl.col("north1m").is_not_null())
|
||||
.filter(_valid_bng_expr())
|
||||
.with_columns(_is_london_admin_expr().alias("is_london"))
|
||||
.select("east1m", "north1m", "is_london")
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue