good
This commit is contained in:
parent
81a16f543c
commit
63713c3a2b
15 changed files with 492 additions and 159 deletions
|
|
@ -1,7 +1,8 @@
|
|||
"""Extract place=* nodes and railway stations from OSM PBF → data/places.parquet.
|
||||
"""Extract places, stations, and universities → data/places.parquet.
|
||||
|
||||
Extracts named place nodes and railway stations (tube, national rail, DLR,
|
||||
etc.) for typeahead search.
|
||||
etc.) for typeahead search. Official English university providers from the
|
||||
Office for Students register can also be added as travel-time destinations.
|
||||
Reuses the same england-latest.osm.pbf as pois.py.
|
||||
"""
|
||||
|
||||
|
|
@ -53,6 +54,19 @@ _STATION_STRIP = (
|
|||
)
|
||||
|
||||
# Matches the literal prefix "ZZDL" followed by three uppercase letters/digits
# and captures those three characters.
# NOTE(review): presumably the station part of a NaPTAN DLR code — confirm.
_DLR_CODE_RE = re.compile(r"ZZDL([A-Z0-9]{3})")
# Case-insensitive pattern for a UK-postcode-shaped token: 1-2 letters, a digit,
# an optional letter/digit, optional whitespace, then a digit and two letters.
# The whole token is captured as group 1.
_POSTCODE_RE = re.compile(r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b", re.I)

# Lowercase legal-entity suffixes (each with a leading space) that are compared
# against the end of a lowercased provider name.
_NOISY_PROVIDER_SUFFIXES = (
    " higher education corporation",
    " limited",
    " ltd",
)

# Lowercase substrings that are searched for inside a lowercased legal name.
_LEGAL_NAME_FALLBACK_MARKERS = (
    "the chancellor",
    "chancellor, masters",
    "chancellor masters",
)
|
||||
|
||||
|
||||
def _is_dlr_station(tags: dict[str, str]) -> bool:
|
||||
|
|
@ -124,6 +138,170 @@ def _station_name_score(name: str) -> tuple[int, int]:
|
|||
return (suffix_penalty, len(name))
|
||||
|
||||
|
||||
def _cell_text(value: object) -> str:
|
||||
if value is None:
|
||||
return ""
|
||||
return str(value).strip()
|
||||
|
||||
|
||||
def _header_key(value: object) -> str:
    """Normalise a header cell for token matching.

    The cell text is lowercased, every run of characters other than ``a-z``
    and ``0-9`` is replaced by a single space, and the result is stripped.
    """
    lowered = _cell_text(value).lower()
    spaced = re.sub(r"[^a-z0-9]+", " ", lowered)
    return spaced.strip()
|
||||
|
||||
|
||||
def _find_header_row(rows: list[tuple]) -> int:
    """Return the index of the first row that looks like the OfS header row.

    A row qualifies when one of its normalised cells contains all of
    "provider", "legal" and "name", and one of its normalised cells contains
    all of "right", "use" and "university" (substring checks).

    Raises:
        ValueError: if no row qualifies.
    """
    legal_tokens = ("provider", "legal", "name")
    title_tokens = ("right", "use", "university")

    def _some_cell_has(cells: list[str], tokens: tuple[str, ...]) -> bool:
        # True when a single cell contains every token.
        return any(all(token in cell for token in tokens) for cell in cells)

    for position, cells in enumerate(rows):
        normalised = [_header_key(cell) for cell in cells]
        if _some_cell_has(normalised, legal_tokens) and _some_cell_has(
            normalised, title_tokens
        ):
            return position
    raise ValueError("Could not find the OfS register header row")
|
||||
|
||||
|
||||
def _find_column(headers: list[object], *tokens: str) -> int:
    """Return the index of the first header containing every token.

    Each header is normalised with ``_header_key`` and the tokens are matched
    as substrings of that normalised text.

    Raises:
        ValueError: if no header contains all of the tokens.
    """
    for position, key in enumerate(map(_header_key, headers)):
        if not all(token in key for token in tokens):
            continue
        return position
    raise ValueError(f"Could not find OfS register column containing {tokens}")
|
||||
|
||||
|
||||
def _normalize_postcode(postcode: str) -> str:
|
||||
return re.sub(r"[^A-Z0-9]", "", postcode.upper())
|
||||
|
||||
|
||||
def _extract_postcode(address: str) -> str | None:
    """Return the first postcode-shaped token in *address*, normalised.

    Only the first match of ``_POSTCODE_RE`` is used. Returns ``None`` when
    the address contains no match.
    """
    if (found := _POSTCODE_RE.search(address)) is not None:
        return _normalize_postcode(found.group(1))
    return None
|
||||
|
||||
|
||||
def _clean_provider_name(name: str) -> str:
    """Tidy a provider name from the register for display.

    Steps, in order:
      1. collapse whitespace runs to one space and trim spaces/commas;
      2. turn a trailing ", the" (any case) into a leading "The ";
      3. remove at most one suffix from ``_NOISY_PROVIDER_SUFFIXES`` (the first
         one, in tuple order, that the lowercased name ends with);
      4. drop a leading "The " unless the name is exactly "The Open University".
    """
    trim_chars = " ,"
    cleaned = re.sub(r"\s+", " ", name).strip(trim_chars)

    # "University of X, The" -> "The University of X"
    if cleaned.lower().endswith(", the"):
        cleaned = "The " + cleaned[:-5].strip(trim_chars)

    lowered = cleaned.lower()
    matched_suffix = next(
        (suffix for suffix in _NOISY_PROVIDER_SUFFIXES if lowered.endswith(suffix)),
        None,
    )
    if matched_suffix is not None:
        cleaned = cleaned[: -len(matched_suffix)].strip(trim_chars)

    # The article check is case-sensitive, unlike the checks above.
    if cleaned.startswith("The ") and cleaned != "The Open University":
        cleaned = cleaned[4:].strip()
    return cleaned
|
||||
|
||||
|
||||
def _split_trading_names(trading_names: str) -> list[str]:
    """Split a trading-names cell into cleaned names, one per line.

    An empty cell, or a cell equal to "not applicable" ignoring case, yields
    an empty list. Lines that clean down to an empty string are dropped.
    """
    if trading_names and trading_names.casefold() != "not applicable":
        cleaned_lines = map(_clean_provider_name, trading_names.splitlines())
        return [cleaned for cleaned in cleaned_lines if cleaned]
    return []
|
||||
|
||||
|
||||
def _needs_trading_name(legal_name: str) -> bool:
    """Report whether a legal name should be replaced by a trading name.

    True when the lowercased legal name contains any entry of
    ``_LEGAL_NAME_FALLBACK_MARKERS`` or ends with any entry of
    ``_NOISY_PROVIDER_SUFFIXES``.
    """
    lowered = legal_name.lower()
    for marker in _LEGAL_NAME_FALLBACK_MARKERS:
        if marker in lowered:
            return True
    # str.endswith accepts a tuple and checks each candidate suffix in turn.
    return lowered.endswith(tuple(_NOISY_PROVIDER_SUFFIXES))
|
||||
|
||||
|
||||
def _select_university_name(legal_name: str, trading_names: str) -> str:
    """Choose the display name for a university.

    The cleaned legal name is returned unless ``_needs_trading_name`` flags
    the legal name and at least one trading name exists. In that case the
    first trading name containing "university" or "imperial college"
    (case-insensitive) wins, falling back to the first trading name.
    """
    if not _needs_trading_name(legal_name):
        return _clean_provider_name(legal_name)

    candidates = _split_trading_names(trading_names)
    if not candidates:
        return _clean_provider_name(legal_name)

    keywords = ("university", "imperial college")
    return next(
        (
            candidate
            for candidate in candidates
            if any(keyword in candidate.lower() for keyword in keywords)
        ),
        candidates[0],
    )
|
||||
|
||||
|
||||
def _slugify_name(name: str) -> str:
|
||||
slug = name.lower()
|
||||
slug = re.sub(r"[^a-z0-9 -]", "", slug)
|
||||
return re.sub(r"\s+", "-", slug).strip("-")
|
||||
|
||||
|
||||
def _postcode_lookup(postcodes_path: Path) -> dict[str, tuple[float, float]]:
    """Load a normalised-postcode -> (lat, lon) mapping from a parquet file.

    Only rows with ``ctry25cd == "E92000001"`` and a null ``doterm`` are kept.
    NOTE(review): presumably these select live English postcodes — confirm
    against the postcode dataset's documentation.

    Keys come from ``pcds`` via ``_normalize_postcode``. If two rows normalise
    to the same key, the later row wins.
    """
    wanted_columns = ["pcds", "lat", "long", "ctry25cd", "doterm"]
    country_matches = pl.col("ctry25cd") == "E92000001"
    not_terminated = pl.col("doterm").is_null()
    frame = pl.read_parquet(postcodes_path, columns=wanted_columns).filter(
        country_matches & not_terminated
    )

    lookup: dict[str, tuple[float, float]] = {}
    for postcode, lat, lon in frame.select(["pcds", "lat", "long"]).iter_rows():
        lookup[_normalize_postcode(postcode)] = (float(lat), float(lon))
    return lookup
|
||||
|
||||
|
||||
def _ofs_universities(
    raw: pl.DataFrame, postcode_coords: dict[str, tuple[float, float]]
) -> tuple[list[dict], int]:
    """Turn the headerless OfS register sheet into university place records.

    The header row is located by content, then every later row whose
    "right to use university" cell is "yes" (case-insensitive) is considered.
    A considered row is emitted when it has a non-empty display name and its
    contact-address postcode is present in *postcode_coords*; otherwise it is
    counted as skipped.

    Returns:
        ``(universities, skipped)``: the place dicts and the number of
        university-title rows that could not be used.

    Raises:
        ValueError: if the header row or a required column cannot be found.
    """
    all_rows = raw.rows()
    header_at = _find_header_row(all_rows)
    header_cells = list(all_rows[header_at])
    legal_col = _find_column(header_cells, "provider", "legal", "name")
    trading_col = _find_column(header_cells, "trading", "name")
    address_col = _find_column(header_cells, "contact", "address")
    title_col = _find_column(header_cells, "right", "use", "university")

    found: list[dict] = []
    unusable = 0
    for record in all_rows[header_at + 1 :]:
        if _cell_text(record[title_col]).casefold() == "yes":
            display_name = _select_university_name(
                _cell_text(record[legal_col]), _cell_text(record[trading_col])
            )
            postcode = _extract_postcode(_cell_text(record[address_col]))
            # A missing postcode is looked up as "" so it simply misses.
            location = postcode_coords.get(postcode or "")
            if display_name and location is not None:
                latitude, longitude = location
                found.append(
                    {
                        "name": display_name,
                        "place_type": "university",
                        "lat": latitude,
                        "lon": longitude,
                        "population": 0,
                        "travel_destination": True,
                    }
                )
            else:
                unusable += 1

    return found, unusable
|
||||
|
||||
|
||||
def _append_ofs_universities(
    places: list[dict], register_path: Path, postcodes_path: Path
) -> tuple[int, int]:
    """Append OfS universities to *places*, skipping names already present.

    Duplicates are detected by comparing ``_slugify_name`` of each name, both
    against the existing places and against universities added in this call.
    *places* is mutated in place.

    Returns:
        ``(added, skipped)``: how many universities were appended, and how
        many register rows ``_ofs_universities`` reported as skipped.
    """
    coords_by_postcode = _postcode_lookup(postcodes_path)
    register = pl.read_excel(register_path, has_header=False)
    candidates, skipped = _ofs_universities(register, coords_by_postcode)

    seen_slugs = {_slugify_name(str(entry["name"])) for entry in places}
    count_before = len(places)
    for candidate in candidates:
        key = _slugify_name(candidate["name"])
        if key not in seen_slugs:
            seen_slugs.add(key)
            places.append(candidate)
    return len(places) - count_before, skipped
|
||||
|
||||
|
||||
def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
|
||||
"""Extract station-level DLR destinations from NaPTAN access nodes."""
|
||||
df = pl.read_parquet(naptan_path)
|
||||
|
|
@ -293,6 +471,16 @@ def main() -> None:
|
|||
type=Path,
|
||||
help="Optional NaPTAN parquet file used to add DLR station destinations",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--university-register",
|
||||
type=Path,
|
||||
help="Optional OfS register spreadsheet used to add university destinations",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--postcodes",
|
||||
type=Path,
|
||||
help="Postcode parquet used to geocode OfS university contact postcodes",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
pbf_file = args.pbf
|
||||
|
|
@ -313,6 +501,17 @@ def main() -> None:
|
|||
if args.naptan:
|
||||
added = _append_naptan_dlr_stations(handler.places, args.naptan)
|
||||
print(f"Added {added:,} DLR station destinations from NaPTAN")
|
||||
if args.university_register:
|
||||
if not args.postcodes:
|
||||
raise ValueError("--postcodes is required with --university-register")
|
||||
added, skipped = _append_ofs_universities(
|
||||
handler.places, args.university_register, args.postcodes
|
||||
)
|
||||
print(
|
||||
f"Added {added:,} university travel destinations from the OfS register"
|
||||
)
|
||||
if skipped:
|
||||
print(f"Skipped {skipped:,} OfS university rows without usable coordinates")
|
||||
|
||||
if handler.places:
|
||||
df = pl.DataFrame(handler.places)
|
||||
|
|
|
|||
|
|
@ -4,6 +4,8 @@ from pipeline.download.places import (
|
|||
_is_dlr_station,
|
||||
_is_tram_station,
|
||||
_naptan_dlr_stations,
|
||||
_ofs_universities,
|
||||
_select_university_name,
|
||||
_station_display_name,
|
||||
)
|
||||
|
||||
|
|
@ -79,3 +81,68 @@ def test_naptan_dlr_stations_are_deduplicated_by_atco_code(tmp_path):
|
|||
assert shadwell["lat"] == (51.51156 + 51.511693) / 2
|
||||
assert shadwell["place_type"] == "station"
|
||||
assert shadwell["travel_destination"] is True
|
||||
|
||||
|
||||
def test_select_university_name_prefers_public_trading_name_for_noisy_legal_name():
    """Noisy legal names yield a trading name; clean ones only lose "The "."""
    cases = [
        (
            "The Chancellor, Masters and Scholars of the University of Oxford",
            "Oxford University\nThe University of Oxford",
            "Oxford University",
        ),
        (
            "Bournemouth University Higher Education Corporation",
            "Bournemouth University",
            "Bournemouth University",
        ),
        ("The University of Surrey", "Not applicable", "University of Surrey"),
    ]
    for legal_name, trading_names, expected in cases:
        assert _select_university_name(legal_name, trading_names) == expected
|
||||
|
||||
|
||||
def test_ofs_universities_extracts_university_title_rows_with_postcode_coords():
    """Only "Yes" rows become places, positioned via the postcode lookup."""
    preamble_rows = [
        ["OfS Register", None, None, None],
        ["Note row", None, None, None],
    ]
    header_row = [
        "Provider's legal name",
        "Provider's trading name(s)",
        "Provider's contact address",
        "Does the provider have the right to use university in its title?",
    ]
    oxford_row = [
        "The Chancellor, Masters and Scholars of the University of Oxford",
        "Oxford University\nThe University of Oxford",
        "University Offices\nWellington Square\nOxford\nOX1 2JD\nUnited Kingdom",
        "Yes",
    ]
    # No university title, so this row must be ignored rather than skipped.
    college_row = [
        "Example College",
        "Not applicable",
        "Example Street\nLondon\nSW1A 1AA\nUnited Kingdom",
        "No",
    ]
    raw_register = pl.DataFrame(
        [*preamble_rows, header_row, oxford_row, college_row],
        orient="row",
    )

    universities, skipped = _ofs_universities(
        raw_register, {"OX12JD": (51.7585, -1.2643)}
    )

    expected_oxford = {
        "name": "Oxford University",
        "place_type": "university",
        "lat": 51.7585,
        "lon": -1.2643,
        "population": 0,
        "travel_destination": True,
    }
    assert skipped == 0
    assert universities == [expected_oxford]
|
||||
|
|
|
|||
|
|
@ -1,8 +1,6 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.transform.merge import (
|
||||
_AREA_COLUMNS,
|
||||
_STATIC_POI_DISTANCE_RENAMES,
|
||||
_is_dynamic_poi_metric_column,
|
||||
_less_deprived_percentile_expr,
|
||||
)
|
||||
|
|
@ -11,9 +9,11 @@ from pipeline.transform.merge import (
|
|||
def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
    """Lower scores map to higher percentiles and null inputs stay null."""
    df = pl.DataFrame({"Income Score (rate)": [1.0, 2.0, 3.0, None]})

    # Computed once: the duplicated, identical pipeline was a dead store.
    result = (
        df.lazy()
        .with_columns(_less_deprived_percentile_expr("Income Score (rate)"))
        .collect()
    )

    assert result["Income Score (rate)"].to_list() == [100.0, 50.0, 0.0, None]
|
||||
|
||||
|
|
@ -21,28 +21,18 @@ def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
|
|||
def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
    """Tied extremes land exactly on 100.0 and 0.0."""
    df = pl.DataFrame({"Income Score (rate)": [1.0, 1.0, 2.0, 3.0, 3.0]})

    # Computed once: the duplicated, identical pipeline was a dead store.
    result = (
        df.lazy()
        .with_columns(_less_deprived_percentile_expr("Income Score (rate)"))
        .collect()
    )

    assert result["Income Score (rate)"].to_list() == [100.0, 100.0, 50.0, 0.0, 0.0]
|
||||
|
||||
|
||||
def test_dynamic_poi_metric_columns_are_area_level() -> None:
    """"amenity (...)" distance/count columns are dynamic; a plain count is not."""
    dynamic_columns = (
        "Distance to nearest amenity (Cafe) (km)",
        "Distance to nearest amenity (Park) (km)",
        "Number of amenities (Cafe) within 2km",
        "Number of amenities (Cafe) within 5km",
    )
    for column in dynamic_columns:
        assert _is_dynamic_poi_metric_column(column)
    assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")
|
||||
|
||||
|
||||
def test_static_poi_distance_columns_are_renamed_to_configured_area_features() -> None:
    """The rename map is exactly as expected and targets known area columns."""
    expected = dict(
        parks_nearest_km="Distance to nearest park (km)",
        grocery_store_nearest_km="Distance to nearest grocery store (km)",
        cafe_nearest_km="Distance to nearest cafe (km)",
        pub_nearest_km="Distance to nearest pub (km)",
        restaurant_nearest_km="Distance to nearest restaurant (km)",
    )

    assert _STATIC_POI_DISTANCE_RENAMES == expected
    # No renamed column may fall outside the configured area columns.
    assert not set(expected.values()).difference(_AREA_COLUMNS)
|
||||
|
|
|
|||
|
|
@ -2,45 +2,72 @@ import polars as pl
|
|||
|
||||
from pipeline.utils import fuzzy_join_on_postcode
|
||||
|
||||
POSTCODE = "E14 2DG"
|
||||
|
||||
# Price paid: unique addresses for this postcode
|
||||
pp = (
|
||||
pl.scan_parquet("data/price-paid-complete.parquet")
|
||||
.filter(pl.col("postcode") == POSTCODE)
|
||||
.select("paon", "saon", "street", "postcode")
|
||||
.unique()
|
||||
.sort("saon")
|
||||
.with_columns(
|
||||
pl.concat_str(
|
||||
[pl.col("saon"), pl.col("paon"), pl.col("street")],
|
||||
separator=" ",
|
||||
ignore_nulls=True,
|
||||
).alias("pp_address"),
|
||||
def test_fuzzy_join_on_postcode_matches_addresses_within_postcode():
|
||||
left = pl.LazyFrame(
|
||||
{
|
||||
"left_id": ["flat", "house", "unmatched"],
|
||||
"left_address": [
|
||||
"Flat 2, 10 High Street",
|
||||
"12 High Street",
|
||||
"99 Other Road",
|
||||
],
|
||||
"left_postcode": ["AB1 2CD", "AB1 2CD", "AB1 2CD"],
|
||||
}
|
||||
)
|
||||
right = pl.LazyFrame(
|
||||
{
|
||||
"right_id": ["flat_epc", "house_epc", "other_postcode"],
|
||||
"right_address": [
|
||||
"10 HIGH STREET FLAT 2",
|
||||
"12 High-Street",
|
||||
"99 Other Road",
|
||||
],
|
||||
"right_postcode": [" AB1 2CD ", "AB1 2CD", "ZZ9 9ZZ"],
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
# EPC: latest inspection per address for this postcode
|
||||
epc = (
|
||||
pl.scan_csv("data/epc/certificates.csv")
|
||||
.select("ADDRESS", "POSTCODE", "INSPECTION_DATE")
|
||||
.filter(pl.col("POSTCODE").str.strip_chars() == POSTCODE)
|
||||
.sort("INSPECTION_DATE", descending=True)
|
||||
.unique("ADDRESS")
|
||||
.sort("ADDRESS")
|
||||
)
|
||||
result = (
|
||||
fuzzy_join_on_postcode(
|
||||
left=left,
|
||||
right=right,
|
||||
left_address_col="left_address",
|
||||
right_address_col="right_address",
|
||||
left_postcode_col="left_postcode",
|
||||
right_postcode_col="right_postcode",
|
||||
)
|
||||
.sort("left_id")
|
||||
.collect()
|
||||
)
|
||||
|
||||
result = fuzzy_join_on_postcode(
|
||||
left=pp,
|
||||
right=epc,
|
||||
left_address_col="pp_address",
|
||||
right_address_col="ADDRESS",
|
||||
left_postcode_col="postcode",
|
||||
right_postcode_col="POSTCODE",
|
||||
).collect()
|
||||
assert result.select("left_id", "right_id").to_dicts() == [
|
||||
{"left_id": "flat", "right_id": "flat_epc"},
|
||||
{"left_id": "house", "right_id": "house_epc"},
|
||||
{"left_id": "unmatched", "right_id": None},
|
||||
]
|
||||
|
||||
snapshot = result.select("pp_address", "ADDRESS").sort("pp_address")
|
||||
|
||||
print("Testing the matching between EPC and PP addresses")
|
||||
with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=80):
|
||||
print(snapshot)
|
||||
def test_fuzzy_join_on_postcode_requires_matching_numbers():
    """Same street and postcode but a different house number must not match."""
    left = pl.LazyFrame(
        {"left_address": ["10 High Street"], "left_postcode": ["AB1 2CD"]}
    )
    right = pl.LazyFrame(
        {"right_address": ["11 High Street"], "right_postcode": ["AB1 2CD"]}
    )
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    joined = fuzzy_join_on_postcode(left=left, right=right, **column_names)
    result = joined.collect()

    assert result["right_address"].to_list() == [None]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue