Fix data pipelines once and for all
This commit is contained in:
parent
08560476c5
commit
4012e4e047
46 changed files with 4508 additions and 855 deletions
297
pipeline/check_school_cutoffs.py
Normal file
297
pipeline/check_school_cutoffs.py
Normal file
|
|
@ -0,0 +1,297 @@
|
|||
"""Evaluate modelled school catchment radii against published cutoffs.
|
||||
|
||||
Local authorities publish each school's "last distance offered" in their
|
||||
yearly allocation reports; ``property-data/ground_truth/cutoffs_*.json``
|
||||
holds a scraped sample of those figures (see the collection notes in each
|
||||
file's ``source_url`` fields). This script matches them to the per-school
|
||||
radii emitted by ``pipeline.transform.school_catchments --schools-output``
|
||||
and reports how well the model reproduces reality, so the preference-bonus
|
||||
constants can be calibrated.
|
||||
|
||||
Headline metrics use non-faith schools whose published cutoff was a binding
|
||||
distance. Faith schools are reported separately (their distance criterion
|
||||
applies within faith priority, so published figures aren't comparable), as
|
||||
are "all applicants offered" schools, where the model should ideally show no
|
||||
binding cutoff.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import difflib
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
# Words that carry no identifying information in a school name (articles and
# Church-of-England / voluntary-status designators).
_NOISE_WORDS = re.compile(
    r"\b(the|of|and|c\s*of\s*e|cofe|ce|rc|voluntary|aided|controlled|va|vc)\b"
)
# Anything that is not a lowercase letter, digit or space becomes a space.
_NON_ALNUM = re.compile(r"[^a-z0-9 ]")
# School-type words that reports frequently omit ("Ashmole Primary").
_SCHOOL_WORDS = re.compile(
    r"\b(school|academy|primary|secondary|junior|infant|community|college|high)\b"
)
# Apostrophe variants are deleted (not spaced) so "Mary's" and "Mary’s" both
# normalize to "marys"; scraped PDFs/HTML often use the typographic forms.
_APOSTROPHES = re.compile("['\u2018\u2019\u00b4`]")


def normalize_name(name: str, strip_school_words: bool = False) -> str:
    """Return a canonical, comparison-friendly form of a school name.

    The name is lowercased, "&" is spelled out, apostrophes (ASCII and
    typographic) are removed, all remaining punctuation becomes whitespace,
    noise words are dropped and whitespace is collapsed.

    Args:
        name: Raw school name as published.
        strip_school_words: Also drop school-type words ("school",
            "primary", "academy", ...) so informal names match formal ones.

    Returns:
        The normalized name; may be empty if every word was stripped.
    """
    s = name.lower().replace("&", " and ")
    s = _APOSTROPHES.sub("", s)
    # "St." needs no special case: the "." is turned into a space here.
    s = _NON_ALNUM.sub(" ", s)
    s = _NOISE_WORDS.sub(" ", s)
    if strip_school_words:
        s = _SCHOOL_WORDS.sub(" ", s)
    return " ".join(s.split())
|
||||
|
||||
|
||||
def normalize_la(la: str) -> str:
    """Return a canonical form of a local-authority name for joining.

    Lowercases, spells out "&", replaces punctuation with spaces, removes
    the substring "city of" and collapses whitespace.
    """
    spelled_out = la.lower().replace("&", " and ")
    cleaned = _NON_ALNUM.sub(" ", spelled_out)
    without_prefix = cleaned.replace("city of", "")
    tokens = without_prefix.split()
    return " ".join(tokens)
|
||||
|
||||
|
||||
def load_ground_truth(directory: Path) -> pl.DataFrame:
    """Load every ``cutoffs_*.json`` file under *directory* into one frame.

    Each file is expected to hold a JSON array of objects. ``school_name``,
    ``la`` and ``phase`` are required keys; the remaining keys are optional
    and default to 0 / None / False / "" as coded below.

    Args:
        directory: Folder containing the scraped ground-truth JSON files.

    Returns:
        One row per published cutoff record.

    Raises:
        SystemExit: If no file yields any row.
    """
    rows = []
    for path in sorted(directory.glob("cutoffs_*.json")):
        # Decode explicitly as UTF-8 rather than the platform locale so
        # non-ASCII school names load identically on every machine.
        for row in json.loads(path.read_text(encoding="utf-8")):
            cutoff = row.get("cutoff_km")
            rows.append(
                {
                    "school_name": row["school_name"],
                    "la": row["la"],
                    "phase": row["phase"],
                    "entry_year": int(row.get("entry_year") or 0),
                    "cutoff_km": float(cutoff) if cutoff is not None else None,
                    "all_offered": bool(row.get("all_offered", False)),
                    "faith_school": bool(row.get("faith_school", False)),
                    "school_postcode": row.get("school_postcode"),
                    "source_url": row.get("source_url", ""),
                }
            )
    if not rows:
        raise SystemExit(f"No cutoffs_*.json files with rows under {directory}")
    # Pin the dtypes of the nullable columns: if the leading rows are all
    # None, inference would otherwise not see a string/float at all.
    df = pl.DataFrame(
        rows,
        schema_overrides={"school_postcode": pl.Utf8, "cutoff_km": pl.Float64},
    )
    print(f"Ground truth rows: {len(df)} from {directory}")
    return df
|
||||
|
||||
|
||||
def match_schools(truth: pl.DataFrame, gias: pl.DataFrame) -> pl.DataFrame:
    """Attach GIAS URNs to ground-truth rows by postcode, then name.

    Four passes run in order; a row matched by an earlier pass is not
    reconsidered by a later one:

    1. exact postcode (GIAS postcodes used by exactly one school),
    2. exact normalized (name, LA),
    3. normalized name with school-type words stripped, plus LA,
    4. fuzzy name similarity within the LA.

    Args:
        truth: Ground-truth rows with ``school_name``, ``la``, ``phase`` and
            ``school_postcode`` columns.
        gias: GIAS rows with ``urn``, ``name``, ``postcode`` and
            ``local_authority`` columns.

    Returns:
        The matched subset of *truth* (plus the helper columns ``_row_id``,
        ``_name_norm``, ``_name_stripped``, ``_la_norm``, ``_pc``) with a
        ``urn`` column. Empty, but correctly typed, when nothing matches.
    """

    def stripped(name: str) -> str:
        return normalize_name(name, strip_school_words=True)

    gias = gias.with_columns(
        pl.col("name")
        .map_elements(normalize_name, return_dtype=pl.Utf8)
        .alias("_name_norm"),
        pl.col("name")
        .map_elements(stripped, return_dtype=pl.Utf8)
        .alias("_name_stripped"),
        pl.col("local_authority")
        .map_elements(normalize_la, return_dtype=pl.Utf8)
        .alias("_la_norm"),
        pl.col("postcode").str.replace_all(" ", "").str.to_uppercase().alias("_pc"),
    )
    truth = truth.with_columns(
        pl.col("school_name")
        .map_elements(normalize_name, return_dtype=pl.Utf8)
        .alias("_name_norm"),
        pl.col("school_name")
        .map_elements(stripped, return_dtype=pl.Utf8)
        .alias("_name_stripped"),
        pl.col("la")
        .map_elements(normalize_la, return_dtype=pl.Utf8)
        .alias("_la_norm"),
        pl.col("school_postcode")
        .str.replace_all(" ", "")
        .str.to_uppercase()
        .alias("_pc"),
    ).with_row_index("_row_id")

    # 1. Exact postcode match (unique postcodes only — site-sharing schools
    #    would mismatch phases otherwise; those fall through to name matching).
    pc_unique = gias.filter(pl.col("_pc").is_not_null()).unique(
        subset="_pc", keep="none"
    )
    by_pc = truth.filter(pl.col("_pc").is_not_null()).join(
        pc_unique.select("_pc", "urn"), on="_pc", how="inner"
    )
    matched_ids = set(by_pc["_row_id"].to_list())

    # 2. Exact normalized (name, LA) match, unique on both sides.
    gias_named = gias.unique(subset=["_name_norm", "_la_norm"], keep="none")
    remaining = truth.filter(~pl.col("_row_id").is_in(list(matched_ids)))
    by_name = remaining.join(
        gias_named.select("_name_norm", "_la_norm", "urn"),
        on=["_name_norm", "_la_norm"],
        how="inner",
    )
    matched_ids |= set(by_name["_row_id"].to_list())

    # 3. Reports often print informal names ("Ashmole Primary" for "Ashmole
    #    Primary School"): match on names with school-type words stripped,
    #    unique on both sides so site-sharing infant/junior pairs fall through.
    gias_stripped = gias.filter(pl.col("_name_stripped") != "").unique(
        subset=["_name_stripped", "_la_norm"], keep="none"
    )
    remaining = truth.filter(
        (~pl.col("_row_id").is_in(list(matched_ids))) & (pl.col("_name_stripped") != "")
    ).unique(subset=["_name_stripped", "_la_norm", "phase"], keep="none")
    by_stripped = remaining.join(
        gias_stripped.select("_name_stripped", "_la_norm", "urn"),
        on=["_name_stripped", "_la_norm"],
        how="inner",
    )
    matched_ids |= set(by_stripped["_row_id"].to_list())

    # 4. Fuzzy name match within the LA: unique best candidate >= 0.87.
    remaining = truth.filter(~pl.col("_row_id").is_in(list(matched_ids)))
    fuzzy_ids: list[int] = []
    fuzzy_urns: list = []
    # Per-LA candidate (names, urns) lists, materialised once per LA rather
    # than once per ground-truth row.
    candidates_by_la: dict[str, tuple[list, list]] = {}
    for row in remaining.iter_rows(named=True):
        la = row["_la_norm"]
        if la not in candidates_by_la:
            in_la = gias.filter(pl.col("_la_norm") == la)
            candidates_by_la[la] = (
                in_la["_name_norm"].to_list(),
                in_la["urn"].to_list(),
            )
        cand_names, cand_urns = candidates_by_la[la]
        if not cand_names:
            continue
        scores = [
            difflib.SequenceMatcher(None, row["_name_norm"], cand).ratio()
            for cand in cand_names
        ]
        order = np.argsort(scores)[::-1]
        # Accept only a clear winner: the runner-up must trail by > 0.04.
        if scores[order[0]] >= 0.87 and (
            len(order) == 1 or scores[order[1]] < scores[order[0]] - 0.04
        ):
            fuzzy_ids.append(row["_row_id"])
            fuzzy_urns.append(cand_urns[int(order[0])])

    # Join the (row id, urn) pairs back onto the typed frame instead of
    # rebuilding rows from Python dicts: a dict-built frame infers all-null
    # columns (commonly school_postcode/_pc at this stage) as Null dtype,
    # which makes the strict vertical concat below fail.
    fuzzy_pairs = pl.DataFrame(
        {"_row_id": fuzzy_ids, "urn": fuzzy_urns},
        schema={"_row_id": truth.schema["_row_id"], "urn": gias.schema["urn"]},
    )
    by_fuzzy = remaining.join(fuzzy_pairs, on="_row_id", how="inner")

    # Every part shares the truth schema plus "urn", even when empty, so all
    # of them can be concatenated unconditionally (an empty list would raise).
    parts = [by_pc, by_name, by_stripped, by_fuzzy]
    matched = pl.concat([p.select(truth.columns + ["urn"]) for p in parts]).unique(
        subset="_row_id", keep="first"
    )
    print(
        f"Matched {len(matched)}/{len(truth)} ground-truth rows to GIAS URNs "
        f"(postcode {len(by_pc)}, exact name {len(by_name)}, "
        f"stripped {len(by_stripped)}, fuzzy {len(by_fuzzy)})"
    )
    return matched
|
||||
|
||||
|
||||
def evaluate(matched: pl.DataFrame, radii: pl.DataFrame) -> pl.DataFrame:
    """Print fit metrics of modelled radii against published cutoffs.

    *matched* is joined to *radii* on ``urn`` and ``phase``. The code reads
    ``radius_km`` and ``filled`` from the joined frame, alongside the
    ground-truth ``cutoff_km``, ``all_offered`` and ``faith_school`` columns.

    Returns:
        The "binding" subset: joined rows that are not all-offered and whose
        published cutoff lies between 0.05 and 20 km.
    """
    joined = matched.join(radii, on=["urn", "phase"], how="inner")
    print(f"Joined to modelled radii: {len(joined)} rows")

    # Published figures occasionally include non-typical admits (a child who
    # moved mid-process can print as hundreds of km); cap at distances a
    # distance criterion can plausibly produce.
    plausible_cutoff = pl.col("cutoff_km").is_between(0.05, 20.0)
    binding = joined.filter(~pl.col("all_offered") & plausible_cutoff)

    def report(df: pl.DataFrame, label: str) -> None:
        """Print bias, error, factor-2 share and rank correlation for *df*."""
        if df.is_empty():
            print(f"\n{label}: no rows")
            return
        truth_km = df["cutoff_km"].to_numpy()
        model_km = df["radius_km"].to_numpy()
        log_ratio = np.log2(model_km / truth_km)
        abs_log_ratio = np.abs(log_ratio)
        bias = np.median(log_ratio)
        typical_error = np.median(abs_log_ratio)
        # |log2 ratio| <= 1 means the model is within a factor of two.
        within2 = float(np.mean(abs_log_ratio <= 1))
        pairs = pl.DataFrame({"t": truth_km, "m": model_km})
        rank = pairs.select(pl.corr("t", "m", method="spearman")).item()
        print(
            f"\n{label} (n={len(df)}):\n"
            f" median bias (log2 model/truth): {bias:+.2f} "
            f"(x{2 ** bias:.2f})\n"
            f" median |log2 error|: {typical_error:.2f} "
            f"(x{2 ** typical_error:.2f})\n"
            f" within factor 2: {within2:.0%}\n"
            f" Spearman rank corr: {rank:.2f}"
        )

    non_faith = binding.filter(~pl.col("faith_school"))
    for phase in ("primary", "secondary"):
        report(
            non_faith.filter(pl.col("phase") == phase),
            f"BINDING, non-faith, {phase}",
        )
    report(binding.filter(pl.col("faith_school")), "BINDING, faith (informational)")

    offered = joined.filter(pl.col("all_offered"))
    if offered.is_empty():
        return binding

    unbound_share = float((~offered["filled"]).mean())
    print(
        f"\nALL-OFFERED schools (n={len(offered)}): model agrees no binding "
        f"cutoff for {unbound_share:.0%}; median modelled radius "
        f"{offered['radius_km'].median():.2f} km"
    )
    return binding
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: load inputs, match, evaluate, optionally dump.

    Raises:
        SystemExit: If no ground-truth rows load, or if no binding cutoff
            could be matched to a modelled radius.
    """
    parser = argparse.ArgumentParser(
        description="Compare modelled catchment radii with published cutoffs"
    )
    parser.add_argument(
        "--ground-truth-dir",
        type=Path,
        default=Path("property-data/ground_truth"),
    )
    parser.add_argument(
        "--radii",
        type=Path,
        default=Path("property-data/school_catchment_radii.parquet"),
        help="Per-school radii parquet from school_catchments --schools-output",
    )
    parser.add_argument("--gias", type=Path, default=Path("property-data/gias.parquet"))
    parser.add_argument(
        "--matched-out",
        type=Path,
        default=None,
        help="Optional CSV of matched rows for inspection",
    )
    args = parser.parse_args()

    truth = load_ground_truth(args.ground_truth_dir)
    # One row per school+phase: keep the most recent entry year.
    truth = (
        truth.sort("entry_year", descending=True)
        .unique(subset=["school_name", "la", "phase"], keep="first")
    )
    gias = pl.read_parquet(args.gias).select(
        "urn", "name", "postcode", "local_authority", "religious_character"
    )
    radii = pl.read_parquet(args.radii).unique(subset=["urn", "phase"], keep="first")

    matched = match_schools(truth, gias.drop("religious_character"))
    # GIAS religious character is authoritative; the scraped flag is used
    # only where GIAS has no value for the matched URN.
    matched = matched.join(
        gias.select("urn", "religious_character"), on="urn", how="left"
    ).with_columns(
        pl.when(pl.col("religious_character").is_not_null())
        .then(~pl.col("religious_character").is_in(["None", "Does not apply"]))
        .otherwise(pl.col("faith_school"))
        .alias("faith_school")
    )
    binding = evaluate(matched, radii)

    if args.matched_out is not None:
        # Drop every matching helper column, including "_name_stripped",
        # so the inspection CSV holds only source and model fields.
        out = matched.join(radii, on=["urn", "phase"], how="inner").drop(
            "_row_id", "_name_norm", "_name_stripped", "_la_norm", "_pc"
        )
        args.matched_out.parent.mkdir(parents=True, exist_ok=True)
        out.write_csv(args.matched_out)
        print(f"\nWrote matched rows to {args.matched_out}")

    if binding.is_empty():
        raise SystemExit("No binding, matchable cutoffs — nothing to calibrate on")


if __name__ == "__main__":
    main()
|
||||
93
pipeline/download/lsoa_children.py
Normal file
93
pipeline/download/lsoa_children.py
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
"""Download Census 2021 children by five-year age band per LSOA.
|
||||
|
||||
Source: NOMIS (ONS Census 2021 — TS007A dataset, age by five-year bands)
|
||||
License: Open Government Licence v3.0
|
||||
|
||||
Used to estimate how many primary-age (4-10) and secondary-age (11-15)
|
||||
children live in each LSOA, which drives the school catchment model. Census
|
||||
bands don't align with school phases, so phase totals take fractional shares
|
||||
of the 0-4, 10-14 and 15-19 bands (one fifth per single year of age).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import polars as pl
|
||||
|
||||
# NOMIS API: Census 2021 TS007A (age, five-year bands) by LSOA 2021 (TYPE151).
# c2021_age_19 codes: 1 = 0-4, 2 = 5-9, 3 = 10-14, 4 = 15-19.
# NOMIS paginates at 25,000 rows by default, so we paginate with recordoffset.
BASE_URL = (
    "https://www.nomisweb.co.uk/api/v01/dataset/NM_2020_1.data.csv"
    "?date=latest&geography=TYPE151&measures=20100&c2021_age_19=1,2,3,4"
    "&select=GEOGRAPHY_CODE,C2021_AGE_19,OBS_VALUE"
)
# Expected rows per page: used both as the recordoffset step and to detect
# the final (short) page.
PAGE_SIZE = 25000

# Requested c2021_age_19 code -> output column name for that age band.
AGE_BAND_COLUMNS = {
    1: "aged_0_4",
    2: "aged_5_9",
    3: "aged_10_14",
    4: "aged_15_19",
}
|
||||
|
||||
|
||||
def download_and_convert(output_path: Path) -> None:
    """Download the NOMIS age-band table and write one row per English LSOA.

    Pages through the CSV endpoint with ``recordoffset`` until a short or
    empty page is returned, pivots the long table to one column per age
    band, keeps LSOA codes starting with "E" and writes a zstd parquet.

    Args:
        output_path: Destination parquet path; parent folders are created.

    Raises:
        httpx.HTTPStatusError: If any page request returns an error status.
        ValueError: If NOMIS returns no rows or an expected age band is
            absent from the response.
    """
    print("Downloading Census 2021 LSOA age bands from NOMIS...")
    frames = []
    offset = 0
    while True:
        url = f"{BASE_URL}&recordoffset={offset}"
        response = httpx.get(url, follow_redirects=True, timeout=120)
        response.raise_for_status()
        if not response.content:
            break
        chunk = pl.read_csv(BytesIO(response.content))
        if chunk.height == 0:
            break
        frames.append(chunk)
        print(f" Fetched {chunk.height} rows (offset={offset})")
        if chunk.height < PAGE_SIZE:
            break
        offset += PAGE_SIZE

    if not frames:
        # Fail with a clear message instead of an opaque concat error.
        raise ValueError("NOMIS returned no rows for the LSOA age-band query")

    df = pl.concat(frames)
    print(f"Total rows: {df.height}")

    pivoted = df.rename({"GEOGRAPHY_CODE": "lsoa21"}).pivot(
        on="C2021_AGE_19", index="lsoa21", values="OBS_VALUE"
    )

    # Validate BEFORE renaming/casting: those steps raise their own, less
    # helpful, error as soon as a band column is absent.
    missing = [
        name for code, name in AGE_BAND_COLUMNS.items() if str(code) not in pivoted.columns
    ]
    if missing:
        raise ValueError(f"NOMIS response missing age bands: {missing}")

    result = (
        pivoted.rename({str(code): name for code, name in AGE_BAND_COLUMNS.items()})
        .with_columns(pl.col(name).cast(pl.UInt32) for name in AGE_BAND_COLUMNS.values())
        .filter(pl.col("lsoa21").str.starts_with("E"))
        .sort("lsoa21")
    )

    print(f"England LSOAs: {result.height}")
    for name in AGE_BAND_COLUMNS.values():
        print(f" {name}: total {result[name].sum():,}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    result.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the required ``--output`` path and run the download."""
    cli = argparse.ArgumentParser(
        description="Download Census 2021 age bands (children) by LSOA"
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    options = cli.parse_args()
    download_and_convert(options.output)


if __name__ == "__main__":
    main()
|
||||
|
|
@ -12,8 +12,18 @@ import polars as pl
|
|||
|
||||
NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
|
||||
TUBE_STATION_CATEGORY = "Tube station"
|
||||
TRAM_METRO_CATEGORY = "Tram & Metro stop"
|
||||
TUBE_STATION_MERGE_RADIUS_DEGREES = 0.01
|
||||
|
||||
# London Underground ATCO codes are "<area><kind>ZZLU<station>": a 3-digit
|
||||
# AdministrativeAreaCode (940 national, 490 London, plus 150/210/040/... for
|
||||
# LU stations outside Greater London such as Epping or Amersham), then "0"
|
||||
# (platform/entrance node) or "G" (station group node), then the system code.
|
||||
# "ZZLU" is unique to London Underground, which cleanly separates genuine Tube
|
||||
# stations from every other TMU/MET network (Metrolink, Supertram, T&W Metro,
|
||||
# WM Metro, Blackpool Tramway, heritage railways, ...).
|
||||
LONDON_UNDERGROUND_ATCO_PATTERN = r"(?i)^\d{3}[0G]ZZLU"
|
||||
|
||||
|
||||
STOP_TYPES = {
|
||||
"AIR": "Airport",
|
||||
|
|
@ -25,25 +35,110 @@ STOP_TYPES = {
|
|||
"RLY": "Rail station",
|
||||
"RSE": "Rail station",
|
||||
"BCT": "Bus stop",
|
||||
# Bus/coach stations: BST is the station access-area node, BCS/BCQ are
|
||||
# bays/stands within the station and BCE is a station entrance. NaPTAN maps
|
||||
# very few BCE nodes (~80 GB-wide), so without BST/BCS/BCQ the category was
|
||||
# so sparse that 20% of England showed the nearest bus station >100km away.
|
||||
# Bays and entrances collapse to one POI per station via
|
||||
# STATION_MERGE_CATEGORIES below.
|
||||
"BST": "Bus station",
|
||||
"BCS": "Bus station",
|
||||
"BCQ": "Bus station",
|
||||
"BCE": "Bus station",
|
||||
"TXR": "Taxi rank",
|
||||
"TMU": "Tube station",
|
||||
"MET": "Tube station",
|
||||
# Tram/Metro/Underground: TMU is an entrance node, MET the station access
|
||||
# area. Both start as "Tram & Metro stop"; merged stations whose ATCO codes
|
||||
# mark them as London Underground (ZZLU) are reclassified to "Tube station"
|
||||
# after dedup (see _deduplicate_station_areas). Heritage railways (RHDR,
|
||||
# Severn Valley, ...) are TMU/MET in NaPTAN with no machine-readable
|
||||
# "heritage" flag, so they remain in "Tram & Metro stop".
|
||||
"TMU": TRAM_METRO_CATEGORY,
|
||||
"MET": TRAM_METRO_CATEGORY,
|
||||
}
|
||||
|
||||
# Stop types that are access/entrance nodes rather than the primary station or
|
||||
# terminal node. During dedup the primary node (e.g. RLY/FER) wins so a station
|
||||
# with both a station node and entrances yields one POI at the station node.
|
||||
ENTRANCE_STOP_TYPES = {"RSE", "FTD"}
|
||||
# terminal node. During dedup the primary node (e.g. RLY/FER/MET) wins so a
|
||||
# station with both a station node and entrances yields one POI at the station
|
||||
# node.
|
||||
ENTRANCE_STOP_TYPES = {"RSE", "FTD", "TMU", "BCE"}
|
||||
|
||||
# Categories whose entrances/variants are merged into a single station-level POI
|
||||
# by normalized name + area (like Tube stations), so an RLY node and its RSE
|
||||
# entrances collapse to one POI at the station node.
|
||||
STATION_MERGE_CATEGORIES = {TUBE_STATION_CATEGORY, "Rail station", "Ferry"}
|
||||
STATION_MERGE_CATEGORIES = {
|
||||
TRAM_METRO_CATEGORY,
|
||||
TUBE_STATION_CATEGORY,
|
||||
"Rail station",
|
||||
"Ferry",
|
||||
"Bus station",
|
||||
}
|
||||
|
||||
|
||||
OUTPUT_COLUMNS = ["id", "name", "category", "lat", "lng"]
|
||||
|
||||
# Trailing entrance designators ("North East Ent", "Main Entrance No 2",
# "West Station Entrance", ...) are stripped from canonical names so a
# station's individually-named entrance nodes collapse into the station.
# A trailing run of filler words is only stripped when it contains at least
# one entrance word, so "Maze Hill North" or "Platform 1" are untouched.
_ENTRANCE_NAME_WORDS = {"ent", "entrance", "entrances", "access"}
_ENTRANCE_FILLER_WORDS = {
    "north",
    "south",
    "east",
    "west",
    "ne",
    "nw",
    "se",
    "sw",
    "n",
    "s",
    "e",
    "w",
    "wt",
    "main",
    "side",
    "no",
    "station",
    "stop",
    "platform",
}

# Regex form of the same rule for the vectorized (expression) code path.
# Every stripped token must be preceded by whitespace, so the first word of
# a name is never removed.
_ENTRANCE_WORDS_RE = "(?:ent|entrance|entrances|access)"
_ENTRANCE_FILLER_RE = (
    r"(?:north|south|east|west|ne|nw|se|sw|n|s|e|w|wt|main|side|no|station|stop"
    r"|platform|\d+)"
)
_ENTRANCE_SUFFIX_RE = (
    rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*"
    rf"\s+{_ENTRANCE_WORDS_RE}"
    rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*$"
)

# Bus-station bay/stand designators ("Stand A3", "Bay 2", "Stance 5") are
# stripped so every bay of one station shares a canonical name. The designator
# word must be followed by a short alphanumeric token, so place names ending in
# a bare "Bay" (Colwyn Bay, Herne Bay) are untouched.
_BAY_WORDS = {"stand", "stance", "bay", "gate"}
_BAY_SUFFIX_RE = r"\s+(?:stand|stance|bay|gate)\s+[a-z0-9]{1,3}$"


def _strip_entrance_suffix(words: list[str]) -> list[str]:
    """Drop a trailing entrance designator from a tokenized stop name.

    Scans from the end over direction/number filler and entrance words; the
    run is removed only if it contains at least one entrance word. The first
    word is always kept, matching ``_ENTRANCE_SUFFIX_RE`` and ensuring a name
    such as "Main Entrance" never collapses to an empty canonical name.

    Args:
        words: Lowercased, punctuation-free tokens of the stop name.

    Returns:
        The tokens without the entrance suffix, or *words* unchanged when no
        entrance word was found in the trailing run.
    """
    idx = len(words)
    saw_entrance = False
    # Stop at 1, not 0: the leading word is never part of the suffix.
    while idx > 1:
        word = words[idx - 1]
        if word in _ENTRANCE_NAME_WORDS:
            saw_entrance = True
        elif not (word.isdigit() or word in _ENTRANCE_FILLER_WORDS):
            break
        idx -= 1
    return words[:idx] if saw_entrance else words
|
||||
|
||||
|
||||
def canonical_station_name(name: str | None) -> str:
|
||||
"""Normalize station names so entrances/transport-mode variants collapse."""
|
||||
|
|
@ -55,18 +150,24 @@ def canonical_station_name(name: str | None) -> str:
|
|||
normalized = re.sub(r"['’`]", "", normalized)
|
||||
normalized = normalized.replace("&", " and ")
|
||||
normalized = re.sub(r"[^a-z0-9]+", " ", normalized)
|
||||
words = normalized.split()
|
||||
words = _strip_entrance_suffix(normalized.split())
|
||||
|
||||
if len(words) >= 3 and words[-2] in _BAY_WORDS and len(words[-1]) <= 3:
|
||||
del words[-2:]
|
||||
|
||||
suffixes = (
|
||||
("underground", "station"),
|
||||
("tube", "station"),
|
||||
("dlr", "station"),
|
||||
("metro", "station"),
|
||||
("metrolink", "station"),
|
||||
("metrolink", "stop"),
|
||||
("tram", "stop"),
|
||||
("rail", "station"),
|
||||
("railway", "station"),
|
||||
("station",),
|
||||
("stop",),
|
||||
("metrolink",),
|
||||
)
|
||||
while True:
|
||||
suffix = next(
|
||||
|
|
@ -88,11 +189,14 @@ def canonical_station_name_expr(name_col: str = "name") -> pl.Expr:
|
|||
expr = expr.str.replace_all(r"&", " and ")
|
||||
expr = expr.str.replace_all(r"[^a-z0-9]+", " ")
|
||||
expr = expr.str.replace_all(r"\s+", " ").str.strip_chars()
|
||||
expr = expr.str.replace_all(_ENTRANCE_SUFFIX_RE, "")
|
||||
expr = expr.str.replace_all(_BAY_SUFFIX_RE, "")
|
||||
expr = expr.str.replace_all(
|
||||
r"\s+(underground|tube|dlr|metro|rail|railway)\s+station$", ""
|
||||
r"\s+(underground|tube|dlr|metro|metrolink|rail|railway)\s+station$", ""
|
||||
)
|
||||
expr = expr.str.replace_all(r"\s+tram\s+stop$", "")
|
||||
expr = expr.str.replace_all(r"\s+(metrolink|tram)\s+stop$", "")
|
||||
expr = expr.str.replace_all(r"\s+(station|stop)$", "")
|
||||
expr = expr.str.replace_all(r"\s+metrolink$", "")
|
||||
return expr.str.strip_chars()
|
||||
|
||||
|
||||
|
|
@ -140,6 +244,7 @@ class StationAccumulator:
|
|||
lat_sum: float
|
||||
lng_sum: float
|
||||
entrance: bool = False
|
||||
is_lu: bool = False
|
||||
count: int = 1
|
||||
|
||||
@property
|
||||
|
|
@ -159,6 +264,7 @@ class StationAccumulator:
|
|||
self.lat_sum += float(row["lat"])
|
||||
self.lng_sum += float(row["lng"])
|
||||
self.count += 1
|
||||
self.is_lu = self.is_lu or bool(row.get("is_lu"))
|
||||
|
||||
name = str(row["name"] or "")
|
||||
entrance = bool(row.get("entrance"))
|
||||
|
|
@ -169,6 +275,16 @@ class StationAccumulator:
|
|||
self.name = name
|
||||
self.entrance = entrance
|
||||
|
||||
@property
def output_category(self) -> str:
    """Category to emit for this merged station.

    A merged tram/metro station is reported as a Tube station when ANY of
    its constituent nodes carried a London Underground ATCO code. The whole
    group is considered (not just the winning node) because LU entrance
    nodes often carry non-ZZLU codes (e.g. 4900VICT...).
    """
    is_underground = self.category == TRAM_METRO_CATEGORY and self.is_lu
    return TUBE_STATION_CATEGORY if is_underground else self.category
|
||||
|
||||
|
||||
def _station_from_row(row: dict[str, object]) -> StationAccumulator:
|
||||
return StationAccumulator(
|
||||
|
|
@ -178,6 +294,7 @@ def _station_from_row(row: dict[str, object]) -> StationAccumulator:
|
|||
lat_sum=float(row["lat"]),
|
||||
lng_sum=float(row["lng"]),
|
||||
entrance=bool(row.get("entrance")),
|
||||
is_lu=bool(row.get("is_lu")),
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -217,7 +334,7 @@ def _deduplicate_station_areas(df: pl.DataFrame) -> pl.DataFrame:
|
|||
{
|
||||
"id": [station.id for station in selected],
|
||||
"name": [station.name for station in selected],
|
||||
"category": [station.category for station in selected],
|
||||
"category": [station.output_category for station in selected],
|
||||
"lat": [station.lat for station in selected],
|
||||
"lng": [station.lng for station in selected],
|
||||
}
|
||||
|
|
@ -258,10 +375,12 @@ def _deduplicate_local_stops(df: pl.DataFrame) -> pl.DataFrame:
|
|||
def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
|
||||
"""Deduplicate NaPTAN stops, merging station/terminal entrances by area.
|
||||
|
||||
Tube, rail and ferry POIs are merged to one record per station by
|
||||
normalized name + area, with the primary station/terminal node (e.g. RLY,
|
||||
FER) winning over an entrance node (RSE, FTD). Other stops are deduplicated
|
||||
by exact name+category+locality.
|
||||
Tram/metro, rail, ferry and bus-station POIs are merged to one record per
|
||||
station by normalized name + area, with the primary station/terminal node
|
||||
(e.g. RLY, FER, MET, BST) winning over an entrance node (RSE, FTD, TMU,
|
||||
BCE). Merged tram/metro stations with a London Underground ATCO code in
|
||||
the group become "Tube station". Other stops are deduplicated by exact
|
||||
name+category+locality.
|
||||
"""
|
||||
station = df.filter(pl.col("category").is_in(list(STATION_MERGE_CATEGORIES)))
|
||||
other = df.filter(~pl.col("category").is_in(list(STATION_MERGE_CATEGORIES)))
|
||||
|
|
@ -274,6 +393,29 @@ def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
|
|||
).select(OUTPUT_COLUMNS)
|
||||
|
||||
|
||||
def filter_active_stops(df: pl.DataFrame) -> pl.DataFrame:
    """Return only the rows of *df* whose NaPTAN ``Status`` is active.

    A row is kept when ``Status`` is null (benefit of the doubt) or, after
    trimming and lowercasing, equals "active" or "act". When the frame has
    no ``Status`` column at all (older extracts), a warning is printed and
    the frame is returned untouched. Without this filter closed stations
    ("(closed)", "not in use") would ship as live POIs.
    """
    if "Status" not in df.columns:
        print("WARNING: NaPTAN data has no Status column; keeping all stops")
        return df

    status = pl.col("Status")
    normalized = status.str.strip_chars().str.to_lowercase()
    keep = status.is_null() | normalized.is_in(["active", "act"])

    active = df.filter(keep)
    dropped = len(df) - len(active)
    if dropped:
        print(f"Dropped {dropped:,} non-active stops (Status != active)")
    return active
|
||||
|
||||
|
||||
def download_naptan(output: Path) -> None:
|
||||
output.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
|
@ -291,15 +433,19 @@ def download_naptan(output: Path) -> None:
|
|||
)
|
||||
.drop_nulls(subset=["Latitude", "Longitude"])
|
||||
.filter(pl.col("StopType").is_in(list(STOP_TYPES.keys())))
|
||||
.select(
|
||||
pl.col("ATCOCode").alias("id"),
|
||||
pl.col("CommonName").alias("name"),
|
||||
pl.col("StopType").replace(STOP_TYPES).alias("category"),
|
||||
pl.col("Latitude").alias("lat"),
|
||||
pl.col("Longitude").alias("lng"),
|
||||
pl.col("NptgLocalityCode").alias("locality"),
|
||||
pl.col("StopType").is_in(list(ENTRANCE_STOP_TYPES)).alias("entrance"),
|
||||
)
|
||||
)
|
||||
df = filter_active_stops(df).select(
|
||||
pl.col("ATCOCode").alias("id"),
|
||||
pl.col("CommonName").alias("name"),
|
||||
pl.col("StopType").replace(STOP_TYPES).alias("category"),
|
||||
pl.col("Latitude").alias("lat"),
|
||||
pl.col("Longitude").alias("lng"),
|
||||
pl.col("NptgLocalityCode").alias("locality"),
|
||||
pl.col("StopType").is_in(list(ENTRANCE_STOP_TYPES)).alias("entrance"),
|
||||
pl.col("ATCOCode")
|
||||
.str.contains(LONDON_UNDERGROUND_ATCO_PATTERN)
|
||||
.fill_null(False)
|
||||
.alias("is_lu"),
|
||||
)
|
||||
|
||||
before = len(df)
|
||||
|
|
|
|||
|
|
@ -2,12 +2,15 @@
|
|||
|
||||
Downloads the OS Open Greenspace dataset as ESRI Shapefile and extracts
|
||||
access point locations (park entrances). Each access point is tagged with
|
||||
its parent site's function type (e.g. Public Park Or Garden). Sites without
|
||||
access points fall back to polygon centroids.
|
||||
its parent site's function type (e.g. Public Park Or Garden), the parent
|
||||
site id and the site's polygon centroid. Sites without access points fall
|
||||
back to polygon centroids.
|
||||
|
||||
Using access points rather than polygon centroids gives much more accurate
|
||||
distance calculations — a property next to Hyde Park won't show 400m just
|
||||
because the centroid is in the middle of the park.
|
||||
because the centroid is in the middle of the park. The site id / centroid
|
||||
columns let downstream consumers (poi_proximity) collapse the frame back to
|
||||
one row per SITE for counting, so a park with 30 gates counts as one park.
|
||||
|
||||
Source: https://osdatahub.os.uk/downloads/open/OpenGreenspace
|
||||
License: Open Government Licence v3.0
|
||||
|
|
@ -65,8 +68,8 @@ def _read_site_functions(shp_path: Path) -> dict[str, str]:
|
|||
|
||||
def _read_access_points(
|
||||
shp_path: Path, site_funcs: dict[str, str]
|
||||
) -> tuple[list[float], list[float], list[str]]:
|
||||
"""Read access points, tagging each with its parent site's function."""
|
||||
) -> tuple[list[float], list[float], list[str], list[str]]:
|
||||
"""Read access points, tagging each with its parent site's function and id."""
|
||||
reader = shp.Reader(str(shp_path), encoding="latin-1")
|
||||
field_names = [f[0] for f in reader.fields[1:]]
|
||||
|
||||
|
|
@ -80,6 +83,7 @@ def _read_access_points(
|
|||
lats: list[float] = []
|
||||
lngs: list[float] = []
|
||||
categories: list[str] = []
|
||||
site_ids: list[str] = []
|
||||
skipped = 0
|
||||
error_skipped = 0
|
||||
|
||||
|
|
@ -107,6 +111,7 @@ def _read_access_points(
|
|||
lats.append(lat)
|
||||
lngs.append(lng)
|
||||
categories.append(func)
|
||||
site_ids.append(str(site_id))
|
||||
|
||||
if skipped:
|
||||
print(f" Skipped {skipped:,} access points with unknown site ID")
|
||||
|
|
@ -116,31 +121,26 @@ def _read_access_points(
|
|||
error_skipped,
|
||||
)
|
||||
|
||||
return lats, lngs, categories
|
||||
return lats, lngs, categories, site_ids
|
||||
|
||||
|
||||
def _read_site_centroids(
|
||||
shp_path: Path, site_funcs: dict[str, str], covered_ids: set[str]
|
||||
) -> tuple[list[float], list[float], list[str]]:
|
||||
"""Read polygon centroids for sites that have no access points (fallback)."""
|
||||
def _read_site_centroids(shp_path: Path) -> dict[str, tuple[float, float]]:
|
||||
"""Compute the WGS84 polygon centroid of every greenspace site.
|
||||
|
||||
Used both as the representative point for site-level counting and as the
|
||||
location fallback for sites that have no access points.
|
||||
"""
|
||||
reader = shp.Reader(str(shp_path), encoding="latin-1")
|
||||
field_names = [f[0] for f in reader.fields[1:]]
|
||||
id_idx = _find_field(field_names, "id")
|
||||
func_idx = _find_field(field_names, "funct")
|
||||
if id_idx is None or func_idx is None:
|
||||
return [], [], []
|
||||
if id_idx is None:
|
||||
return {}
|
||||
|
||||
lats: list[float] = []
|
||||
lngs: list[float] = []
|
||||
categories: list[str] = []
|
||||
centroids: dict[str, tuple[float, float]] = {}
|
||||
error_skipped = 0
|
||||
|
||||
for sr in reader.shapeRecords():
|
||||
site_id = sr.record[id_idx]
|
||||
if site_id in covered_ids:
|
||||
continue
|
||||
|
||||
func = sr.record[func_idx]
|
||||
try:
|
||||
geom = to_shapely(sr.shape.__geo_interface__)
|
||||
if geom.is_empty or not geom.is_valid:
|
||||
|
|
@ -156,9 +156,7 @@ def _read_site_centroids(
|
|||
)
|
||||
continue
|
||||
|
||||
lats.append(lat)
|
||||
lngs.append(lng)
|
||||
categories.append(func)
|
||||
centroids[str(site_id)] = (lat, lng)
|
||||
|
||||
if error_skipped:
|
||||
logger.warning(
|
||||
|
|
@ -166,7 +164,7 @@ def _read_site_centroids(
|
|||
error_skipped,
|
||||
)
|
||||
|
||||
return lats, lngs, categories
|
||||
return centroids
|
||||
|
||||
|
||||
def download_greenspace(output: Path) -> None:
|
||||
|
|
@ -194,33 +192,53 @@ def download_greenspace(output: Path) -> None:
|
|||
|
||||
# Step 2: Read access points (primary — park entrances)
|
||||
print(f"Reading {access_shps[0].name}...")
|
||||
ap_lats, ap_lngs, ap_cats = _read_access_points(access_shps[0], site_funcs)
|
||||
ap_lats, ap_lngs, ap_cats, ap_site_ids = _read_access_points(
|
||||
access_shps[0], site_funcs
|
||||
)
|
||||
print(f" {len(ap_lats):,} access points loaded")
|
||||
|
||||
# Step 3: Fall back to centroids for sites without any access points
|
||||
covered_ids = set()
|
||||
reader = shp.Reader(str(access_shps[0]), encoding="latin-1")
|
||||
field_names = [f[0] for f in reader.fields[1:]]
|
||||
ref_idx = _find_field(field_names, "refto", "ref_to", "greensp")
|
||||
if ref_idx is not None:
|
||||
for rec in reader.iterRecords():
|
||||
covered_ids.add(rec[ref_idx])
|
||||
# Step 3: Compute every site's centroid: the representative point for
|
||||
# site-level counting, and the location fallback for sites without any
|
||||
# access points.
|
||||
print("Computing site centroids...")
|
||||
centroids = _read_site_centroids(site_shps[0])
|
||||
print(f" {len(centroids):,} site centroids computed")
|
||||
|
||||
print("Adding centroids for sites without access points...")
|
||||
fb_lats, fb_lngs, fb_cats = _read_site_centroids(
|
||||
site_shps[0], site_funcs, covered_ids
|
||||
)
|
||||
covered_ids = set(ap_site_ids)
|
||||
fb_lats: list[float] = []
|
||||
fb_lngs: list[float] = []
|
||||
fb_cats: list[str] = []
|
||||
fb_site_ids: list[str] = []
|
||||
for site_id, (lat, lng) in centroids.items():
|
||||
if site_id in covered_ids:
|
||||
continue
|
||||
func = site_funcs.get(site_id)
|
||||
if func is None:
|
||||
continue
|
||||
fb_lats.append(lat)
|
||||
fb_lngs.append(lng)
|
||||
fb_cats.append(func)
|
||||
fb_site_ids.append(site_id)
|
||||
print(f" {len(fb_lats):,} centroid fallbacks added")
|
||||
|
||||
lats = ap_lats + fb_lats
|
||||
lngs = ap_lngs + fb_lngs
|
||||
categories = ap_cats + fb_cats
|
||||
site_ids = ap_site_ids + fb_site_ids
|
||||
site_lats = [centroids.get(site_id, (None, None))[0] for site_id in site_ids]
|
||||
site_lngs = [centroids.get(site_id, (None, None))[1] for site_id in site_ids]
|
||||
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
"lat": np.array(lats, dtype=np.float64),
|
||||
"lng": np.array(lngs, dtype=np.float64),
|
||||
"category": categories,
|
||||
"site_id": site_ids,
|
||||
# Site polygon centroid (null when the centroid could not be
|
||||
# computed): the representative point when collapsing to one row
|
||||
# per site for counting.
|
||||
"site_lat": pl.Series(site_lats, dtype=pl.Float64),
|
||||
"site_lng": pl.Series(site_lngs, dtype=pl.Float64),
|
||||
}
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -641,7 +641,7 @@ def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
|
|||
match = _DLR_CODE_RE.search(atco_id)
|
||||
if not match:
|
||||
continue
|
||||
if row["category"] not in {"Tube station", "Rail station"}:
|
||||
if row["category"] not in {"Tube station", "Tram & Metro stop", "Rail station"}:
|
||||
continue
|
||||
|
||||
code = match.group(1)
|
||||
|
|
|
|||
|
|
@ -2,9 +2,12 @@ import polars as pl
|
|||
import pytest
|
||||
|
||||
from pipeline.download.naptan import (
|
||||
TRAM_METRO_CATEGORY,
|
||||
TUBE_STATION_CATEGORY,
|
||||
canonical_station_name,
|
||||
canonical_station_name_expr,
|
||||
deduplicate_naptan,
|
||||
filter_active_stops,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -34,6 +37,127 @@ def test_canonical_station_name_expr_normalizes_transport_suffixes():
|
|||
assert [canonical_station_name(name) for name in names] == result
|
||||
|
||||
|
||||
def test_canonical_station_name_strips_entrance_suffixes():
    """Entrance, bay and stand suffixes collapse to the bare station key.

    The same table is checked against both the scalar helper and the polars
    expression so the two implementations cannot drift apart.
    """
    # Real shipped NaPTAN entrance names that previously failed to merge with
    # their station node (79 stray entrance POIs).
    expected_by_name = {
        "Weaste Metrolink Station North East Entrance": "weaste",
        "Weaste Metrolink Station North Entrance No 2": "weaste",
        "Whitefield Metrolink Station Main Entrance": "whitefield",
        "Radcliffe Metrolink Station Entrance": "radcliffe",
        "Stretford Metrolink Station Wt Platform Entrance": "stretford",
        "Salford Quays Metrolink Station SW entrance": "salford quays",
        "Bank Station Ent 2": "bank",
        "Hainault": "hainault",
        # The Metrolink MET node names collapse to the same key.
        "Weaste (Manchester Metrolink)": "weaste",
        # No entrance word: direction/filler words must NOT be stripped.
        "Maze Hill North": "maze hill north",
        "Bus Station Entrance": "bus",
        # Bus-station bay/stand designators collapse to the station name…
        "Tonypandy Bus Station Stand A3": "tonypandy bus",
        "Caerphilly Interchange Stand 5": "caerphilly interchange",
        "Stanley Bus Station Stand G": "stanley bus",
        # …but a bare trailing "Bay" (place names) is untouched.
        "Colwyn Bay": "colwyn bay",
    }
    raw_names = list(expected_by_name)
    wanted_keys = [expected_by_name[raw] for raw in raw_names]

    # Scalar helper: report the offending input name on failure.
    for raw, wanted in zip(raw_names, wanted_keys):
        assert canonical_station_name(raw) == wanted, raw

    # Vectorised expression over the same inputs, in the same order.
    frame = pl.DataFrame({"name": raw_names})
    keyed = frame.select(canonical_station_name_expr().alias("key"))
    assert keyed["key"].to_list() == wanted_keys
|
||||
|
||||
|
||||
def test_filter_active_stops_drops_non_active():
    """Active and unknown (null) statuses survive; inactive/pending are dropped."""
    status_by_code = {
        "a": "active",
        "b": "inactive",
        "c": None,
        "d": "Pending",
    }
    stops = pl.DataFrame(
        {
            "ATCOCode": list(status_by_code),
            "Status": list(status_by_code.values()),
        }
    )

    surviving_codes = filter_active_stops(stops)["ATCOCode"].to_list()

    assert surviving_codes == ["a", "c"]
|
||||
|
||||
|
||||
def test_filter_active_stops_tolerates_missing_status_column():
    """A frame with no Status column keeps its rows instead of raising."""
    stops = pl.DataFrame({"ATCOCode": ["a"]})

    surviving = filter_active_stops(stops)

    assert surviving["ATCOCode"].to_list() == ["a"]
|
||||
|
||||
|
||||
def test_deduplicate_naptan_splits_london_underground_from_tram_metro():
    """Merged groups containing an LU station node become tube stations.

    MET station nodes plus TMU entrances arrive pre-categorised as the
    tram/metro family. The Hainault group contains a 940GZZLU station node, so
    the merged POI is a genuine "Tube station" even though its entrance carries
    a non-ZZLU ATCO code; the Metrolink group stays "Tram & Metro stop".
    """
    # One record per input stop: (id, name, lat, lng, entrance, is_lu).
    records = [
        ("940GZZLUHLT", "Hainault Underground Station", 51.6034, 0.0933, False, True),
        ("490000095003", "Hainault", 51.6037, 0.0931, True, False),
        ("9400ZZMAWST", "Weaste (Manchester Metrolink)", 53.4826, -2.3087, False, False),
        (
            "1800NFR2691",
            "Weaste Metrolink Station North West Entrance",
            53.4826,
            -2.3086,
            True,
            False,
        ),
    ]
    ids, names, lats, lngs, entrances, lu_flags = (
        list(column) for column in zip(*records)
    )
    stops = pl.DataFrame(
        {
            "id": ids,
            "name": names,
            "category": [TRAM_METRO_CATEGORY] * 4,
            "lat": lats,
            "lng": lngs,
            "locality": [None, None, None, None],
            "entrance": entrances,
            "is_lu": lu_flags,
        }
    )

    merged = deduplicate_naptan(stops).sort("category")

    def representative_id(category):
        # First surviving row for the given category.
        return merged.filter(pl.col("category") == category)["id"][0]

    assert len(merged) == 2
    assert merged["category"].to_list() == [
        TRAM_METRO_CATEGORY,
        TUBE_STATION_CATEGORY,
    ]
    # The station node (not the entrance) represents the merged POI.
    assert representative_id(TUBE_STATION_CATEGORY) == "940GZZLUHLT"
    assert representative_id(TRAM_METRO_CATEGORY) == "9400ZZMAWST"
|
||||
|
||||
|
||||
def test_deduplicate_naptan_merges_bus_station_bays_and_entrances():
    """Bays and entrances of one bus station collapse to a single POI.

    BCS bays and a BCE entrance of one bus station merge into one row
    represented by a non-entrance node; a different bus station in another
    area survives separately.
    """
    # One record per input stop: (id, name, lat, lng, locality, entrance).
    records = [
        ("bay-1", "Bury Interchange", 53.5907, -2.2958, "BURY", False),
        ("bay-2", "Bury Interchange", 53.5908, -2.2957, "BURY", False),
        ("ent-1", "Bury Interchange East Entrance", 53.5909, -2.2956, "BURY", True),
        ("other", "Rochdale Interchange", 53.6160, -2.1561, "ROCHDALE", False),
    ]
    ids, names, lats, lngs, localities, entrances = (
        list(column) for column in zip(*records)
    )
    stops = pl.DataFrame(
        {
            "id": ids,
            "name": names,
            "category": ["Bus station"] * 4,
            "lat": lats,
            "lng": lngs,
            "locality": localities,
            "entrance": entrances,
        }
    )

    merged = deduplicate_naptan(stops).sort("name")

    assert merged["name"].to_list() == ["Bury Interchange", "Rochdale Interchange"]
    bury = merged.filter(pl.col("name") == "Bury Interchange")
    assert bury["id"][0] == "bay-1"
|
||||
|
||||
|
||||
def test_deduplicate_naptan_merges_tube_station_variants_by_area():
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
|
|
|
|||
|
|
@ -86,7 +86,7 @@ def test_naptan_dlr_stations_are_deduplicated_by_atco_code(tmp_path):
|
|||
"Bank",
|
||||
],
|
||||
"category": [
|
||||
"Tube station",
|
||||
"Tram & Metro stop",
|
||||
"Tube station",
|
||||
"Rail station",
|
||||
"Bus stop",
|
||||
|
|
|
|||
|
|
@ -1,11 +1,15 @@
|
|||
"""Tests for transit_network GTFS processing."""
|
||||
|
||||
import datetime as dt
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pipeline.download.transit_network import convert_high_freq_to_frequency_based
|
||||
from pipeline.download.transit_network import (
|
||||
convert_high_freq_to_frequency_based,
|
||||
validate_gtfs_feed,
|
||||
)
|
||||
|
||||
|
||||
def _write_gtfs(path: Path, *, stop_times: str) -> None:
|
||||
|
|
@ -77,3 +81,162 @@ def test_raises_when_no_first_stops_found(tmp_path: Path) -> None:
|
|||
|
||||
with pytest.raises(RuntimeError, match="no first stops"):
|
||||
convert_high_freq_to_frequency_based(src, dst)
|
||||
|
||||
|
||||
# ── validate_gtfs_feed ────────────────────────────────────────────────────────
|
||||
|
||||
TODAY = dt.date(2026, 6, 10)
|
||||
|
||||
|
||||
def _make_gtfs(
|
||||
path: Path,
|
||||
*,
|
||||
calendar: str | None = (
|
||||
"service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
|
||||
"start_date,end_date\n"
|
||||
"S1,1,1,1,1,1,0,0,20260101,20271231\n"
|
||||
),
|
||||
calendar_dates: str | None = None,
|
||||
stops: str = (
|
||||
"stop_id,stop_name,stop_lat,stop_lon\n"
|
||||
"STOP_A,Bank,51.5133,-0.0886\n"
|
||||
"STOP_B,Liverpool Street,51.5178,-0.0823\n"
|
||||
),
|
||||
routes: str = "route_id,agency_id,route_short_name,route_type\nR1,OP1,Central,1\n",
|
||||
trips: str = "trip_id,route_id,service_id\nT1,R1,S1\n",
|
||||
stop_times: str = (
|
||||
"trip_id,stop_sequence,departure_time,stop_id\n"
|
||||
"T1,0,06:00:00,STOP_A\n"
|
||||
"T1,1,06:02:00,STOP_B\n"
|
||||
),
|
||||
) -> Path:
|
||||
"""Write a tiny synthetic GTFS zip; defaults form a valid current feed."""
|
||||
with zipfile.ZipFile(path, "w") as z:
|
||||
if calendar is not None:
|
||||
z.writestr("calendar.txt", calendar)
|
||||
if calendar_dates is not None:
|
||||
z.writestr("calendar_dates.txt", calendar_dates)
|
||||
z.writestr("stops.txt", stops)
|
||||
z.writestr("routes.txt", routes)
|
||||
z.writestr("trips.txt", trips)
|
||||
z.writestr("stop_times.txt", stop_times)
|
||||
return path
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_happy_path(tmp_path: Path) -> None:
    """The default synthetic feed validates without raising."""
    archive = _make_gtfs(tmp_path / "feed.zip")
    # Must not raise.
    validate_gtfs_feed(archive, "test feed", today=TODAY)
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_expired_calendar(tmp_path: Path) -> None:
    """A feed whose calendars all ended in 2010 (the old TfL snapshot) is rejected."""
    expired_calendar = (
        "service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
        "start_date,end_date\n"
        "S1,1,1,1,1,1,0,0,20091201,20101224\n"
    )
    archive = _make_gtfs(tmp_path / "feed.zip", calendar=expired_calendar)

    # The error must name the feed as well as the reason.
    with pytest.raises(RuntimeError, match=r"'stale tfl'.*no service active"):
        validate_gtfs_feed(archive, "stale tfl", today=TODAY)
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_calendar_starting_after_window_fails(
    tmp_path: Path,
) -> None:
    """A calendar that only starts in 2027 is rejected when validated as of TODAY."""
    future_calendar = (
        "service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
        "start_date,end_date\n"
        "S1,1,1,1,1,1,0,0,20270101,20271231\n"
    )
    archive = _make_gtfs(tmp_path / "feed.zip", calendar=future_calendar)

    with pytest.raises(RuntimeError, match="no service active"):
        validate_gtfs_feed(archive, "future feed", today=TODAY)
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_calendar_dates_rescues_expired_calendar(
    tmp_path: Path,
) -> None:
    """An added-service row in calendar_dates.txt makes an expired calendar.txt pass."""
    expired_calendar = (
        "service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
        "start_date,end_date\n"
        "S1,1,1,1,1,1,0,0,20091201,20101224\n"
    )
    # exception_type=1 adds service on a date shortly after TODAY.
    added_service = "service_id,date,exception_type\nS1,20260615,1\n"
    archive = _make_gtfs(
        tmp_path / "feed.zip",
        calendar=expired_calendar,
        calendar_dates=added_service,
    )

    # Must not raise.
    validate_gtfs_feed(archive, "rescued feed", today=TODAY)
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_removed_service_exception_does_not_count(
    tmp_path: Path,
) -> None:
    """A calendar_dates.txt row with exception_type=2 does not count as service."""
    removed_service = "service_id,date,exception_type\nS1,20260615,2\n"
    archive = _make_gtfs(
        tmp_path / "feed.zip",
        calendar=None,
        calendar_dates=removed_service,
    )

    with pytest.raises(RuntimeError, match="no service active"):
        validate_gtfs_feed(archive, "removed-only feed", today=TODAY)
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_zero_and_empty_coords(tmp_path: Path) -> None:
    """Stops with 0,0 or blank coordinates (as in the 2010 TfL snapshot) fail validation."""
    coordless_stops = (
        "stop_id,stop_name,stop_lat,stop_lon\n"
        "STOP_A,Nowhere,0,0\n"
        "STOP_B,Blank,,\n"
    )
    archive = _make_gtfs(tmp_path / "feed.zip", stops=coordless_stops)

    with pytest.raises(RuntimeError, match=r"plausible UK coordinates"):
        validate_gtfs_feed(archive, "coordless feed", today=TODAY)
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_non_uk_coords_fail(tmp_path: Path) -> None:
    """Well-formed coordinates outside the UK bounding box fail validation."""
    overseas_stops = (
        "stop_id,stop_name,stop_lat,stop_lon\n"
        "STOP_A,New York,40.71,-74.0\n"
        "STOP_B,Sydney,-33.87,151.21\n"
    )
    archive = _make_gtfs(tmp_path / "feed.zip", stops=overseas_stops)

    with pytest.raises(RuntimeError, match="plausible UK coordinates"):
        validate_gtfs_feed(archive, "abroad feed", today=TODAY)
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_minority_bad_coords_pass(tmp_path: Path) -> None:
    """One bad stop among 30 (3.3%) stays under the 5% tolerance, so the feed passes."""
    header = "stop_id,stop_name,stop_lat,stop_lon\n"
    # 29 plausible London stops followed by a single 0,0 stop.
    good_rows = "".join(
        f"STOP_{i},Stop {i},51.5,{-0.1 + i * 0.001}\n" for i in range(29)
    )
    bad_row = "STOP_BAD,Broken,0,0\n"
    archive = _make_gtfs(
        tmp_path / "feed.zip",
        stops=header + good_rows + bad_row,
    )

    # Must not raise.
    validate_gtfs_feed(archive, "mostly good feed", today=TODAY)
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_empty_trips(tmp_path: Path) -> None:
    """A header-only trips.txt is reported as having no data rows."""
    header_only_trips = "trip_id,route_id,service_id\n"
    archive = _make_gtfs(tmp_path / "feed.zip", trips=header_only_trips)

    with pytest.raises(RuntimeError, match="trips.txt has no data rows"):
        validate_gtfs_feed(archive, "tripless feed", today=TODAY)
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_missing_calendar_files(tmp_path: Path) -> None:
    """A feed with neither calendar.txt nor calendar_dates.txt is rejected."""
    # calendar_dates defaults to None, so dropping calendar leaves no calendar member.
    archive = _make_gtfs(tmp_path / "feed.zip", calendar=None)

    with pytest.raises(RuntimeError, match="neither calendar.txt nor calendar_dates"):
        validate_gtfs_feed(archive, "calendarless feed", today=TODAY)
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_not_a_zip(tmp_path: Path) -> None:
    """A plain-text file with a .zip name is rejected as not a valid zip."""
    fake_archive = tmp_path / "feed.zip"
    fake_archive.write_text("not a zip")

    with pytest.raises(RuntimeError, match="not a valid zip"):
        validate_gtfs_feed(fake_archive, "bogus feed", today=TODAY)
|
||||
|
|
|
|||
|
|
@ -2,24 +2,32 @@
|
|||
|
||||
Downloads:
|
||||
- England OSM PBF from Geofabrik (~1.5GB)
|
||||
- BODS GTFS from Bus Open Data Service (~1.5GB, all England bus/tram/ferry)
|
||||
- TfL TransXChange timetables → converted to GTFS
|
||||
- National Rail CIF timetable → converted to GTFS (requires credentials)
|
||||
- BODS GTFS from Bus Open Data Service (~1.5GB; all England bus/tram/ferry,
|
||||
plus London Underground, DLR, London Tramlink and the IFS Cloud Cable Car)
|
||||
- National Rail CIF timetable → converted to GTFS (requires credentials;
|
||||
includes the Elizabeth line, TOC "XR")
|
||||
|
||||
Then processes for R5 compatibility:
|
||||
- Cleans BODS GTFS (fixes stop_times >72h, feed_info year >2100)
|
||||
- Converts high-frequency metro/tram services to frequency-based GTFS
|
||||
- Converts TfL TransXChange to GTFS via transxchange2gtfs
|
||||
- Converts National Rail CIF to GTFS via dtd2mysql (requires MariaDB Docker)
|
||||
- Validates every produced GTFS zip (active calendar window, plausible UK
|
||||
stop coordinates, non-empty routes/trips/stop_times)
|
||||
|
||||
Requires: osmium-tool, Node.js (npx), Docker (for national rail)
|
||||
Note: the legacy TfL TransXChange feed (tfl.gov.uk journey-planner-timetables)
|
||||
was removed: that URL serves a 2010-10-28 snapshot whose calendars all expired
|
||||
in 2010 and whose stops have empty/0,0 coordinates, so it contributed zero
|
||||
service. BODS covers all TfL modes that feed nominally provided.
|
||||
|
||||
Requires: osmium-tool, Docker (for national rail)
|
||||
|
||||
Output directory: property-data/transit/
|
||||
raw/england.osm.pbf + bods_gtfs.zip + tfl_gtfs.zip + national_rail_gtfs.zip
|
||||
raw/england.osm.pbf + bods_gtfs.zip + national_rail_gtfs.zip
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import datetime as dt
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
|
|
@ -45,20 +53,18 @@ ENGLAND_PBF_URL = (
|
|||
# Bus Open Data Service — pre-converted GTFS covering all England bus/tram/ferry
|
||||
BODS_GTFS_URL = "https://data.bus-data.dft.gov.uk/timetable/download/gtfs-file/all/"
|
||||
|
||||
# TfL TransXChange timetables (tube, DLR, tram, buses, river bus, cable car)
|
||||
TFL_TRANSXCHANGE_URL = (
|
||||
"https://tfl.gov.uk/cdn/static/cms/documents/journey-planner-timetables.zip"
|
||||
)
|
||||
|
||||
# NaPTAN stops data — needed by transxchange2gtfs (its built-in URL is broken)
|
||||
NAPTAN_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
|
||||
|
||||
# National Rail Open Data API
|
||||
NR_AUTH_URL = "https://opendata.nationalrail.co.uk/authenticate"
|
||||
NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/timetable"
|
||||
|
||||
USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"
|
||||
TRANSXCHANGE2GTFS_PACKAGE = "transxchange2gtfs@1.12.0"
|
||||
|
||||
# GTFS validation: a feed must have service within this many days of the build
|
||||
# date, and at least this fraction of stops must have plausible UK coordinates.
|
||||
GTFS_CALENDAR_LOOKAHEAD_DAYS = 60
|
||||
GTFS_MIN_VALID_STOP_FRACTION = 0.95
|
||||
UK_LAT_RANGE = (49.0, 61.0)
|
||||
UK_LON_RANGE = (-9.0, 2.5)
|
||||
|
||||
|
||||
def _download_http(
|
||||
|
|
@ -468,89 +474,175 @@ def convert_high_freq_to_frequency_based(
|
|||
print(f" Saved to {dst}")
|
||||
|
||||
|
||||
def download_tfl_transxchange(raw_dir: Path) -> Path:
|
||||
"""Download TfL TransXChange timetable bundle."""
|
||||
dest = raw_dir / "tfl_transxchange.zip"
|
||||
if dest.exists():
|
||||
print(f"TfL TransXChange already exists: {dest}")
|
||||
return dest
|
||||
|
||||
print("Downloading TfL TransXChange timetables...")
|
||||
_download_http(TFL_TRANSXCHANGE_URL, dest, desc="tfl_transxchange.zip")
|
||||
return dest
|
||||
def _gtfs_has_data_row(z: zipfile.ZipFile, filename: str) -> bool:
    """Report whether a GTFS member has any non-empty row after its header.

    The first line is consumed as the header; every following line is passed
    through ``_parse_csv_line`` and the scan stops at the first line that
    parses to a truthy result.
    """
    with z.open(filename) as member:
        member.readline()  # discard the header row
        return any(_parse_csv_line(raw_line) for raw_line in member)
|
||||
|
||||
|
||||
def download_naptan() -> None:
|
||||
"""Download NaPTAN stops to the local temp dir for transxchange2gtfs."""
|
||||
dest = local_tmp_dir() / "Stops.csv"
|
||||
if dest.exists():
|
||||
print(f"NaPTAN Stops.csv already exists: {dest}")
|
||||
return
|
||||
def _calendar_active_in_window(
|
||||
z: zipfile.ZipFile, names: set[str], window_start: int, window_end: int
|
||||
) -> bool:
|
||||
"""True if calendar.txt/calendar_dates.txt have service in [start, end].
|
||||
|
||||
print("Downloading NaPTAN stops data...")
|
||||
_download_http(NAPTAN_URL, dest, desc="Stops.csv")
|
||||
|
||||
|
||||
def convert_tfl_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
|
||||
"""Convert TfL TransXChange to GTFS using transxchange2gtfs."""
|
||||
dest = output_dir / "tfl_gtfs.zip"
|
||||
if dest.exists():
|
||||
print(f"TfL GTFS already exists: {dest}")
|
||||
return dest
|
||||
|
||||
txc_path = raw_dir / "tfl_transxchange.zip"
|
||||
|
||||
# Ensure NaPTAN is available (transxchange2gtfs has a broken download URL)
|
||||
download_naptan()
|
||||
|
||||
print("Converting TfL TransXChange → GTFS...")
|
||||
# The shim patches known packaging/runtime issues in the pinned npm package
|
||||
# before loading its CLI from npx's temporary install.
|
||||
shim_path = Path(__file__).with_name("transxchange2gtfs_shim.js")
|
||||
subprocess.run(
|
||||
[
|
||||
"npx",
|
||||
"--yes",
|
||||
"--package",
|
||||
TRANSXCHANGE2GTFS_PACKAGE,
|
||||
"sh",
|
||||
"-c",
|
||||
"\n".join(
|
||||
[
|
||||
'bin="$(command -v transxchange2gtfs)"',
|
||||
'script="$(readlink -f "$bin")"',
|
||||
'pkg_dir="$(dirname "$(dirname "$script")")"',
|
||||
'shim="$1"',
|
||||
"shift",
|
||||
'exec node "$shim" "$pkg_dir" "$@"',
|
||||
]
|
||||
),
|
||||
"transxchange2gtfs",
|
||||
str(shim_path.resolve()),
|
||||
str(txc_path.resolve()),
|
||||
str(dest.resolve()),
|
||||
],
|
||||
check=True,
|
||||
Dates are compared as YYYYMMDD integers. A calendar.txt row counts when its
|
||||
date range overlaps the window AND at least one weekday flag is set; a
|
||||
calendar_dates.txt row counts when it adds service (exception_type=1) on a
|
||||
date inside the window.
|
||||
"""
|
||||
weekdays = (
|
||||
"monday",
|
||||
"tuesday",
|
||||
"wednesday",
|
||||
"thursday",
|
||||
"friday",
|
||||
"saturday",
|
||||
"sunday",
|
||||
)
|
||||
if "calendar.txt" in names:
|
||||
with z.open("calendar.txt") as f:
|
||||
cols = _parse_csv_line(f.readline())
|
||||
try:
|
||||
start_idx = cols.index("start_date")
|
||||
end_idx = cols.index("end_date")
|
||||
except ValueError:
|
||||
return False
|
||||
day_idxs = [cols.index(d) for d in weekdays if d in cols]
|
||||
for line in f:
|
||||
parts = _parse_csv_line(line)
|
||||
if not parts:
|
||||
continue
|
||||
try:
|
||||
start = int(parts[start_idx].strip('"'))
|
||||
end = int(parts[end_idx].strip('"'))
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
if start > window_end or end < window_start:
|
||||
continue
|
||||
if day_idxs and not any(
|
||||
parts[i].strip('"') == "1" for i in day_idxs if i < len(parts)
|
||||
):
|
||||
continue
|
||||
return True
|
||||
|
||||
if "calendar_dates.txt" in names:
|
||||
with z.open("calendar_dates.txt") as f:
|
||||
cols = _parse_csv_line(f.readline())
|
||||
try:
|
||||
date_idx = cols.index("date")
|
||||
exc_idx = cols.index("exception_type")
|
||||
except ValueError:
|
||||
return False
|
||||
for line in f:
|
||||
parts = _parse_csv_line(line)
|
||||
if not parts:
|
||||
continue
|
||||
try:
|
||||
date = int(parts[date_idx].strip('"'))
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
if exc_idx < len(parts) and parts[exc_idx].strip('"') != "1":
|
||||
continue
|
||||
if window_start <= date <= window_end:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def validate_gtfs_feed(path: Path, feed_name: str, *, today: dt.date | None = None) -> None:
    """Sanity-check a produced/downloaded GTFS zip; raise RuntimeError if dead.

    Guards against silently shipping a feed that contributes zero service (as
    the old TfL dump did: 2010 calendars, empty/0,0 stop coordinates). Checks:
      (a) calendar.txt/calendar_dates.txt have at least one service active
          within [today, today + GTFS_CALENDAR_LOOKAHEAD_DAYS];
      (b) stops.txt is non-empty and >= GTFS_MIN_VALID_STOP_FRACTION of stops
          have plausible UK coordinates (lat 49-61, lon -9..2.5, not 0,0);
      (c) routes.txt, trips.txt and stop_times.txt each have data rows.

    Args:
        path: Location of the GTFS zip to validate.
        feed_name: Human-readable feed label used in progress and error output.
        today: Start of the service window; defaults to the current date.

    Raises:
        RuntimeError: If the file is missing, is not a readable zip archive,
            or fails any of the checks above. The message names the feed, the
            path and the reason.
    """
    from typing import NoReturn

    if today is None:
        today = dt.date.today()
    # Window bounds as YYYYMMDD integers, matching the GTFS date format.
    window_start = int(today.strftime("%Y%m%d"))
    window_end = int(
        (today + dt.timedelta(days=GTFS_CALENDAR_LOOKAHEAD_DAYS)).strftime("%Y%m%d")
    )

    def fail(reason: str) -> NoReturn:
        # Always raises: NoReturn tells readers and type checkers that code
        # following a fail() call is only reached on the success path.
        raise RuntimeError(
            f"GTFS validation failed for feed '{feed_name}' ({path}): {reason}"
        )

    print(f"Validating GTFS feed '{feed_name}'...")
    if not path.exists() or not zipfile.is_zipfile(path):
        fail("not a valid zip file")

    try:
        with zipfile.ZipFile(path) as z:
            names = set(z.namelist())

            # (c) core files present and non-empty
            for required in ("routes.txt", "trips.txt", "stop_times.txt", "stops.txt"):
                if required not in names:
                    fail(f"missing {required}")
                if not _gtfs_has_data_row(z, required):
                    fail(f"{required} has no data rows")

            # (a) at least one service active in the routing window
            if "calendar.txt" not in names and "calendar_dates.txt" not in names:
                fail("has neither calendar.txt nor calendar_dates.txt")
            if not _calendar_active_in_window(z, names, window_start, window_end):
                fail(
                    f"no service active between {window_start} and {window_end} — "
                    "the feed's calendars are stale/expired and it would contribute "
                    "zero service to routing"
                )

            # (b) stops have plausible UK coordinates
            total_stops = 0
            valid_stops = 0
            with z.open("stops.txt") as f:
                cols = _parse_csv_line(f.readline())
                try:
                    lat_idx = cols.index("stop_lat")
                    lon_idx = cols.index("stop_lon")
                except ValueError:
                    fail("stops.txt is missing stop_lat/stop_lon columns")
                for line in f:
                    parts = _parse_csv_line(line)
                    if not parts:
                        continue
                    # Every non-empty row counts towards the denominator,
                    # including rows whose coordinates cannot be parsed.
                    total_stops += 1
                    try:
                        lat = float(parts[lat_idx].strip('"'))
                        lon = float(parts[lon_idx].strip('"'))
                    except (ValueError, IndexError):
                        continue  # empty/garbage coordinate → invalid
                    if lat == 0.0 and lon == 0.0:
                        continue
                    if (
                        UK_LAT_RANGE[0] <= lat <= UK_LAT_RANGE[1]
                        and UK_LON_RANGE[0] <= lon <= UK_LON_RANGE[1]
                    ):
                        valid_stops += 1
            if total_stops == 0:
                fail("stops.txt has no stops")
            fraction = valid_stops / total_stops
            if fraction < GTFS_MIN_VALID_STOP_FRACTION:
                fail(
                    f"only {valid_stops}/{total_stops} stops "
                    f"({fraction:.1%}) have plausible UK coordinates "
                    f"(lat {UK_LAT_RANGE[0]}-{UK_LAT_RANGE[1]}, "
                    f"lon {UK_LON_RANGE[0]}..{UK_LON_RANGE[1]}, non-null, not 0,0); "
                    f"need >= {GTFS_MIN_VALID_STOP_FRACTION:.0%}"
                )
    except zipfile.BadZipFile as exc:
        # is_zipfile() only looks for the end-of-central-directory record, so
        # a truncated or corrupt archive can still fail on open or on reading
        # a member. Report it through the same RuntimeError contract.
        raise RuntimeError(
            f"GTFS validation failed for feed '{feed_name}' ({path}): "
            f"not a valid zip file ({exc})"
        ) from exc

    print(
        f"  OK: service active in window, {valid_stops}/{total_stops} stops "
        f"({fraction:.1%}) with plausible UK coordinates"
    )
|
||||
required_files = {
|
||||
"agency.txt",
|
||||
"calendar.txt",
|
||||
"calendar_dates.txt",
|
||||
"routes.txt",
|
||||
"stop_times.txt",
|
||||
"stops.txt",
|
||||
"trips.txt",
|
||||
}
|
||||
if not dest.exists() or not zipfile.is_zipfile(dest):
|
||||
raise RuntimeError(f"transxchange2gtfs did not create a valid GTFS zip: {dest}")
|
||||
with zipfile.ZipFile(dest) as z:
|
||||
missing = required_files - set(z.namelist())
|
||||
if missing:
|
||||
missing_str = ", ".join(sorted(missing))
|
||||
raise RuntimeError(f"TfL GTFS zip is missing required files: {missing_str}")
|
||||
size_mb = dest.stat().st_size / (1024 * 1024)
|
||||
print(f" Saved to {dest} ({size_mb:.1f} MB)")
|
||||
return dest
|
||||
|
||||
|
||||
def download_national_rail_cif(raw_dir: Path) -> Path | None:
|
||||
|
|
@ -1007,18 +1099,15 @@ def main() -> None:
|
|||
required=True,
|
||||
help="Output directory for transit data",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-tfl",
|
||||
action="store_true",
|
||||
help="Skip TfL TransXChange download and conversion",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
output_dir: Path = args.output
|
||||
raw_dir = output_dir / "raw"
|
||||
raw_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# 1. Download, clean, and frequency-convert BODS GTFS
|
||||
# 1. Download, clean, and frequency-convert BODS GTFS. BODS covers all
|
||||
# England bus/tram/ferry plus London Underground, DLR, London Tramlink and
|
||||
# the IFS Cloud Cable Car, so no separate TfL feed is needed.
|
||||
download_osm_pbf(raw_dir)
|
||||
bods_raw = download_bods_gtfs(raw_dir)
|
||||
|
||||
|
|
@ -1027,16 +1116,10 @@ def main() -> None:
|
|||
|
||||
bods_final = output_dir / "bods_gtfs.zip"
|
||||
convert_high_freq_to_frequency_based(bods_cleaned, bods_final)
|
||||
validate_gtfs_feed(bods_final, "BODS GTFS")
|
||||
|
||||
# 2. TfL TransXChange → GTFS
|
||||
if args.skip_tfl:
|
||||
print("Skipping TfL (--skip-tfl)")
|
||||
else:
|
||||
download_tfl_transxchange(raw_dir)
|
||||
convert_tfl_to_gtfs(raw_dir, output_dir)
|
||||
|
||||
# 3. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
|
||||
# reach the ~2,725 railway-station destinations, so a bus/TfL-only network
|
||||
# 2. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
|
||||
# reach the ~2,725 railway-station destinations, so a bus/metro-only network
|
||||
# silently overstates every train commute. Missing credentials are a HARD
|
||||
# error, so a rail-less network can never ship.
|
||||
cif = download_national_rail_cif(raw_dir)
|
||||
|
|
@ -1048,7 +1131,8 @@ def main() -> None:
|
|||
"required; without it the transit network models every train journey "
|
||||
"as bus-only and overstates commute times."
|
||||
)
|
||||
convert_national_rail_to_gtfs(raw_dir, output_dir)
|
||||
nr_final = convert_national_rail_to_gtfs(raw_dir, output_dir)
|
||||
validate_gtfs_feed(nr_final, "National Rail GTFS")
|
||||
|
||||
# Summary
|
||||
print()
|
||||
|
|
|
|||
|
|
@ -1,106 +0,0 @@
|
|||
#!/usr/bin/env node
|
||||
"use strict";
|
||||
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
const { createRequire } = require("module");
|
||||
|
||||
// Command line: <package-dir> followed by the converter's own arguments
// (one or more inputs, then the output).
const cliArgs = process.argv.slice(2);
const pkgDirArg = cliArgs[0];
const converterArgs = cliArgs.slice(1);

const hasEnoughArgs = Boolean(pkgDirArg) && converterArgs.length >= 2;
if (!hasEnoughArgs) {
  console.error(
    "Usage: transxchange2gtfs_shim.js <package-dir> <input...> <output>",
  );
  process.exit(2);
}

const pkgDir = path.resolve(pkgDirArg);

// Scratch directory: first non-empty of TMPDIR / TEMP / TMP, otherwise a
// `.tmp` directory two levels above this script.
const defaultTmpDir = path.resolve(__dirname, "..", "..", ".tmp");
const localTmpDir =
  [process.env.TMPDIR, process.env.TEMP, process.env.TMP].find(Boolean) ||
  defaultTmpDir;

// Paths substituted for the converter's hard-coded /tmp locations.
const stopsCsv = path.join(localTmpDir, "Stops.csv");
const converterTmpPrefix = path.join(localTmpDir, "transxchange2gtfs_");

// Source text that replaces the converter's `static TMP = ...` declaration;
// the per-process suffix is still computed at runtime inside the converter.
const converterTmpPatch = `static TMP = ${JSON.stringify(converterTmpPrefix)} + process.pid + ${JSON.stringify(path.sep)};`;

fs.mkdirSync(localTmpDir, { recursive: true });
|
||||
|
||||
/**
 * Apply a single idempotent text patch to a file inside the package directory.
 *
 * - If `before` occurs in the file, its first occurrence is replaced by
 *   `after` and the file is rewritten.
 * - If `before` is absent but `after` is present, the file is assumed to be
 *   already patched and is left untouched.
 * - Otherwise the package does not look like the expected version and an
 *   Error is thrown.
 *
 * @param {string} relativePath File path relative to the package directory.
 * @param {string} before Exact text expected in the unpatched file.
 * @param {string} after Exact text that must replace it.
 * @throws {Error} When neither `before` nor `after` is found.
 */
function replaceOnce(relativePath, before, after) {
  const file = path.join(pkgDir, relativePath);
  const original = fs.readFileSync(file, "utf8");

  if (original.includes(before)) {
    // Use a replacer function so `after` is inserted verbatim. A plain string
    // replacement would expand `$&`, `$'`, `` $` `` and `$$`, and `after` is
    // built from filesystem paths that may legitimately contain `$`.
    fs.writeFileSync(file, original.replace(before, () => after));
    return;
  }

  if (!original.includes(after)) {
    throw new Error(`Could not patch ${relativePath}: expected text not found`);
  }
}
|
||||
|
||||
/**
 * Patch the installed converter package in place.
 *
 * The published 1.12.0 package has a few compatibility issues with current
 * TfL TransXChange exports:
 * - the bin script points at dist/src/cli.js, but the package ships dist/cli.js
 * - the compiled date-holidays import expects a synthetic default export
 * - some TfL journeys reference timing links without matching route-link geometry
 *
 * GTFS shapes are optional for R5 routing. Shape references are cleared and
 * shapes.txt is omitted so missing route geometry does not drop otherwise
 * usable trips.
 *
 * Patches are applied strictly in the order listed: the last Container.js
 * patch matches text produced by the one before it.
 */
function patchPackage() {
  // Each entry: [file relative to the package, text to find, replacement].
  const patches = [
    // Redirect the converter's hard-coded /tmp paths to the local tmp dir.
    [
      "dist/Container.js",
      "static TMP = `/tmp/transxchange2gtfs_${process.pid}/`;",
      converterTmpPatch,
    ],
    [
      "dist/Container.js",
      'fs.existsSync("/tmp/Stops.csv")',
      `fs.existsSync(${JSON.stringify(stopsCsv)})`,
    ],
    [
      "dist/Container.js",
      'fs.createReadStream("/tmp/Stops.csv", "utf8")',
      `fs.createReadStream(${JSON.stringify(stopsCsv)}, "utf8")`,
    ],
    [
      "dist/converter/GetStopData.js",
      'fs.createWriteStream("/tmp/Stops.csv")',
      `fs.createWriteStream(${JSON.stringify(stopsCsv)})`,
    ],
    // Tolerate timing links that have no matching route link.
    [
      "dist/transxchange/TransXChangeJourneyStream.js",
      "distanceSoFarM += routeLink.Distance;",
      "distanceSoFarM += routeLink ? routeLink.Distance : 0;",
    ],
    // Blank out shape ids and shape distances in the emitted rows.
    [
      "dist/gtfs/TripsStream.js",
      "(0, crypto_1.createHash)('md5').update(JSON.stringify({ routeId: journey.route, routeLinkSeq: journey.routeLinkIds })).digest(\"hex\"));",
      "\"\");",
    ],
    [
      "dist/gtfs/StopTimesStream.js",
      "stop.shapeDistTraveled, stop.exactTime ? \"1\" : \"0\");",
      "\"\", stop.exactTime ? \"1\" : \"0\");",
    ],
    // Drop the shapes.txt output stream, then the transfers.txt one.
    [
      "dist/Container.js",
      "\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex)),\n \"shapes.txt\": journeyStream.pipe(new ShapesStream_1.ShapesStream())",
      "\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
    ],
    [
      "dist/Container.js",
      "\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n \"transfers.txt\": transxchange.pipe(new TransfersStream_1.TransfersStream(naptanIndex, locationIndex)),\n \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
      "\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
    ],
  ];

  for (const [relativePath, before, after] of patches) {
    replaceOnce(relativePath, before, after);
  }
}
|
||||
|
||||
// Patch the package on disk before any of its modules are loaded.
patchPackage();

// Resolve date-holidays from the package's own dependency tree and give it the
// synthetic `default` export that the compiled converter expects.
const requireFromPackage = createRequire(path.join(pkgDir, "package.json"));
const holidaysModule = requireFromPackage("date-holidays");
if (!holidaysModule.default) {
  holidaysModule.default = holidaysModule;
}

// Hand over to the converter's CLI with an argv that looks like a direct
// `transxchange2gtfs <input...> <output>` invocation.
const cliEntry = path.join(pkgDir, "dist", "cli.js");
process.argv = [process.argv[0], "transxchange2gtfs"].concat(converterArgs);
require(cliEntry);
|
||||
|
|
@ -109,6 +109,27 @@ def _clean_number(column: str, dtype: pl.DataType) -> pl.Expr:
|
|||
return _clean_string(column).cast(dtype, strict=False)
|
||||
|
||||
|
||||
def _join_address_parts(*columns: str) -> pl.Expr:
    """Build a single-spaced display address from several address columns.

    Price-paid SAON/PAON/STREET hold EMPTY STRINGS (not nulls) when a part is
    absent, and ``concat_str(..., ignore_nulls=True)`` only skips nulls, so an
    empty part would still contribute its separator (a leading space, or a
    doubled space when a middle part is empty). Each component therefore goes
    through ``_clean_string`` first (relied on to turn ``''`` into null), and
    the joined text is then defensively whitespace-collapsed and stripped.

    Args:
        *columns: Names of the address component columns, in display order.

    Returns:
        An expression yielding the joined address, or null when nothing is
        left after cleaning (so a downstream ``is_not_null()`` filter drops
        the row instead of keeping whitespace junk).
    """
    components = [_clean_string(name) for name in columns]
    address = pl.concat_str(components, separator=" ", ignore_nulls=True)
    # Belt and braces: collapse any whitespace runs that survived, then trim.
    address = address.str.replace_all(r"\s+", " ")
    address = address.str.strip_chars()
    is_blank = address == ""
    return pl.when(is_blank).then(None).otherwise(address)
|
||||
|
||||
|
||||
def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
|
||||
return (
|
||||
raw.select(
|
||||
|
|
@ -436,11 +457,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
)
|
||||
.filter(pl.col("pp_property_type") != "Other")
|
||||
.with_columns(
|
||||
pl.concat_str(
|
||||
[pl.col("saon"), pl.col("paon"), pl.col("street")],
|
||||
separator=" ",
|
||||
ignore_nulls=True,
|
||||
).alias("pp_address"),
|
||||
_join_address_parts("saon", "paon", "street").alias("pp_address"),
|
||||
)
|
||||
.with_columns(
|
||||
normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
|
||||
|
|
|
|||
|
|
@ -102,15 +102,11 @@ _AREA_COLUMNS = [
|
|||
# is postcode-grain: it belongs in the area output (one value per postcode,
|
||||
# covering property-less postcodes too) rather than duplicated per property.
|
||||
TREE_DENSITY_FEATURE,
|
||||
# Schools
|
||||
"Good+ primary schools within 5km",
|
||||
"Good+ secondary schools within 5km",
|
||||
"Good+ primary schools within 2km",
|
||||
"Good+ secondary schools within 2km",
|
||||
"Outstanding primary schools within 5km",
|
||||
"Outstanding secondary schools within 5km",
|
||||
"Outstanding primary schools within 2km",
|
||||
"Outstanding secondary schools within 2km",
|
||||
# Schools (modelled historical catchment areas covering the postcode)
|
||||
"Good+ primary school catchments",
|
||||
"Good+ secondary school catchments",
|
||||
"Outstanding primary school catchments",
|
||||
"Outstanding secondary school catchments",
|
||||
# Demographics
|
||||
"Median age",
|
||||
# Politics
|
||||
|
|
@ -172,14 +168,10 @@ _FINAL_RENAME_COLUMNS = {
|
|||
"latest_price": "Last known price",
|
||||
"number_habitable_rooms": "Number of bedrooms & living rooms",
|
||||
"noise_lden_db": "Noise (dB)",
|
||||
"good_primary_5km": "Good+ primary schools within 5km",
|
||||
"good_secondary_5km": "Good+ secondary schools within 5km",
|
||||
"good_primary_2km": "Good+ primary schools within 2km",
|
||||
"good_secondary_2km": "Good+ secondary schools within 2km",
|
||||
"outstanding_primary_5km": "Outstanding primary schools within 5km",
|
||||
"outstanding_secondary_5km": "Outstanding secondary schools within 5km",
|
||||
"outstanding_primary_2km": "Outstanding primary schools within 2km",
|
||||
"outstanding_secondary_2km": "Outstanding secondary schools within 2km",
|
||||
"good_primary_catchments": "Good+ primary school catchments",
|
||||
"good_secondary_catchments": "Good+ secondary school catchments",
|
||||
"outstanding_primary_catchments": "Outstanding primary school catchments",
|
||||
"outstanding_secondary_catchments": "Outstanding secondary school catchments",
|
||||
"max_download_speed": "Max available download speed (Mbps)",
|
||||
"serious_crime_avg_yr": "Serious crime (avg/yr)",
|
||||
"minor_crime_avg_yr": "Minor crime (avg/yr)",
|
||||
|
|
@ -874,7 +866,7 @@ def _join_area_side_tables(
|
|||
election: pl.LazyFrame,
|
||||
poi_counts: pl.LazyFrame,
|
||||
noise: pl.LazyFrame,
|
||||
school_proximity: pl.LazyFrame,
|
||||
school_catchments: pl.LazyFrame,
|
||||
conservation_areas: pl.LazyFrame,
|
||||
tree_density: pl.LazyFrame | None,
|
||||
broadband: pl.LazyFrame,
|
||||
|
|
@ -905,7 +897,7 @@ def _join_area_side_tables(
|
|||
base = base.join(election, on="pcon", how="left")
|
||||
base = base.join(poi_counts, on="postcode", how="left")
|
||||
base = base.join(noise, on="postcode", how="left")
|
||||
base = base.join(school_proximity, on="postcode", how="left")
|
||||
base = base.join(school_catchments, on="postcode", how="left")
|
||||
base = base.join(conservation_areas, on="postcode", how="left").with_columns(
|
||||
pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
|
||||
)
|
||||
|
|
@ -1970,7 +1962,7 @@ def _build(
|
|||
ethnicity_path: Path,
|
||||
crime_path: Path,
|
||||
noise_path: Path,
|
||||
school_proximity_path: Path,
|
||||
school_catchments_path: Path,
|
||||
broadband_path: Path,
|
||||
conservation_areas_path: Path,
|
||||
rental_prices_path: Path,
|
||||
|
|
@ -2080,7 +2072,7 @@ def _build(
|
|||
)
|
||||
.select("postcode", "noise_lden_db")
|
||||
)
|
||||
school_proximity = pl.scan_parquet(school_proximity_path)
|
||||
school_catchments = pl.scan_parquet(school_catchments_path)
|
||||
conservation_areas = _conservation_area_by_postcode(
|
||||
arcgis.select("postcode", "lat", "lon"), conservation_areas_path
|
||||
)
|
||||
|
|
@ -2120,7 +2112,7 @@ def _build(
|
|||
"election": election,
|
||||
"poi_counts": poi_counts,
|
||||
"noise": noise,
|
||||
"school_proximity": school_proximity,
|
||||
"school_catchments": school_catchments,
|
||||
"conservation_areas": conservation_areas,
|
||||
"tree_density": tree_density,
|
||||
"broadband": broadband,
|
||||
|
|
@ -2267,10 +2259,10 @@ def main():
|
|||
"--noise", type=Path, required=True, help="Road noise by postcode parquet file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--school-proximity",
|
||||
"--school-catchments",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="School proximity counts parquet file",
|
||||
help="School catchment counts parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--broadband",
|
||||
|
|
@ -2376,7 +2368,7 @@ def main():
|
|||
ethnicity_path=args.ethnicity,
|
||||
crime_path=args.crime,
|
||||
noise_path=args.noise,
|
||||
school_proximity_path=args.school_proximity,
|
||||
school_catchments_path=args.school_catchments,
|
||||
broadband_path=args.broadband,
|
||||
conservation_areas_path=args.conservation_areas,
|
||||
rental_prices_path=args.rental_prices,
|
||||
|
|
|
|||
|
|
@ -25,11 +25,30 @@ POI_GROUPS_2KM = {
|
|||
# Greengrocer, ...) and the GEOLYTIX brand categories (Tesco, Aldi, ...).
|
||||
GROCERIES_GROUP = "Groceries"
|
||||
|
||||
# Groceries categories EXCLUDED from the static "Number of grocery shops and
|
||||
# supermarkets within 2km" metric. Bakeries, butchers, delis and off-licences
|
||||
# are speciality food retail, not somewhere you do a grocery shop; together
|
||||
# they were ~a third of the group and inflated the headline count. The metric
|
||||
# keeps Supermarket, Convenience Store, Greengrocer and every GEOLYTIX brand.
|
||||
GROCERY_STATIC_EXCLUDED_CATEGORIES = {
|
||||
"Bakery",
|
||||
"Butcher & Fishmonger",
|
||||
"Deli & Specialty",
|
||||
"Off-Licence",
|
||||
}
|
||||
|
||||
# OS Open Greenspace function types used for park counts and distance calculation.
|
||||
# Uses the authoritative OS dataset instead of OSM point POIs for better coverage
|
||||
# of green spaces that are only mapped as polygons in OSM.
|
||||
# Scope: "Public Park Or Garden" is the core park function. "Playing Field"
|
||||
# (open public recreation grounds) is borderline but kept: outside big cities
|
||||
# the local rec ground is the de facto park. "Play Space" (playgrounds) is
|
||||
# excluded — a playground is not a park, and "Playground" is already its own
|
||||
# OSM-derived category. The remaining functions (Religious Grounds, Golf
|
||||
# Course, Cemetery, Allotments, Bowling Green, Tennis Court, Other Sports
|
||||
# Facility) are clearly not parks.
|
||||
GREENSPACE_PARK_FUNCTIONS = {
    # OS Open Greenspace `function` values that count as a park. "Play Space"
    # is deliberately absent: a playground is not a park and already has its
    # own OSM-derived category. The key must appear only once -- a repeated
    # key in a dict literal silently keeps just the last value.
    "parks": ["Public Park Or Garden", "Playing Field"],
}
|
||||
|
||||
GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
|
||||
|
|
@ -50,17 +69,22 @@ def _poi_category_slug(category: str) -> str:
|
|||
|
||||
|
||||
def _groceries_categories(pois: pl.DataFrame) -> list[str]:
|
||||
"""Return the distinct `category` values for the Groceries group.
|
||||
"""Return the distinct `category` values for the static groceries metric.
|
||||
|
||||
`count_pois_per_postcode` matches POIs on `category`, but the authoritative
|
||||
GEOLYTIX grocery dataset stores the brand name there (e.g. "Tesco", "Aldi")
|
||||
with group "Groceries"; it never emits the literal "Supermarket". Collecting
|
||||
every Groceries category captures both the OSM strings and the brand names.
|
||||
Speciality food retail (bakeries, butchers, delis, off-licences) is
|
||||
excluded — see GROCERY_STATIC_EXCLUDED_CATEGORIES.
|
||||
"""
|
||||
if "group" not in pois.columns:
|
||||
raise ValueError("POI dataframe must include a 'group' column")
|
||||
return (
|
||||
pois.filter(pl.col("group") == GROCERIES_GROUP)
|
||||
pois.filter(
|
||||
(pl.col("group") == GROCERIES_GROUP)
|
||||
& ~pl.col("category").is_in(list(GROCERY_STATIC_EXCLUDED_CATEGORIES))
|
||||
)
|
||||
.select("category")
|
||||
.unique()
|
||||
.sort("category")
|
||||
|
|
@ -109,6 +133,40 @@ def _build_poi_category_groups(
|
|||
return groups, display_names
|
||||
|
||||
|
||||
def _greenspace_count_frame(greenspace: pl.DataFrame) -> pl.DataFrame:
    """Collapse the greenspace frame to ONE representative row per site.

    The greenspace parquet is one row per ACCESS POINT (park gate). That is the
    right grain for nearest-distance, but it over-counts "number of parks
    within X km": a park with 30 gates would count as 30 parks. For counting,
    each site is reduced to a single row placed at the site centroid
    (``site_lat``/``site_lng``), falling back to the first access point's
    ``lat``/``lng`` when the centroid columns are missing or null.

    Args:
        greenspace: Access-point-grain frame. ``site_id`` is optional;
            ``site_lat``/``site_lng`` are optional centroid columns.

    Returns:
        A frame with the same columns as the input: one row per non-null
        ``site_id`` (in first-appearance order), followed by any rows whose
        ``site_id`` is null, unchanged. A legacy frame without ``site_id`` is
        returned as-is (gate-grain counts) after printing a warning, rather
        than crashing.
    """
    if "site_id" not in greenspace.columns:
        print(
            "WARNING: greenspace parquet has no site_id column; park counts "
            "will count access points, not sites (regenerate os_greenspace)"
        )
        return greenspace

    has_site = pl.col("site_id").is_not_null()

    # maintain_order=True pins the output to input order, so the per-site
    # frame is reproducible between runs instead of depending on hash order.
    representatives = greenspace.filter(has_site).unique(
        subset=["site_id"], keep="first", maintain_order=True
    )
    if {"site_lat", "site_lng"}.issubset(greenspace.columns):
        # Prefer the site centroid; keep the gate position where it is null.
        representatives = representatives.with_columns(
            pl.coalesce([pl.col("site_lat"), pl.col("lat")]).alias("lat"),
            pl.coalesce([pl.col("site_lng"), pl.col("lng")]).alias("lng"),
        )

    frames = [representatives.select(greenspace.columns)]
    # Rows without a site id cannot be grouped, so each still counts once.
    unkeyed = greenspace.filter(~has_site)
    if unkeyed.height > 0:
        frames.append(unkeyed)
    return pl.concat(frames)
|
||||
|
||||
|
||||
def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
|
||||
renames: dict[str, str] = {}
|
||||
for group_key, category in display_names.items():
|
||||
|
|
@ -185,13 +243,16 @@ def main():
|
|||
|
||||
# Park counts and distances from OS Open Greenspace. They use the dynamic
|
||||
# amenity metric names so filters read through the same side-table path as
|
||||
# OSM-derived amenity metrics.
|
||||
# OSM-derived amenity metrics. Distances use the access-point grain (the
|
||||
# nearest park GATE is the right semantics); counts use one row per SITE so
|
||||
# a park with many gates counts once.
|
||||
greenspace = pl.read_parquet(args.greenspace)
|
||||
greenspace_sites = _greenspace_count_frame(greenspace)
|
||||
park_counts_2km = count_pois_per_postcode(
|
||||
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
|
||||
postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
|
||||
)
|
||||
park_counts_5km = count_pois_per_postcode(
|
||||
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5
|
||||
postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5
|
||||
)
|
||||
park_distances = min_distance_per_postcode(
|
||||
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS
|
||||
|
|
|
|||
|
|
@ -260,6 +260,12 @@ def main() -> None:
|
|||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.greenspace and not args.greenspace.exists():
|
||||
# Fail loudly and EARLY (before the ~10h Phases 1-3): silently skipping
|
||||
# the subtraction is exactly how parks/lakes shipped inside postcode
|
||||
# boundaries unnoticed.
|
||||
raise SystemExit(f"--greenspace file not found: {args.greenspace}")
|
||||
|
||||
fragments_cache = args.output / "fragments_cache.parquet"
|
||||
# Phase 3 depends only on these inputs; greenspace is applied later (Phase 4),
|
||||
# so a greenspace change must not invalidate the fragment cache.
|
||||
|
|
@ -294,7 +300,7 @@ def main() -> None:
|
|||
|
||||
greenspace_tree = None
|
||||
greenspace_geoms = None
|
||||
if args.greenspace and args.greenspace.exists():
|
||||
if args.greenspace:
|
||||
from .greenspace import load_greenspace
|
||||
|
||||
print(f" Loading greenspace/water from {args.greenspace}...")
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
from shapely import wkb
|
||||
from shapely import make_valid, wkb
|
||||
from shapely.geometry import MultiPolygon, Polygon
|
||||
from shapely.strtree import STRtree
|
||||
|
||||
|
|
@ -13,12 +13,23 @@ from .geometry import safe_difference, safe_union
|
|||
def load_greenspace(path: Path) -> tuple[STRtree, list]:
    """Load the greenspace parquet and build an STRtree spatial index.

    Geometries are read from the ``geometry`` column as WKB and repaired with
    ``make_valid`` on load: an invalid park/lake polygon would make the
    per-postcode ``intersects`` predicate (and the exact difference path)
    liable to raise mid-merge, hours into a build. Geometries that are empty
    (before or after repair) are dropped.

    Args:
        path: Parquet file with a ``geometry`` column of WKB values.

    Returns:
        (tree, geoms) where tree is a Shapely STRtree and geoms is the list of
        geometries indexed by the tree, in the same order.
    """
    df = pl.read_parquet(path)

    # Parse each WKB exactly once. Building the list eagerly first and then
    # re-parsing in the repair loop would double the decode cost for nothing.
    geoms = []
    for raw in df["geometry"].to_list():
        geom = wkb.loads(raw)
        if not geom.is_valid:
            geom = make_valid(geom)
        if not geom.is_empty:
            geoms.append(geom)

    tree = STRtree(geoms)
    return tree, geoms
|
||||
|
||||
|
|
|
|||
|
|
@ -101,6 +101,21 @@ def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
|
|||
return geojson_dict
|
||||
|
||||
|
||||
def _is_pointlike(geom_bng) -> bool:
    """Report whether a BNG geometry has effectively no extent.

    A geometry is pointlike only when its area is below ``_POINTLIKE_AREA_M2``
    AND its perimeter is below ``_POINTLIKE_PERIMETER_M``. Requiring both
    separates a collapsed point (the tower-block signature) from a genuine
    thin sliver, which has negligible area but still carries length.

    Returns False if GEOS raises while measuring the geometry.
    """
    try:
        if not (geom_bng.area < _POINTLIKE_AREA_M2):
            return False
        return geom_bng.length < _POINTLIKE_PERIMETER_M
    except GEOSException:
        return False
|
||||
|
||||
|
||||
def _rescue_footprint(geom_bng) -> dict | None:
|
||||
"""Fatten a degenerate BNG geometry into a representable footprint and snap.
|
||||
|
||||
|
|
@ -109,15 +124,9 @@ def _rescue_footprint(geom_bng) -> dict | None:
|
|||
gets a building-scale buffer so it is not reduced to an invisible sub-metre
|
||||
dot; thin slivers that still carry length keep the minimal buffer.
|
||||
"""
|
||||
buffer_m = _MIN_FOOTPRINT_BUFFER_M
|
||||
try:
|
||||
if (
|
||||
geom_bng.area < _POINTLIKE_AREA_M2
|
||||
and geom_bng.length < _POINTLIKE_PERIMETER_M
|
||||
):
|
||||
buffer_m = _POINT_RESCUE_BUFFER_M
|
||||
except GEOSException:
|
||||
pass
|
||||
buffer_m = (
|
||||
_POINT_RESCUE_BUFFER_M if _is_pointlike(geom_bng) else _MIN_FOOTPRINT_BUFFER_M
|
||||
)
|
||||
footprint = _largest_polygonal(geom_bng.buffer(buffer_m))
|
||||
if footprint is None:
|
||||
return None
|
||||
|
|
@ -147,10 +156,16 @@ def to_wgs84_geojson(
|
|||
)
|
||||
if simplified is None:
|
||||
simplified = cleaned
|
||||
# Normal path; if snapping erases a thin sliver, fatten its real shape.
|
||||
result = _snap_to_wgs84_geojson(simplified)
|
||||
if result is None:
|
||||
if _is_pointlike(simplified):
|
||||
# A POINTLIKE footprint is rescued to building scale even when it
|
||||
# would survive snapping: a 0.1-1 m² polygon serializes fine but
|
||||
# ships as an invisible dot covering a whole tower block.
|
||||
result = _rescue_footprint(simplified)
|
||||
else:
|
||||
# Normal path; if snapping erases a thin sliver, fatten its real shape.
|
||||
result = _snap_to_wgs84_geojson(simplified)
|
||||
if result is None:
|
||||
result = _rescue_footprint(simplified)
|
||||
if result is not None:
|
||||
return result
|
||||
|
||||
|
|
@ -229,6 +244,10 @@ def merge_fragments(
|
|||
greenspace_tree: Optional STRtree of park/water polygons.
|
||||
greenspace_geoms: Optional list of park/water geometries (indexed by tree).
|
||||
"""
|
||||
subtract = greenspace_tree is not None and greenspace_geoms is not None
|
||||
if subtract:
|
||||
from .greenspace import subtract_greenspace
|
||||
|
||||
by_postcode: dict[str, list] = defaultdict(list)
|
||||
for pc, geom in all_fragments:
|
||||
by_postcode[pc].append(geom)
|
||||
|
|
@ -256,9 +275,7 @@ def merge_fragments(
|
|||
# Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
|
||||
combined = _fill_holes(combined)
|
||||
# Subtract parks/water if provided
|
||||
if greenspace_tree is not None and greenspace_geoms is not None:
|
||||
from .greenspace import subtract_greenspace
|
||||
|
||||
if subtract:
|
||||
pre_green = combined
|
||||
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
|
||||
combined = _keep_polygon_parts(combined)
|
||||
|
|
|
|||
|
|
@ -921,6 +921,49 @@ class TestToWgs84Geojson:
|
|||
area_m2 = transform_geometry(to_bng.transform, shape(result)).area
|
||||
assert area_m2 > 100, f"point footprint only {area_m2:.1f} m^2"
|
||||
|
||||
def test_snappable_pointlike_polygon_still_gets_building_scale_footprint(self):
    """Pointlike footprints are rescued even when they survive snapping.

    A collapsed-but-snappable footprint (e.g. EC2A 2FJ: 181 properties on
    0.86 m²) must NOT ship as-is just because precision snapping keeps it;
    pointlike inputs are rescued to a ~201 m² disc unconditionally.
    """
    import pyproj
    from shapely.geometry import shape
    from shapely.ops import transform as transform_geometry

    from .output import _snap_to_wgs84_geojson

    # 0.9m x 0.9m square: area 0.81 m², perimeter 3.6 m — pointlike, yet
    # large enough (~8 output-grid cells) to survive the 1e-6 deg snap.
    tiny = box(530000, 180000, 530000.9, 180000.9)

    snapped = _snap_to_wgs84_geojson(tiny)
    assert snapped is not None, (
        "precondition: this polygon must be snappable, otherwise the test "
        "exercises the old snap-fails path instead of the new one"
    )

    result = to_wgs84_geojson(tiny)
    assert result is not None

    # Measure the emitted footprint back in metres (BNG).
    to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    )
    footprint_bng = transform_geometry(to_bng.transform, shape(result))
    area_m2 = footprint_bng.area
    assert 150 < area_m2 < 300, (
        f"pointlike snappable footprint shipped at {area_m2:.2f} m^2 "
        "instead of a building-scale (~201 m^2) disc"
    )
|
||||
|
||||
def test_normal_polygon_area_unchanged(self):
    """An ordinary polygon passes through with no rescue inflation."""
    import pyproj
    from shapely.geometry import shape
    from shapely.ops import transform as transform_geometry

    poly = box(530000, 180000, 530100, 180100)  # 10,000 m²
    result = to_wgs84_geojson(poly)
    assert result is not None

    # Round-trip the output to BNG so the area is comparable in m².
    to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    )
    round_tripped = transform_geometry(to_bng.transform, shape(result))
    area_m2 = round_tripped.area
    assert area_m2 == pytest.approx(10_000, rel=0.01)
|
||||
|
||||
def test_thin_sliver_keeps_minimal_buffer(self):
|
||||
"""A genuine elongated sliver still carries length, so it is NOT inflated
|
||||
to building scale — only truly pointlike inputs are."""
|
||||
|
|
@ -1132,6 +1175,26 @@ class TestSubtractGreenspace:
|
|||
# 80% < 90% cap, so subtraction should happen
|
||||
assert result.area == pytest.approx(2000, rel=0.01)
|
||||
|
||||
def test_load_greenspace_repairs_invalid_and_drops_empty(self, tmp_path):
    """Invalid greenspace polygons are repaired when the parquet is loaded.

    An invalid (bow-tie) park polygon would otherwise make the per-postcode
    intersects/difference liable to raise hours into a merge.
    """
    from .greenspace import load_greenspace

    # Self-intersecting "bow-tie" ring plus one well-formed square.
    bowtie = Polygon([(0, 0), (10, 10), (10, 0), (0, 10)])
    assert not bowtie.is_valid
    valid = box(20, 20, 30, 30)

    path = tmp_path / "greenspace.parquet"
    frame = pl.DataFrame({"geometry": [bowtie.wkb, valid.wkb]})
    frame.write_parquet(path)

    tree, geoms = load_greenspace(path)
    assert len(geoms) == 2
    for g in geoms:
        assert g.is_valid and not g.is_empty

    # The repaired bow-tie must still subtract cleanly.
    postcode_area = box(0, 0, 100, 100)
    result = subtract_greenspace(postcode_area, tree, geoms)
    assert result.is_valid
    assert result.area < 10_000
|
||||
|
||||
|
||||
class TestToWgs84GeojsonValidity:
|
||||
"""to_wgs84_geojson must emit GeoJSON that round-trips to a valid geometry."""
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ from pipeline.transform.price_estimation.shrinkage import (
|
|||
from pipeline.transform.price_estimation.utils import (
|
||||
CURRENT_YEAR,
|
||||
LATEST_COMPLETE_YEAR,
|
||||
SMOOTHNESS_SUPPORT_PAIRS,
|
||||
TEMPORAL_SMOOTHNESS_LAMBDA,
|
||||
TYPE_GROUPS,
|
||||
build_hedonic_features,
|
||||
|
|
@ -37,6 +38,19 @@ from pipeline.transform.price_estimation.utils import (
|
|||
|
||||
MIN_PAIRS = 5
|
||||
OUTLIER_THRESHOLD = 3.0 # hard pre-filter; Huber handles the rest
|
||||
# Gap-aware companion to OUTLIER_THRESHOLD: |log_ratio| must also stay within
|
||||
# this many log-units PER YEAR of holding period (short gaps are allowed a
|
||||
# full year's band). A flat +/-3.0 cap admits e.g. a 10k -> 196k "sale" six
|
||||
# months apart (log +2.95, and weight 1/sqrt(gap) gives it the leverage of
|
||||
# ~10 normal pairs); Huber does NOT recover, because once the thin year's
|
||||
# beta satisfies the garbage pair it is the many good long-gap pairs that
|
||||
# carry the residual and get down-weighted. Such pairs are data errors or
|
||||
# non-market transfers (right-to-buy, probate, flips), not house-price
|
||||
# signal -- standard repeat-sales practice (Case-Shiller) excludes extreme
|
||||
# annualised returns for the same reason. 0.7 log/yr (~2x in a year) keeps
|
||||
# any plausible genuine market move; long-gap pairs are still governed by
|
||||
# the +/-3.0 cap.
|
||||
ANNUALISED_OUTLIER_THRESHOLD = 0.7
|
||||
HUBER_K = 1.345
|
||||
IRLS_ITERATIONS = 5
|
||||
|
||||
|
|
@ -111,7 +125,16 @@ def extract_pairs(input_path: Path, max_year2: int | None = None) -> pl.DataFram
|
|||
/ (pl.col("frac_year2") - pl.col("frac_year1")).cast(pl.Float64).sqrt()
|
||||
).alias("weight"),
|
||||
)
|
||||
.filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
|
||||
.filter(
|
||||
pl.col("log_ratio").abs()
|
||||
<= pl.min_horizontal(
|
||||
pl.lit(OUTLIER_THRESHOLD),
|
||||
ANNUALISED_OUTLIER_THRESHOLD
|
||||
* pl.max_horizontal(
|
||||
pl.col("frac_year2") - pl.col("frac_year1"), pl.lit(1.0)
|
||||
),
|
||||
)
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
|
||||
|
|
@ -181,11 +204,27 @@ def solve_robust_index(
|
|||
# beta=0) has no column, so the penalty spans the non-baseline years only.
|
||||
# For cells with <3 betas there is no curvature to penalise and the solve is
|
||||
# unchanged.
|
||||
#
|
||||
# The penalty is SUPPORT-SCALED per row: a flat lambda is too weak for
|
||||
# years identified by only 1-2 repeat-sale pairs (a cell can have hundreds
|
||||
# of pairs overall yet single thin years, yielding 2-7x one-year spikes
|
||||
# that cell-level shrinkage cannot catch). Each curvature row's lambda is
|
||||
# lambda0 * (1 + SMOOTHNESS_SUPPORT_PAIRS / s), with s the minimum
|
||||
# cross-year pair count among the row's three years, so thin years are
|
||||
# pulled strongly toward the local trend while well-supported years keep
|
||||
# the baseline penalty. Taking the min over the triple (not just the
|
||||
# middle year) also covers thin FIRST/LAST years of the range, which only
|
||||
# ever appear at a triple's edge -- the last solved year feeds the
|
||||
# CURRENT_YEAR trend extrapolation, so spikes there are the costliest.
|
||||
n_pen = 0
|
||||
pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
|
||||
pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
|
||||
if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
|
||||
sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
|
||||
cross = years1 != years2
|
||||
touched, counts = np.unique(
|
||||
np.concatenate([years1[cross], years2[cross]]), return_counts=True
|
||||
)
|
||||
support = {int(y): int(c) for y, c in zip(touched, counts)}
|
||||
years_sorted = sorted(year_to_col)
|
||||
cols_by_year = [year_to_col[y] for y in years_sorted]
|
||||
n_pen = n_cols - 2
|
||||
|
|
@ -202,6 +241,11 @@ def solve_robust_index(
|
|||
w0 = 2.0 / ((y1 - y0) * (y2 - y0))
|
||||
w1 = -2.0 / ((y1 - y0) * (y2 - y1))
|
||||
w2 = 2.0 / ((y2 - y1) * (y2 - y0))
|
||||
s_k = min(support.get(y, 0) for y in (y0, y1, y2))
|
||||
lam_k = TEMPORAL_SMOOTHNESS_LAMBDA * (
|
||||
1.0 + SMOOTHNESS_SUPPORT_PAIRS / max(s_k, 1)
|
||||
)
|
||||
sqrt_lambda = float(np.sqrt(lam_k))
|
||||
pen_vals[3 * k : 3 * k + 3] = (
|
||||
sqrt_lambda * w0,
|
||||
sqrt_lambda * w1,
|
||||
|
|
@ -347,10 +391,22 @@ def compute_hedonic_index(
|
|||
|
||||
|
||||
EXTRAPOLATION_YEARS = 3
|
||||
# Bound on the per-year slope used to trend-extrapolate beyond the last solved
|
||||
# year (the solve stops at LATEST_COMPLETE_YEAR; CURRENT_YEAR is filled here).
|
||||
# +/-0.10 log/yr (~+/-10.5%/yr) comfortably covers genuine UK sector-level
|
||||
# annual moves while preventing a residual spike in the recent betas from
|
||||
# compounding into an absurd extrapolated step (e.g. +49% in one year).
|
||||
MAX_EXTRAPOLATION_SLOPE = 0.10
|
||||
|
||||
|
||||
def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
|
||||
"""Forward-fill missing years, with linear extrapolation beyond last known year."""
|
||||
"""Forward-fill missing years, with trend extrapolation beyond last known year.
|
||||
|
||||
The extrapolation slope is the MEDIAN of the per-year slopes between
|
||||
consecutive known points in the recent window (a single noisy year corrupts
|
||||
at most one of those slopes, unlike a least-squares fit through all the
|
||||
points), clamped to +/-MAX_EXTRAPOLATION_SLOPE.
|
||||
"""
|
||||
if not index:
|
||||
return {y: 0.0 for y in range(min_year, max_year + 1)}
|
||||
|
||||
|
|
@ -365,7 +421,7 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
|
|||
last = index[y]
|
||||
filled[y] = last
|
||||
|
||||
# Linear extrapolation beyond last known year
|
||||
# Robust trend extrapolation beyond last known year
|
||||
if last_known_year < max_year:
|
||||
recent = [
|
||||
(y, index[y])
|
||||
|
|
@ -373,9 +429,17 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
|
|||
if y >= last_known_year - EXTRAPOLATION_YEARS
|
||||
]
|
||||
if len(recent) >= 2:
|
||||
years_arr = np.array([r[0] for r in recent], dtype=np.float64)
|
||||
vals_arr = np.array([r[1] for r in recent], dtype=np.float64)
|
||||
slope = np.polyfit(years_arr, vals_arr, 1)[0]
|
||||
slopes = [
|
||||
(v_b - v_a) / (y_b - y_a)
|
||||
for (y_a, v_a), (y_b, v_b) in zip(recent[:-1], recent[1:])
|
||||
]
|
||||
slope = float(
|
||||
np.clip(
|
||||
np.median(slopes),
|
||||
-MAX_EXTRAPOLATION_SLOPE,
|
||||
MAX_EXTRAPOLATION_SLOPE,
|
||||
)
|
||||
)
|
||||
for y in range(last_known_year + 1, max_year + 1):
|
||||
filled[y] = index[last_known_year] + slope * (y - last_known_year)
|
||||
else:
|
||||
|
|
@ -389,12 +453,16 @@ def build_index(
|
|||
input_path: Path,
|
||||
max_pair_year: int | None = None,
|
||||
postcodes_path: Path | None = None,
|
||||
sectors: list[str] | None = None,
|
||||
) -> pl.DataFrame:
|
||||
"""Build the full price index from raw data.
|
||||
|
||||
If max_pair_year is set, only pairs before that year are used (backtesting holdout).
|
||||
The index is still forward-filled to CURRENT_YEAR.
|
||||
postcodes_path: if provided, lat/lon are read from this file instead of input_path.
|
||||
sectors: if provided, restrict the build to these postcode sectors (for
|
||||
debugging/verification runs; hierarchy levels are then computed only from
|
||||
the scoped pairs, so scoped output is NOT identical to a full build).
|
||||
"""
|
||||
# Solve the index only on COMPLETE calendar years: exclude the partial
|
||||
# current year, whose thin repeat-sale set yields wild betas. The index is
|
||||
|
|
@ -405,6 +473,9 @@ def build_index(
|
|||
max_pair_year if max_pair_year is not None else LATEST_COMPLETE_YEAR + 1
|
||||
)
|
||||
pairs = extract_pairs(input_path, max_year2=estimation_cap)
|
||||
if sectors is not None:
|
||||
pairs = pairs.filter(pl.col("sector").is_in(sectors))
|
||||
print(f" Scoped to {len(sectors)} sectors: {len(pairs):,} pairs")
|
||||
centroids = extract_centroids(postcodes_path or input_path)
|
||||
|
||||
min_year = int(pairs["year1"].min())
|
||||
|
|
@ -534,9 +605,21 @@ def main():
|
|||
help="Path to postcode.parquet (for lat/lon centroids)",
|
||||
)
|
||||
parser.add_argument("--output", type=Path, required=True)
|
||||
parser.add_argument(
|
||||
"--sectors",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Comma-separated postcode sectors to scope the build to "
|
||||
"(debug/verification only; hierarchy is computed from scoped pairs)",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
result = build_index(args.input, postcodes_path=args.postcodes)
|
||||
sectors = (
|
||||
[s.strip() for s in args.sectors.split(",") if s.strip()]
|
||||
if args.sectors
|
||||
else None
|
||||
)
|
||||
result = build_index(args.input, postcodes_path=args.postcodes, sectors=sectors)
|
||||
|
||||
result.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
|
|
|
|||
|
|
@ -3,7 +3,10 @@ import polars as pl
|
|||
|
||||
from pipeline.transform.price_estimation import index as index_mod
|
||||
from pipeline.transform.price_estimation.index import (
|
||||
MAX_EXTRAPOLATION_SLOPE,
|
||||
compute_indices_for_level,
|
||||
extract_pairs,
|
||||
forward_fill,
|
||||
solve_robust_index,
|
||||
)
|
||||
|
||||
|
|
@ -105,6 +108,139 @@ def test_gap_spanning_level_jump_is_not_smoothed_into_a_ramp():
|
|||
assert abs(idx[2015] - true[2015]) < 0.05
|
||||
|
||||
|
||||
def _ramp_pairs_with_thin_tail(tail_ratio: float, tail_n: int, ramp_reps: int):
    """Build synthetic repeat-sale pairs: a smooth ramp plus a thin final year.

    The ramp rises 0.04 log per year over 2010-2020 and contributes
    `ramp_reps` identical pairs for every adjacent year step. On top of that,
    `tail_n` pair(s) spanning 2020->2021 each assert a log ratio of
    `tail_ratio`. Every pair carries weight 1.0.

    Returns ``(year1, year2, log_ratio, weight)`` as int32 / int32 / float64 /
    float64 arrays, ramp pairs first and tail pairs last.
    """
    level = {yr: 0.04 * (yr - 2010) for yr in range(2010, 2021)}

    # One (year1, year2, log_ratio) record per pair, in emission order.
    ramp_rows = [
        (start, start + 1, level[start + 1] - level[start])
        for start in range(2010, 2020)
        for _ in range(ramp_reps)
    ]
    tail_rows = [(2020, 2021, tail_ratio) for _ in range(tail_n)]
    rows = ramp_rows + tail_rows

    return (
        np.array([row[0] for row in rows], dtype=np.int32),
        np.array([row[1] for row in rows], dtype=np.int32),
        np.array([row[2] for row in rows], dtype=np.float64),
        np.array([1.0 for _ in rows], dtype=np.float64),
    )
|
||||
|
||||
|
||||
def test_support_scaled_penalty_suppresses_thin_year_spike(monkeypatch):
    """Support-scaled smoothing damps a spike in a year backed by ONE pair.

    The final year (2021) is identified by a single pair claiming a +1.5 log
    jump. With support scaling switched off (flat baseline penalty) the jump
    survives almost entirely; with scaling on it is pulled strongly toward the
    local trend. Because the thin year is the LAST year of the range it only
    ever sits at the edge of a penalty triple, so this also demonstrates that
    the min-over-triple support rule covers range edges -- the last solved
    year feeds the CURRENT_YEAR trend extrapolation.
    """
    pairs = _ramp_pairs_with_thin_tail(tail_ratio=1.5, tail_n=1, ramp_reps=10)

    # Baseline: support scaling disabled, i.e. a flat lambda for every triple.
    monkeypatch.setattr(index_mod, "SMOOTHNESS_SUPPORT_PAIRS", 0)
    flat = solve_robust_index(*pairs)
    monkeypatch.undo()
    # Restored module constant: the support-scaled penalty is active again.
    scaled = solve_robust_index(*pairs)

    assert flat[2021] - flat[2020] > 1.2  # flat lambda barely resists the spike
    assert scaled[2021] - scaled[2020] < 0.65  # support-scaled lambda suppresses it

    # The well-supported ramp stays close to truth: the strong penalty row
    # spanning the thin year drags its immediate neighbour slightly (<0.1)
    # toward collinearity -- the price of suppressing a x4.5 one-year spike.
    assert all(
        abs(scaled[year] - 0.04 * (year - 2010)) < 0.1 for year in range(2010, 2021)
    )
|
||||
|
||||
|
||||
def test_support_scaling_leaves_well_supported_years_unchanged(monkeypatch):
    """Ample support everywhere means scaling is effectively a no-op.

    With 50-100 pairs touching every year, lambda_eff ~ lambda0, so the
    support-scaled solution must agree with the flat-penalty solution to
    better than 1e-3 for every year.
    """
    pairs = _ramp_pairs_with_thin_tail(tail_ratio=0.04, tail_n=50, ramp_reps=50)

    # Flat-penalty reference solve (support scaling off).
    monkeypatch.setattr(index_mod, "SMOOTHNESS_SUPPORT_PAIRS", 0)
    flat = solve_robust_index(*pairs)
    monkeypatch.undo()
    scaled = solve_robust_index(*pairs)

    assert set(flat) == set(scaled)
    worst_gap = max(abs(flat[year] - scaled[year]) for year in flat)
    assert worst_gap < 1e-3
|
||||
|
||||
|
||||
def test_forward_fill_extrapolation_uses_robust_median_slope():
    """One spiky recent year must not leak into the extrapolated step.

    The median of the consecutive per-year slopes ignores the 2024 spike,
    whereas a least-squares fit through the same points would extrapolate a
    large positive slope.
    """
    known = {2022: 1.00, 2023: 1.05, 2024: 1.60, 2025: 1.10}

    filled = forward_fill(known, 2022, 2026)

    # slopes: [+0.05, +0.55, -0.50] -> median +0.05
    expected_2026 = 1.10 + 0.05
    assert abs(filled[2026] - expected_2026) < 1e-9
|
||||
|
||||
|
||||
def test_forward_fill_extrapolated_slope_is_clamped():
    """A consistent but absurd recent trend is clamped to MAX_EXTRAPOLATION_SLOPE.

    Checked in both directions: +0.4/yr clamps to +MAX_EXTRAPOLATION_SLOPE and
    -0.4/yr clamps to -MAX_EXTRAPOLATION_SLOPE.
    """
    rising = {2022: 0.0, 2023: 0.4, 2024: 0.8, 2025: 1.2}
    rising_filled = forward_fill(rising, 2022, 2026)
    rising_expected = 1.2 + MAX_EXTRAPOLATION_SLOPE
    assert abs(rising_filled[2026] - rising_expected) < 1e-9

    falling = {2022: 1.2, 2023: 0.8, 2024: 0.4, 2025: 0.0}
    falling_filled = forward_fill(falling, 2022, 2026)
    falling_expected = 0.0 - MAX_EXTRAPOLATION_SLOPE
    assert abs(falling_filled[2026] - falling_expected) < 1e-9
|
||||
|
||||
|
||||
def test_forward_fill_preserves_sane_trend_and_flat_fallback():
    """Moderate trends still extrapolate; a lone known point fills flat.

    The function remains a forward-FILL-with-trend: a steady +0.05/yr series
    continues at +0.05, while fewer than two recent points yields a flat fill.
    """
    steady = {2022: 1.00, 2023: 1.05, 2024: 1.10, 2025: 1.15}
    steady_filled = forward_fill(steady, 2022, 2026)
    assert abs(steady_filled[2026] - 1.20) < 1e-9

    single_point_filled = forward_fill({2025: 0.7}, 2024, 2026)
    assert single_point_filled[2026] == 0.7
|
||||
|
||||
|
||||
def test_extract_pairs_drops_extreme_annualised_returns(tmp_path):
    """Pairs with implausible annualised returns are dropped; slow gains stay.

    A +-3.0 log cap alone admits e.g. a 10x 'gain' in six months -- a data
    error or non-market transfer with huge leverage (weight = 1/sqrt(gap)).
    Such pairs are removed by the annualised cap, while large ratios over long
    holding periods (genuine appreciation) are kept.
    """

    def sale(year, month, price):
        # One entry of a property's "historical_prices" list.
        return {"year": year, "month": month, "price": price}

    # +2.30 log in 6 months -> dropped (cap 0.7 for gap <= 1yr)
    six_month_tenfold = [sale(2020, 1, 100_000), sale(2020, 7, 1_000_000)]
    # +2.20 log over 24 years -> kept (flat 3.0 cap governs)
    long_hold_ninefold = [sale(2000, 1, 100_000), sale(2024, 1, 900_000)]
    # +0.41 log in 1 year -> kept (within the 0.7/yr band)
    one_year_moderate = [sale(2020, 1, 100_000), sale(2021, 1, 150_000)]

    properties = pl.DataFrame(
        {
            "Postcode": ["AB1 2CD", "AB1 2CE", "AB1 2CF"],
            "Property type": ["Detached", "Detached", "Detached"],
            "historical_prices": [
                six_month_tenfold,
                long_hold_ninefold,
                one_year_moderate,
            ],
        }
    )
    parquet_path = tmp_path / "props.parquet"
    properties.write_parquet(parquet_path)

    pairs = extract_pairs(parquet_path)

    assert len(pairs) == 2
    ratios = sorted(round(r, 2) for r in pairs["log_ratio"].to_list())
    assert ratios == [0.41, 2.2]
|
||||
|
||||
|
||||
def test_n_pairs_counts_only_cross_year_pairs():
|
||||
"""FIX #12: same-year pairs carry zero index information and must not inflate
|
||||
the shrinkage weight; n_pairs counts only cross-year (year2 != year1) pairs."""
|
||||
|
|
|
|||
|
|
@ -36,6 +36,20 @@ SHRINKAGE_K = 50
|
|||
# noisy year) without flattening genuine multi-year trends.
|
||||
TEMPORAL_SMOOTHNESS_LAMBDA = 0.05
|
||||
|
||||
# Per-year support scaling for the temporal smoothness penalty. A flat lambda
|
||||
# is too weak for years with very few repeat-sale pairs: a sector can have
|
||||
# hundreds of pairs overall (so cell-level n/(n+k) shrinkage barely moves it)
|
||||
# yet have individual years estimated from 1-2 pairs, producing 2-7x
|
||||
# single-year index spikes. Each curvature row is therefore scaled by the
|
||||
# local pair support of its year triple:
|
||||
# lambda_eff = lambda0 * (1 + SMOOTHNESS_SUPPORT_PAIRS / s)
|
||||
# where s is the minimum cross-year pair count among the triple's years.
|
||||
# Well-supported years (s >> SMOOTHNESS_SUPPORT_PAIRS) keep lambda_eff ~
|
||||
# lambda0 (current behaviour); a year identified by a single pair gets
|
||||
# ~41x lambda0, pulling its beta strongly toward the local trend through its
|
||||
# neighbours. Same-year pairs cancel in the design and are not counted.
|
||||
SMOOTHNESS_SUPPORT_PAIRS = 40
|
||||
|
||||
|
||||
def type_group_expr():
|
||||
"""Polars expression: Property type -> type_group."""
|
||||
|
|
|
|||
748
pipeline/transform/school_catchments.py
Normal file
748
pipeline/transform/school_catchments.py
Normal file
|
|
@ -0,0 +1,748 @@
|
|||
"""Model historical school catchment areas and count them per postcode.
|
||||
|
||||
No national dataset of school catchment areas exists for England: catchments
|
||||
are set per admission authority, only a handful of councils publish polygons,
|
||||
and the pupil-residence data behind commercial "heatmap" catchments lives in
|
||||
the restricted National Pupil Database. This module therefore COMPILES one
|
||||
from open data, estimating each school's admission cutoff distance ("last
|
||||
distance offered") — the radius within which an applicant would plausibly be
|
||||
offered a place.
|
||||
|
||||
Model: English state admissions are run as deferred acceptance with distance
|
||||
tie-breaks, which in a continuum economy is equivalent to finding
|
||||
market-clearing cutoff distances (Azevedo & Leshno 2016). Per phase
|
||||
(primary/secondary):
|
||||
|
||||
1. Demand — Census 2021 children per LSOA (TS007A age bands, prorated to the
|
||||
phase's cohort ages) split evenly across the LSOA's live postcodes.
|
||||
2. Supply — every open, non-selective state-funded school (GIAS), with a fill
|
||||
target of max(capacity, headcount) prorated to the phase's cohorts
|
||||
(sixth-form and nursery years carry reduced weight, since their class
|
||||
sizes differ and they are not allocated by the same admissions round).
|
||||
3. Preferences — children prefer nearby schools, trading distance against
|
||||
Ofsted grade: a school's effective distance is its real distance minus a
|
||||
grade bonus (Outstanding > Good > ungraded > below-Good). Because real
|
||||
first preferences are heterogeneous, each postcode's children split
|
||||
across nearby feasible schools with logit weights over effective
|
||||
distance rather than all picking the same one.
|
||||
4. Equilibrium — cutoffs start unbounded and tighten monotonically: each
|
||||
round, children apply to their preferred feasible school(s), and
|
||||
oversubscribed schools tighten their cutoff to the distance of their
|
||||
marginal admitted child. Converges to the deferred-acceptance outcome.
|
||||
5. Schools that never fill have no binding cutoff — anyone who applies gets
|
||||
in — so their feasibility radius is the distance within which the local
|
||||
child population would cover their fill target, capped.
|
||||
|
||||
The free parameters (preference bonuses, demand scale, choice temperature,
|
||||
residual calibration factors) are CALIBRATED against published "last
|
||||
distance offered" figures scraped from nine local authorities' allocation
|
||||
reports — see check_school_cutoffs.py and the constants below.
|
||||
|
||||
A postcode is "inside the catchment" of every school whose cutoff radius
|
||||
covers it. The output counts those schools per postcode for the four
|
||||
good+/outstanding x primary/secondary categories (Ofsted-classified, same
|
||||
rules as the previous proximity metric). Selective (grammar) schools are
|
||||
excluded throughout: their intakes are test-based and region-wide, so a
|
||||
distance model would fabricate a catchment that does not exist.
|
||||
|
||||
Known limitations: faith oversubscription criteria are not modelled (whether
|
||||
a faith school's catchment is open to a given family depends on the family),
|
||||
and Census 2021 child counts lag current rolls slightly. Cutoffs are
|
||||
straight-line distances, the modal LA tie-break criterion.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
from scipy.spatial import cKDTree
|
||||
|
||||
from pipeline.utils.poi_counts import _project_lat_lng_km, valid_uk_coords_mask
|
||||
|
||||
SCHOOL_GROUPS = {
|
||||
"good_primary": ["good_primary", "outstanding_primary"],
|
||||
"good_secondary": ["good_secondary", "outstanding_secondary"],
|
||||
"outstanding_primary": ["outstanding_primary"],
|
||||
"outstanding_secondary": ["outstanding_secondary"],
|
||||
}
|
||||
|
||||
# Age thresholds for deciding which phase(s) a school serves. A school serves
|
||||
# PRIMARY-age children if its statutory lowest age is <= 10, and SECONDARY-age
|
||||
# children if its statutory highest age is >= 12. All-through (e.g. 3-18) and
|
||||
# middle-deemed-secondary (e.g. 9-13) schools satisfy BOTH and so are counted in
|
||||
# both the primary and the secondary metrics — Ofsted's coarse "Ofsted phase"
|
||||
# labels such schools as just "Secondary", which previously hid them from every
|
||||
# postcode's primary-school count.
|
||||
PRIMARY_MAX_AGE = 10
|
||||
SECONDARY_MIN_AGE = 12
|
||||
|
||||
# Cohort ages (inclusive) each phase competes for: Reception-Y6 and Y7-Y11.
|
||||
PRIMARY_AGES = (4, 10)
|
||||
SECONDARY_AGES = (11, 15)
|
||||
|
||||
# Cohort weights for prorating a school's headcount/capacity across the ages
|
||||
# it teaches. Nursery classes are typically part-time and small; sixth forms
|
||||
# run at roughly 60% of a school's Y7-Y11 cohort size. A flat proration
|
||||
# undersupplied secondary places by ~8%.
|
||||
NURSERY_COHORT_WEIGHT = 0.5 # ages < 4
|
||||
SIXTH_FORM_COHORT_WEIGHT = 0.6 # ages >= 16
|
||||
|
||||
# Only schools that admit (mostly) by geography take part in the assignment.
|
||||
# Independent, special and Welsh schools and post-16 colleges either don't
|
||||
# admit by distance or fall outside the England postcode universe; selective
|
||||
# (grammar) schools admit by test from a wide region.
|
||||
STATE_SCHOOL_TYPE_GROUPS = [
|
||||
"Academies",
|
||||
"Local authority maintained schools",
|
||||
"Free Schools",
|
||||
]
|
||||
|
||||
# Preference bonuses (km of extra travel a family accepts for a better
|
||||
# school), applied as a discount on effective distance when children choose.
|
||||
# Grade 3/4 schools repel by the same magnitudes.
|
||||
PREF_BONUS_OUTSTANDING_KM = 0.6
|
||||
PREF_BONUS_GOOD_KM = 0.3
|
||||
|
||||
# Share of resident children who actually compete for state places. Census
|
||||
# 2021 counts overstate current entry cohorts (birth rates fell ~10% between
|
||||
# 2016 and 2021, which is exactly the gap between the census stock and the
|
||||
# children reaching Reception by mid-decade) and independent/home-educated
|
||||
# children (~7%) never enter the allocation at all. Without this, modelled
|
||||
# cutoffs run systematically tight and undersubscribed schools look full.
|
||||
DEMAND_SCALE = 0.8
|
||||
|
||||
# Logit choice temperature (km). With deterministic choice every child at a
|
||||
# postcode ranks the same school first, so popular schools fill entirely from
|
||||
# their nearest band and the marginal admitted child sits unrealistically
|
||||
# close. Real first preferences are heterogeneous; a school draws only a
|
||||
# distance-decaying share of nearby families. Children therefore split across
|
||||
# nearby feasible schools with weights softmax(-effective_distance / tau):
|
||||
# higher tau = more smearing = wider cutoffs. tau -> 0 recovers the
|
||||
# deterministic model (used by the unit tests). Calibrated 2026-06 against
|
||||
# 240 published binding cutoffs from 9 LAs (check_school_cutoffs.py): 0.3 km
|
||||
# maximises rank correlation and within-2x share; beyond ~0.6 the smearing
|
||||
# erases school-to-school differentiation (Spearman 0.24 -> 0.01).
|
||||
CHOICE_TEMPERATURE_KM = 0.3
|
||||
|
||||
# Residual calibration from the same ground truth: after the equilibrium
|
||||
# solve, modelled cutoffs still ran systematically tight (median log2 bias
|
||||
# -0.53 primary / -0.36 secondary at the settings above — published "last
|
||||
# distance offered" reflects offer-day frictions, waiting-list churn and
|
||||
# furthest-applicant noise that no clean equilibrium reproduces). Radii are
|
||||
# multiplied by 2^-bias so the modelled median matches the published median;
|
||||
# rank ordering is unaffected.
|
||||
CUTOFF_CALIBRATION_FACTOR = {"primary": 1.44, "secondary": 1.28}
|
||||
|
||||
# Each demand postcode considers this many nearest schools; beyond ~16
|
||||
# candidates assignment shares are negligible.
|
||||
NEAREST_SCHOOL_CANDIDATES = 16
|
||||
|
||||
# Radius guard rails: the floor absorbs postcode-centroid noise around tiny
|
||||
# urban catchments; the cap bounds feasibility radii for schools the model
|
||||
# never fills (mostly rural).
|
||||
MIN_RADIUS_KM = 0.3
|
||||
MAX_RADIUS_KM = 25.0
|
||||
|
||||
EQUILIBRIUM_MAX_ITER = 100
|
||||
|
||||
|
||||
def classify_good_plus_schools(
    ofsted: pl.DataFrame, open_urns: set[int] | None = None
) -> pl.DataFrame:
    """Return ``(urn, category)`` rows for good+/outstanding primary & secondary schools.

    Each school's grade comes from ``_with_derived_grade``: "1" (outstanding)
    or "2" (good), taken from the graded "Latest OEIF overall effectiveness"
    column when it holds a 1-4 grade, and otherwise from an ungraded "School
    remains Good/Outstanding" outcome. The ungraded fallback applies ONLY when
    the graded column is null/"Not judged", so a genuine grade 3/4 is never
    overridden, and outcomes flagged "(Concerns)" never count as good+.

    Only rows whose "Ofsted phase" is "Primary" or "Secondary" are considered.
    Phase membership is then decided from the statutory age range when both
    age columns exist -- lowest age <= PRIMARY_MAX_AGE serves primary, highest
    age >= SECONDARY_MIN_AGE serves secondary -- so all-through and middle
    schools yield TWO rows (one per phase). Where an age value is null, or the
    age columns are absent, the coarse "Ofsted phase" label decides instead.

    When ``open_urns`` is supplied (and the frame has a "URN" column), schools
    whose URN is not in that set are dropped so closed/merged schools with a
    stale "latest inspection" row are not counted.

    Categories are "outstanding_primary"/"good_primary" and
    "outstanding_secondary"/"good_secondary"; ``urn`` is cast to Int64.
    """
    phase_label = pl.col("Ofsted phase")
    grade = pl.col("_ofsted_grade")

    candidates = _with_derived_grade(ofsted).filter(
        phase_label.is_in(["Primary", "Secondary"]) & grade.is_in(["1", "2"])
    )

    # Keep only currently-open schools when the GIAS open register is given.
    if open_urns is not None and "URN" in candidates.columns:
        candidates = candidates.filter(pl.col("URN").is_in(list(open_urns)))

    labelled_primary = phase_label == "Primary"
    labelled_secondary = phase_label == "Secondary"

    age_columns = {"Statutory lowest age", "Statutory highest age"}
    if age_columns.issubset(candidates.columns):
        lowest_age = pl.col("Statutory lowest age").cast(pl.Int64, strict=False)
        highest_age = pl.col("Statutory highest age").cast(pl.Int64, strict=False)
        # Age-based rule per school, with the Ofsted label as a per-row
        # fallback when that school's age value is missing/unparseable.
        serves_primary = (
            pl.when(lowest_age.is_not_null())
            .then(lowest_age <= PRIMARY_MAX_AGE)
            .otherwise(labelled_primary)
        )
        serves_secondary = (
            pl.when(highest_age.is_not_null())
            .then(highest_age >= SECONDARY_MIN_AGE)
            .otherwise(labelled_secondary)
        )
    else:
        serves_primary = labelled_primary
        serves_secondary = labelled_secondary

    def rows_for_phase(
        serves_phase: pl.Expr, outstanding_label: str, good_label: str
    ) -> pl.DataFrame:
        # Grade "1" gets the outstanding label; the only other grade left
        # after the filter above is "2", which gets the good label.
        category = (
            pl.when(grade == "1")
            .then(pl.lit(outstanding_label))
            .otherwise(pl.lit(good_label))
            .alias("category")
        )
        return candidates.filter(serves_phase).select(
            pl.col("URN").cast(pl.Int64).alias("urn"),
            category,
        )

    # Primary rows first, then secondary rows; a school may appear in both.
    return pl.concat(
        [
            rows_for_phase(serves_primary, "outstanding_primary", "good_primary"),
            rows_for_phase(
                serves_secondary, "outstanding_secondary", "good_secondary"
            ),
        ]
    )
|
||||
|
||||
|
||||
def _with_derived_grade(ofsted: pl.DataFrame) -> pl.DataFrame:
    """Add an ``_ofsted_grade`` column ("1"-"4" or null) to ``ofsted``.

    A graded OEIF result of "1".."4" always wins. Only when the graded column
    is null or "Not judged" does an ungraded "School remains Outstanding" /
    "School remains Good" outcome supply grade "1" / "2" -- and never when the
    outcome text contains "(Concerns)". Everything else gets null.
    """
    # Both columns are cast to Utf8 so the string predicates stay well-defined
    # even if a column is entirely null (read back as a Null dtype).
    graded_result = pl.col("Latest OEIF overall effectiveness").cast(
        pl.Utf8, strict=False
    )
    ungraded_outcome = pl.col("Ungraded inspection overall outcome").cast(
        pl.Utf8, strict=False
    )

    lacks_graded_result = graded_result.is_null() | (graded_result == "Not judged")
    flagged_concerns = ungraded_outcome.str.contains(r"\(Concerns\)")
    still_outstanding = (
        ungraded_outcome.str.starts_with("School remains Outstanding")
        & ~flagged_concerns
    )
    still_good = (
        ungraded_outcome.str.starts_with("School remains Good") & ~flagged_concerns
    )

    derived_grade = (
        pl.when(graded_result.is_in(["1", "2", "3", "4"]))
        .then(graded_result)
        .when(lacks_graded_result & still_outstanding)
        .then(pl.lit("1"))
        .when(lacks_graded_result & still_good)
        .then(pl.lit("2"))
        .otherwise(None)
    )
    return ofsted.with_columns(derived_grade.alias("_ofsted_grade"))
|
||||
|
||||
|
||||
def school_preference_bonuses(
    ofsted: pl.DataFrame,
    bonus_outstanding_km: float = PREF_BONUS_OUTSTANDING_KM,
    bonus_good_km: float = PREF_BONUS_GOOD_KM,
) -> pl.DataFrame:
    """Map each school's derived Ofsted grade to a preference bonus in km.

    Grade "1" earns ``bonus_outstanding_km`` and grade "2" ``bonus_good_km``;
    grades "3" and "4" receive the same magnitudes negated (they repel demand
    symmetrically). Schools with no derived grade are neutral (0.0).

    Rows with a null URN are discarded. Returns ``(urn, bonus_km)`` with
    ``urn`` as Int64 and exactly one row per URN (the first occurrence).
    """
    grade_to_bonus = {
        "1": bonus_outstanding_km,
        "2": bonus_good_km,
        "3": -bonus_good_km,
        "4": -bonus_outstanding_km,
    }
    urn = pl.col("URN").cast(pl.Int64).alias("urn")
    bonus_km = (
        pl.col("_ofsted_grade")
        .replace_strict(grade_to_bonus, default=0.0, return_dtype=pl.Float64)
        .alias("bonus_km")
    )

    with_known_urn = _with_derived_grade(ofsted).filter(pl.col("URN").is_not_null())
    per_inspection_row = with_known_urn.select(urn, bonus_km)
    return per_inspection_row.unique(subset="urn", keep="first")
|
||||
|
||||
|
||||
def phase_intakes(gias: pl.DataFrame) -> pl.DataFrame:
    """Split each school's fill target between the primary and secondary phases.

    Keeps schools whose ``type_group`` is in STATE_SCHOOL_TYPE_GROUPS, whose
    ``admissions_policy`` is null or anything other than "Selective", and
    which have non-null ``lat``/``lng``. The ages taught are parsed from
    ``age_range`` (e.g. "3–11" = ages 3..10); schools without a usable range
    or with a non-positive fill target are dropped.

    The fill target is max(pupils, capacity) with nulls treated as 0, so
    over-full schools keep their demonstrated size and under-full schools can
    admit up to capacity. It is spread over the cohort ages the school
    teaches, with ages 0-3 weighted by NURSERY_COHORT_WEIGHT and ages 16-30 by
    SIXTH_FORM_COHORT_WEIGHT; each phase receives the share of cohort weight
    falling inside its age band.

    Returns ``(urn, lat, lng, primary_intake, secondary_intake)``.
    """
    age_bounds = pl.col("age_range").str.extract_all(r"\d+")
    first_age = age_bounds.list.get(0, null_on_oob=True).cast(pl.Int64, strict=False)
    # The upper bound is a leaving age, so the oldest cohort actually taught
    # is one year younger: a "3-11" school teaches ages 3 through 10.
    last_age = (
        age_bounds.list.get(1, null_on_oob=True).cast(pl.Int64, strict=False) - 1
    )

    admissions_policy = pl.col("admissions_policy")
    is_state_funded = pl.col("type_group").is_in(STATE_SCHOOL_TYPE_GROUPS)
    is_non_selective = admissions_policy.is_null() | (admissions_policy != "Selective")
    has_coordinates = pl.col("lat").is_not_null() & pl.col("lng").is_not_null()

    eligible = gias.filter(is_state_funded & is_non_selective & has_coordinates)
    with_ages = eligible.with_columns(
        first_age.alias("_low"), last_age.alias("_high")
    ).filter(pl.col("_low").is_not_null() & (pl.col("_high") >= pl.col("_low")))

    fill_target = pl.max_horizontal(
        pl.col("pupils").fill_null(0), pl.col("capacity").fill_null(0)
    ).cast(pl.Float64)
    schools = with_ages.with_columns(fill_target.alias("_fill_target")).filter(
        pl.col("_fill_target") > 0
    )

    def cohort_weight(lo: int, hi: int, weight: float = 1.0) -> pl.Expr:
        """Weighted count of single-year cohorts in [lo, hi] that the school teaches."""
        cohorts_in_band = (
            pl.min_horizontal(pl.col("_high"), hi)
            - pl.max_horizontal(pl.col("_low"), lo)
            + 1
        )
        # Disjoint ranges produce a negative span; clip so they contribute 0.
        return (weight * cohorts_in_band.clip(lower_bound=0)).cast(pl.Float64)

    total_weight = (
        cohort_weight(0, 3, NURSERY_COHORT_WEIGHT)
        + cohort_weight(4, 15)
        + cohort_weight(16, 30, SIXTH_FORM_COHORT_WEIGHT)
    )
    primary_intake = (
        pl.col("_fill_target") * cohort_weight(*PRIMARY_AGES) / total_weight
    )
    secondary_intake = (
        pl.col("_fill_target") * cohort_weight(*SECONDARY_AGES) / total_weight
    )

    return schools.select(
        pl.col("urn").cast(pl.Int64),
        "lat",
        "lng",
        primary_intake.alias("primary_intake"),
        secondary_intake.alias("secondary_intake"),
    )
|
||||
|
||||
|
||||
def children_per_postcode(
    postcodes: pl.DataFrame, lsoa_children: pl.DataFrame
) -> pl.DataFrame:
    """Estimate how many primary- and secondary-age children live at each postcode.

    Census five-year age bands do not line up with school phases, so each
    phase takes one fifth of a band per single year of age it needs:
    primary (4-10) = 0.2 x aged_0_4 + aged_5_9 + 0.2 x aged_10_14, and
    secondary (11-15) = 0.8 x aged_10_14 + 0.2 x aged_15_19.

    Postcodes are inner-joined to their LSOA (``lsoa21cd`` = ``lsoa21``), and
    each LSOA total is divided evenly by the number of joined postcodes in
    that LSOA.

    Returns ``(postcode, lat, lng, primary_children, secondary_children)``.
    """
    primary_total = (
        0.2 * pl.col("aged_0_4") + pl.col("aged_5_9") + 0.2 * pl.col("aged_10_14")
    )
    secondary_total = 0.8 * pl.col("aged_10_14") + 0.2 * pl.col("aged_15_19")
    lsoa_totals = lsoa_children.select(
        "lsoa21",
        primary_total.alias("_lsoa_primary"),
        secondary_total.alias("_lsoa_secondary"),
    )

    joined = postcodes.join(
        lsoa_totals, left_on="lsoa21cd", right_on="lsoa21", how="inner"
    )
    # Number of postcodes sharing each LSOA, used as the even-split divisor.
    counted = joined.with_columns(pl.len().over("lsoa21cd").alias("_lsoa_postcodes"))

    postcodes_in_lsoa = pl.col("_lsoa_postcodes")
    return counted.select(
        "postcode",
        "lat",
        "lng",
        (pl.col("_lsoa_primary") / postcodes_in_lsoa).alias("primary_children"),
        (pl.col("_lsoa_secondary") / postcodes_in_lsoa).alias("secondary_children"),
    )
|
||||
|
||||
|
||||
def equilibrium_cutoffs(
    school_xy: np.ndarray,
    fill_target: np.ndarray,
    bonus_km: np.ndarray,
    pc_xy: np.ndarray,
    pc_children: np.ndarray,
    k: int = NEAREST_SCHOOL_CANDIDATES,
    max_iter: int = EQUILIBRIUM_MAX_ITER,
    tau_km: float = CHOICE_TEMPERATURE_KM,
) -> np.ndarray:
    """Market-clearing admission cutoff distance (km) per school.

    Deferred acceptance with distance priority, solved as cutoff dynamics
    (Azevedo & Leshno): cutoffs start unbounded; each round every child unit
    applies to its preferred feasible school(s) — a logit split over
    effective distance (distance - school bonus) among schools whose cutoff
    covers it, collapsing to the single best school when ``tau_km`` is 0 —
    and each oversubscribed school tightens its cutoff to its marginal
    admitted child's distance. Cutoffs only ever tighten, so the iteration
    converges.

    Args:
        school_xy: projected school coordinates, one row per school.
        fill_target: places to fill per school (same order as ``school_xy``).
        bonus_km: preference bonus per school, subtracted from distance.
        pc_xy: projected postcode coordinates, one row per postcode.
        pc_children: child mass per postcode; rows with 0 are ignored.
        k: number of nearest schools each postcode may apply to.
        max_iter: maximum number of tightening rounds.
        tau_km: logit temperature; ``<= 0`` means pure nearest-effective choice.

    Returns:
        Array of cutoffs, np.inf for schools that never fill (no binding
        cutoff). With no schools or no demand every cutoff stays np.inf.
    """
    n_schools = len(school_xy)
    cutoff = np.full(n_schools, np.inf)
    k = min(k, n_schools)
    demand = np.flatnonzero(pc_children > 0)
    # Nothing to query (no schools) or nobody applying: no school can fill.
    if k < 1 or len(demand) == 0:
        return cutoff

    weights = pc_children[demand]
    tree = cKDTree(school_xy)
    dist, cand = tree.query(pc_xy[demand], k=k, workers=-1)
    if k == 1:
        # query() squeezes the neighbour axis for k == 1; restore it.
        dist = dist[:, None]
        cand = cand[:, None]
    eff = dist - bonus_km[cand]

    rows = np.arange(len(demand))
    for _ in range(max_iter):
        # Schools whose cutoff excludes the postcode are not choosable.
        eff_feasible = np.where(dist <= cutoff[cand], eff, np.inf)
        if tau_km <= 0:
            choice = np.argmin(eff_feasible, axis=1)
            valid = np.isfinite(eff_feasible[rows, choice])
            chosen_school = cand[rows[valid], choice[valid]]
            chosen_dist = dist[rows[valid], choice[valid]]
            chosen_mass = weights[valid]
        else:
            z = -eff_feasible / tau_km
            z_max = z.max(axis=1, keepdims=True)
            # Subtract the row max for numerical stability; rows with no
            # feasible school have z_max == -inf, so shift by 0 instead.
            share = np.exp(z - np.where(np.isfinite(z_max), z_max, 0.0))
            share[~np.isfinite(eff_feasible)] = 0.0
            total = share.sum(axis=1, keepdims=True)
            mass = weights[:, None] * share / np.where(total > 0, total, 1.0)
            # Sub-thousandth-of-a-child applications only slow the sort down.
            keep = mass > 1e-3
            chosen_school = cand[keep]
            chosen_dist = dist[keep]
            chosen_mass = mass[keep]

        # No applications this round: indexing the empty segment below
        # would raise, and no cutoff can change anyway.
        if len(chosen_school) == 0:
            break

        # Group applications by school, nearest first within each school.
        order = np.lexsort((chosen_dist, chosen_school))
        s_sorted = chosen_school[order]
        d_sorted = chosen_dist[order]
        m_cum = np.cumsum(chosen_mass[order])
        boundaries = np.flatnonzero(np.diff(s_sorted)) + 1
        starts = np.concatenate(([0], boundaries))
        ends = np.concatenate((boundaries, [len(s_sorted)]))

        changed = False
        for start, end in zip(starts, ends):
            school = s_sorted[start]
            # Cumulative applicant mass within this school's segment.
            seg_cum = m_cum[start:end] - (m_cum[start - 1] if start else 0.0)
            if seg_cum[-1] <= fill_target[school]:
                continue
            # Distance of the applicant at which the fill target is reached.
            marginal = d_sorted[start + np.searchsorted(seg_cum, fill_target[school])]
            if marginal < cutoff[school]:
                cutoff[school] = marginal
                changed = True
        if not changed:
            break

    return cutoff
|
||||
|
||||
|
||||
def capacity_fill_radii(
    school_xy: np.ndarray,
    fill_target: np.ndarray,
    pc_xy: np.ndarray,
    pc_children: np.ndarray,
    max_radius_km: float = MAX_RADIUS_KM,
) -> np.ndarray:
    """Feasibility radius for schools without a binding cutoff.

    An undersubscribed school admits anyone who applies, so its catchment is
    bounded by plausibility rather than competition: the distance within
    which the local child population would cover its fill target. Capped at
    ``max_radius_km``.

    Args:
        school_xy: projected school coordinates, one row per school.
        fill_target: places to fill per school (same order as ``school_xy``).
        pc_xy: projected postcode coordinates, one row per postcode.
        pc_children: child mass per postcode; rows with 0 are ignored.
        max_radius_km: search limit and default radius.

    Returns:
        One radius per school. A school keeps ``max_radius_km`` when the
        (at most 4096) nearest demand postcodes within that distance do not
        reach its fill target, or when there are no demand postcodes at all.
    """
    demand = np.flatnonzero(pc_children > 0)
    radii = np.full(len(school_xy), max_radius_km)
    k = min(4096, len(demand))
    if k == 0:
        # No postcode has children, so no school can reach its target.
        return radii

    tree = cKDTree(pc_xy[demand])
    for i in range(len(school_xy)):
        dists, idx = tree.query(
            school_xy[i], k=k, distance_upper_bound=max_radius_km
        )
        # query() squeezes to scalars when k == 1; keep everything 1-D.
        dists = np.atleast_1d(dists)
        idx = np.atleast_1d(idx)
        # Neighbours beyond the distance bound come back with infinite distance.
        found = np.isfinite(dists)
        cum = np.cumsum(pc_children[demand[idx[found]]])
        if len(cum) and cum[-1] >= fill_target[i]:
            radii[i] = dists[found][np.searchsorted(cum, fill_target[i])]
    return radii
|
||||
|
||||
|
||||
def count_covering_catchments(
    pc_xy: np.ndarray,
    pc_valid: np.ndarray,
    school_xy: np.ndarray,
    school_radii: np.ndarray,
    n_postcodes: int,
) -> np.ndarray:
    """Return, for every postcode, the number of school radii that reach it.

    Only postcodes flagged in ``pc_valid`` are indexed and can be covered;
    all other entries of the ``n_postcodes``-long int32 result stay 0, as
    does the whole result when there are no schools.
    """
    totals = np.zeros(n_postcodes, dtype=np.int32)
    if len(school_xy) > 0:
        usable = np.flatnonzero(pc_valid)
        postcode_tree = cKDTree(pc_xy[usable])
        per_usable = np.zeros(len(usable), dtype=np.int32)
        # One ball query per school; each hit list indexes into `usable`.
        hits = postcode_tree.query_ball_point(school_xy, school_radii, workers=-1)
        for members in hits:
            per_usable[members] += 1
        totals[usable] = per_usable
    return totals
|
||||
|
||||
|
||||
def main():
    """CLI entry point: model catchment radii and write the requested outputs.

    Reads the Ofsted, GIAS, ArcGIS postcode and census-children parquets,
    solves the admissions model per phase, then writes per-postcode
    catchment counts (``--output``) and/or per-school radii
    (``--schools-output``).

    Raises:
        SystemExit: if neither ``--output`` nor ``--schools-output`` is given.
        ValueError: if no good+ primary/secondary schools are found.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Model school admission cutoff radii and count good+/outstanding "
            "primary/secondary catchments covering each postcode"
        )
    )
    parser.add_argument(
        "--ofsted", type=Path, required=True, help="Ofsted inspection parquet"
    )
    parser.add_argument(
        "--gias", type=Path, required=True, help="GIAS open-school parquet"
    )
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
    )
    parser.add_argument(
        "--lsoa-children",
        type=Path,
        required=True,
        help="Census 2021 children by LSOA parquet",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=None,
        help="Per-postcode counts parquet; omit for calibration runs that only "
        "need --schools-output",
    )
    parser.add_argument(
        "--schools-output",
        type=Path,
        default=None,
        help="Optional per-school catchment radii parquet (for calibration/debugging)",
    )
    parser.add_argument(
        "--bonus-outstanding-km",
        type=float,
        default=PREF_BONUS_OUTSTANDING_KM,
        help="Preference bonus for Outstanding schools (calibration sweeps)",
    )
    parser.add_argument(
        "--bonus-good-km",
        type=float,
        default=PREF_BONUS_GOOD_KM,
        help="Preference bonus for Good schools (calibration sweeps)",
    )
    parser.add_argument(
        "--demand-scale",
        type=float,
        default=DEMAND_SCALE,
        help="Share of resident children competing for state places",
    )
    parser.add_argument(
        "--choice-temperature-km",
        type=float,
        default=CHOICE_TEMPERATURE_KM,
        help="Logit choice temperature over effective distance",
    )
    args = parser.parse_args()

    # Fail fast: without an output there is no point loading the inputs and
    # solving both phases only to exit afterwards.
    if args.output is None and args.schools_output is None:
        raise SystemExit("Provide --output and/or --schools-output")

    gias = pl.read_parquet(args.gias)
    open_urns = set(
        gias.select(pl.col("urn").cast(pl.Int64, strict=False))
        .to_series()
        .drop_nulls()
        .to_list()
    )
    print(f"GIAS open register: {len(open_urns):,} open school URNs")

    ofsted = pl.read_parquet(args.ofsted)
    rated = classify_good_plus_schools(ofsted, open_urns=open_urns)
    if rated.is_empty():
        raise ValueError("No good+ primary/secondary Ofsted schools found")
    print(f"Good+ school/phase rows: {len(rated):,}")

    # Schools without an Ofsted-derived bonus compete with a bonus of 0.
    supply = phase_intakes(gias).join(
        school_preference_bonuses(
            ofsted,
            bonus_outstanding_km=args.bonus_outstanding_km,
            bonus_good_km=args.bonus_good_km,
        ),
        on="urn",
        how="left",
    ).with_columns(pl.col("bonus_km").fill_null(0.0))
    print(f"State schools in admissions model: {len(supply):,}")

    arcgis = pl.read_parquet(args.arcgis).select(
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lng"),
        "lsoa21cd",
        "doterm",
    )
    # Demand comes from postcodes with no termination date in an "E" LSOA.
    live = arcgis.filter(
        pl.col("doterm").is_null() & pl.col("lsoa21cd").str.starts_with("E")
    )
    demand = children_per_postcode(live, pl.read_parquet(args.lsoa_children))
    print(
        f"Demand postcodes: {len(demand):,} "
        f"({demand['primary_children'].sum():,.0f} primary-age, "
        f"{demand['secondary_children'].sum():,.0f} secondary-age children)"
    )

    # Shared local-km projection so assignment and coverage use one metric.
    pc_lats = arcgis["lat"].to_numpy()
    pc_lngs = arcgis["lng"].to_numpy()
    pc_valid = valid_uk_coords_mask(pc_lats, pc_lngs)
    origin_lat = float(np.mean(pc_lats[pc_valid]))
    pc_xy = _project_lat_lng_km(pc_lats, pc_lngs, origin_lat)

    demand_lats = demand["lat"].to_numpy()
    demand_lngs = demand["lng"].to_numpy()
    demand_valid = valid_uk_coords_mask(demand_lats, demand_lngs)
    demand_xy = _project_lat_lng_km(demand_lats, demand_lngs, origin_lat)

    school_xy = _project_lat_lng_km(
        supply["lat"].to_numpy(), supply["lng"].to_numpy(), origin_lat
    )

    radii = {}
    for phase in ("primary", "secondary"):
        in_phase = supply[f"{phase}_intake"].to_numpy() > 0
        targets = supply[f"{phase}_intake"].to_numpy()[in_phase]
        xy = school_xy[in_phase]
        # Postcodes with invalid coordinates contribute no demand.
        children = np.where(
            demand_valid,
            demand[f"{phase}_children"].to_numpy() * args.demand_scale,
            0.0,
        )
        print(f"Solving {phase} admissions for {in_phase.sum():,} schools...")
        cutoffs = equilibrium_cutoffs(
            xy,
            targets,
            supply["bonus_km"].to_numpy()[in_phase],
            demand_xy,
            children,
            tau_km=args.choice_temperature_km,
        )
        filled = np.isfinite(cutoffs)
        print(
            f" {filled.sum():,} schools have binding cutoffs "
            f"(median {np.median(cutoffs[filled]):.2f} km); "
            f"{(~filled).sum():,} undersubscribed"
        )
        # Schools that never fill get a capacity-based radius instead.
        fallback = capacity_fill_radii(
            xy[~filled], targets[~filled], demand_xy, children
        )
        raw = cutoffs.copy()
        raw[~filled] = fallback
        radii[phase] = pl.DataFrame(
            {
                "urn": supply["urn"].to_numpy()[in_phase],
                "phase": phase,
                "cutoff_km": raw,
                "filled": filled,
                "radius_km": np.clip(
                    raw * CUTOFF_CALIBRATION_FACTOR[phase],
                    MIN_RADIUS_KM,
                    MAX_RADIUS_KM,
                ),
            }
        )
        print(
            f" radius km: median {radii[phase]['radius_km'].median():.2f}, "
            f"p90 {radii[phase]['radius_km'].quantile(0.9):.2f}"
        )

    # Attach each rated school's phase radius; rated schools outside the
    # admissions model (special schools, selective schools, missing
    # headcounts) cannot be given a defensible radius and are dropped.
    rated = rated.with_columns(
        pl.col("category").str.split("_").list.get(1).alias("phase")
    )
    rated_with_radius = rated.join(
        pl.concat(list(radii.values())), on=["urn", "phase"], how="inner"
    ).join(supply.select("urn", "lat", "lng"), on="urn", how="inner")
    dropped = len(rated) - len(rated_with_radius)
    print(
        f"Rated school/phase rows with radii: {len(rated_with_radius):,} "
        f"(dropped {dropped:,}, incl. selective schools)"
    )

    if args.output is not None:
        category_counts = {}
        for category in {c for cats in SCHOOL_GROUPS.values() for c in cats}:
            cat = rated_with_radius.filter(pl.col("category") == category)
            cat_xy = _project_lat_lng_km(
                cat["lat"].to_numpy(), cat["lng"].to_numpy(), origin_lat
            )
            category_counts[category] = count_covering_catchments(
                pc_xy, pc_valid, cat_xy, cat["radius_km"].to_numpy(), len(arcgis)
            )
            print(f" {category}: {len(cat):,} schools")

        # A group's count is the sum over its member categories.
        result = pl.DataFrame(
            {
                "postcode": arcgis["postcode"],
                **{
                    f"{group}_catchments": sum(category_counts[c] for c in categories)
                    for group, categories in SCHOOL_GROUPS.items()
                },
            }
        )
        for group in SCHOOL_GROUPS:
            col = result[f"{group}_catchments"]
            print(f" {group}_catchments: mean {col.mean():.2f}, max {col.max()}")

        args.output.parent.mkdir(parents=True, exist_ok=True)
        result.write_parquet(args.output)
        size_mb = args.output.stat().st_size / (1024 * 1024)
        print(f"Wrote {args.output} ({size_mb:.1f} MB)")

    if args.schools_output is not None:
        schools_out = rated_with_radius.select(
            "urn", "category", "phase", "cutoff_km", "filled", "radius_km", "lat", "lng"
        )
        args.schools_output.parent.mkdir(parents=True, exist_ok=True)
        schools_out.write_parquet(args.schools_output)
        print(f"Wrote {args.schools_output}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,199 +0,0 @@
|
|||
"""Compute Ofsted-rated school proximity counts per postcode."""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
|
||||
from pipeline.utils.poi_counts import count_pois_per_postcode
|
||||
|
||||
SCHOOL_GROUPS = {
|
||||
"good_primary": ["good_primary", "outstanding_primary"],
|
||||
"good_secondary": ["good_secondary", "outstanding_secondary"],
|
||||
"outstanding_primary": ["outstanding_primary"],
|
||||
"outstanding_secondary": ["outstanding_secondary"],
|
||||
}
|
||||
|
||||
|
||||
# Age thresholds for deciding which phase(s) a school serves. A school serves
|
||||
# PRIMARY-age children if its statutory lowest age is <= 10, and SECONDARY-age
|
||||
# children if its statutory highest age is >= 12. All-through (e.g. 3-18) and
|
||||
# middle-deemed-secondary (e.g. 9-13) schools satisfy BOTH and so are counted in
|
||||
# both the primary and the secondary proximity metrics — Ofsted's coarse "Ofsted
|
||||
# phase" labels such schools as just "Secondary", which previously hid them from
|
||||
# every postcode's primary-school count.
|
||||
PRIMARY_MAX_AGE = 10
|
||||
SECONDARY_MIN_AGE = 12
|
||||
|
||||
|
||||
def classify_good_plus_schools(
    ofsted: pl.DataFrame, open_urns: set[int] | None = None
) -> pl.DataFrame:
    """Build the ``(postcode, category)`` rows used for proximity counts.

    Grade: a graded "Latest OEIF overall effectiveness" of "1"
    (outstanding) or "2" (good) is used as-is. Only when that column is
    null or "Not judged" does the "Ungraded inspection overall outcome"
    count: "School remains Outstanding…" maps to "1" and "School remains
    Good…" to "2", unless the outcome contains "(Concerns)". Any other
    school — including a genuine grade 3/4 — is dropped.

    Phase: only rows whose "Ofsted phase" is Primary or Secondary are
    considered. When both statutory age columns exist, a school serves
    primary if its lowest age is <= ``PRIMARY_MAX_AGE`` and secondary if
    its highest age is >= ``SECONDARY_MIN_AGE`` (so all-through and middle
    schools yield two rows); a null age, or missing age columns, falls back
    to the "Ofsted phase" label.

    When ``open_urns`` is given and a "URN" column exists, schools whose
    URN is not in that set are dropped.
    """
    # Cast to Utf8 so the string predicates stay well-defined even when a
    # column was read back with a Null dtype (entirely null).
    graded_col = pl.col("Latest OEIF overall effectiveness").cast(
        pl.Utf8, strict=False
    )
    ungraded_col = pl.col("Ungraded inspection overall outcome").cast(
        pl.Utf8, strict=False
    )
    lacks_grade = graded_col.is_null() | (graded_col == "Not judged")
    flagged = ungraded_col.str.contains(r"\(Concerns\)")
    still_outstanding = (
        ungraded_col.str.starts_with("School remains Outstanding") & ~flagged
    )
    still_good = ungraded_col.str.starts_with("School remains Good") & ~flagged

    grade = (
        pl.when(graded_col.is_in(["1", "2"]))
        .then(graded_col)
        .when(lacks_grade & still_outstanding)
        .then(pl.lit("1"))
        .when(lacks_grade & still_good)
        .then(pl.lit("2"))
        .otherwise(None)
    )
    kept = ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
    kept = kept.with_columns(grade.alias("_ofsted_grade"))
    kept = kept.filter(pl.col("_ofsted_grade").is_not_null())

    # Stale "latest inspection" rows of closed/merged schools are removed
    # when the open register is supplied.
    if open_urns is not None and "URN" in kept.columns:
        kept = kept.filter(pl.col("URN").is_in(list(open_urns)))

    label = pl.col("Ofsted phase")
    serves = {"primary": label == "Primary", "secondary": label == "Secondary"}
    if {"Statutory lowest age", "Statutory highest age"}.issubset(kept.columns):
        lowest = pl.col("Statutory lowest age").cast(pl.Int64, strict=False)
        highest = pl.col("Statutory highest age").cast(pl.Int64, strict=False)
        serves = {
            "primary": pl.when(lowest.is_not_null())
            .then(lowest <= PRIMARY_MAX_AGE)
            .otherwise(serves["primary"]),
            "secondary": pl.when(highest.is_not_null())
            .then(highest >= SECONDARY_MIN_AGE)
            .otherwise(serves["secondary"]),
        }
    kept = kept.with_columns(
        serves["primary"].alias("_serves_primary"),
        serves["secondary"].alias("_serves_secondary"),
    )

    # Grade 1 -> outstanding_<phase>, otherwise good_<phase>; a school that
    # serves both phases contributes one row to each.
    per_phase = [
        kept.filter(pl.col(f"_serves_{phase}")).with_columns(
            pl.when(pl.col("_ofsted_grade") == "1")
            .then(pl.lit(f"outstanding_{phase}"))
            .otherwise(pl.lit(f"good_{phase}"))
            .alias("category")
        )
        for phase in ("primary", "secondary")
    ]
    return pl.concat(per_phase).select(
        pl.col("Postcode").alias("postcode"),
        "category",
    )
|
||||
|
||||
|
||||
def main():
    """CLI entry point: count good+/outstanding schools within 5km and 2km.

    Classifies the Ofsted schools, locates each by its postcode via the
    ArcGIS parquet, counts schools per postcode at both radii and writes
    the joined result to ``--output``.

    Raises:
        ValueError: if no good+ school is found, or none matches a postcode.
    """
    parser = argparse.ArgumentParser(
        description="Count good+ and outstanding primary/secondary schools near each postcode"
    )
    parser.add_argument(
        "--ofsted", type=Path, required=True, help="Ofsted inspection parquet"
    )
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
    )
    parser.add_argument(
        "--gias",
        type=Path,
        default=None,
        help="GIAS open-school parquet; if given, only currently-open schools are counted",
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet path"
    )
    args = parser.parse_args()

    open_urns: set[int] | None = None
    if args.gias is not None:
        # Cast first, then drop nulls: the non-strict cast turns unparsable
        # URNs into null, which must not end up in the open-URN set.
        gias_urns = (
            pl.read_parquet(args.gias)
            .select(pl.col("urn").cast(pl.Int64, strict=False))
            .to_series()
            .drop_nulls()
        )
        open_urns = set(gias_urns.to_list())
        print(f"GIAS open register: {len(open_urns):,} open school URNs")

    ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted), open_urns=open_urns)
    if ofsted.is_empty():
        raise ValueError("No good+ primary/secondary Ofsted schools found")

    print(f"Good+ schools: {len(ofsted):,}")
    print(
        "Outstanding schools: "
        f"{ofsted.filter(pl.col('category').str.starts_with('outstanding')).height:,}"
    )

    # Join with arcgis to get lat/lng for each school's postcode
    arcgis = pl.read_parquet(args.arcgis).select(
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lng"),
    )

    schools = ofsted.join(arcgis, on="postcode", how="inner")
    if schools.is_empty():
        raise ValueError("No Ofsted schools matched ArcGIS postcode coordinates")
    print(f"Schools with coordinates: {len(schools):,}")

    # Load all postcodes for proximity counting
    postcodes = arcgis.rename({"lng": "lon"})

    counts_5km = count_pois_per_postcode(
        postcodes, schools, radius_km=5, groups=SCHOOL_GROUPS
    )
    counts_2km = count_pois_per_postcode(
        postcodes, schools, radius_km=2, groups=SCHOOL_GROUPS
    )

    result = counts_5km.join(counts_2km, on="postcode")

    args.output.parent.mkdir(parents=True, exist_ok=True)
    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -8,6 +8,7 @@ import polars as pl
|
|||
|
||||
from pipeline.transform.join_epc_pp import (
|
||||
EPC_SOURCE_COLUMNS,
|
||||
_join_address_parts,
|
||||
_run,
|
||||
_scan_epc_certificates,
|
||||
)
|
||||
|
|
@ -111,6 +112,89 @@ def test_scan_epc_certificates_supports_domestic_zip(tmp_path: Path):
|
|||
assert df.schema["number_habitable_rooms"] == pl.Int16
|
||||
|
||||
|
||||
def test_join_address_parts_empty_string_components():
    # Price-paid SAON/PAON/STREET are empty strings (not null) when absent;
    # concat_str(ignore_nulls=True) alone leaked the separator into the
    # display address (' 10 PALACE GREEN') and doubled it for empty middle
    # components. Empty/whitespace-only parts must contribute nothing.
    df = pl.DataFrame(
        {
            "saon": ["", "FLAT 1", "FLAT 1", "FLAT 21", "", None, " ", " FLAT 2"],
            "paon": ["10", "10", "", "82", "", None, "10", "11 "],
            "street": [
                "PALACE GREEN",
                "HIGH STREET",
                "HIGH STREET",
                "",
                "",
                None,
                "PALACE GREEN",
                "STATION ROAD",
            ],
        }
    )
    out = df.select(
        _join_address_parts("saon", "paon", "street").alias("address")
    ).get_column("address")

    assert out.to_list() == [
        "10 PALACE GREEN",  # empty saon -> no leading space
        "FLAT 1 10 HIGH STREET",  # normal three-part address is unchanged
        "FLAT 1 HIGH STREET",  # empty middle component -> no double space
        "FLAT 21 82",  # empty street -> no trailing space
        None,  # all-empty -> null, not whitespace junk
        None,  # all-null -> null
        "10 PALACE GREEN",  # whitespace-only component treated as empty
        "FLAT 2 11 STATION ROAD",  # per-component padding is stripped
    ]
    # Invariant: every produced address is trimmed and single-spaced.
    produced = out.drop_nulls()
    assert produced.str.starts_with(" ").sum() == 0
    assert produced.str.ends_with(" ").sum() == 0
    # Single-spaced means no run of two spaces; a one-space pattern would
    # match every multi-word address expected above.
    assert produced.str.contains("  ", literal=True).sum() == 0
|
||||
|
||||
|
||||
def test_run_builds_clean_pp_address_from_empty_string_saon(tmp_path: Path):
    # Price-paid rows usually carry saon == "" rather than null; the
    # pp_address written by _run must not start with a stray separator.
    csv_text = io.StringIO()
    epc_writer = csv.DictWriter(csv_text, fieldnames=EPC_SOURCE_COLUMNS)
    epc_writer.writeheader()
    epc_writer.writerow(_row())

    epc_zip = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(epc_zip, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        zf.writestr("certificates-2024.csv", csv_text.getvalue())

    sale = {
        "price": [250_000],
        "date_of_transfer": [date(2024, 2, 3)],
        "property_type": ["T"],
        "postcode": ["AA1 1AA"],
        "paon": ["1"],
        "saon": [""],
        "street": ["Example Street"],
        "locality": [""],
        "town_city": ["Exampletown"],
        "duration": ["F"],
        "old_new": ["N"],
        "ppd_category": ["A"],
    }
    pp_parquet = tmp_path / "price-paid.parquet"
    pl.DataFrame(sale).write_parquet(pp_parquet)

    joined_parquet = tmp_path / "epc-pp.parquet"
    _run(epc_zip, pp_parquet, joined_parquet, tmp_path)

    joined = pl.read_parquet(joined_parquet)

    assert joined.height == 1
    # No leading space, and the clean address still matches its EPC record.
    addresses = joined.select("pp_address", "epc_address").to_dicts()
    assert addresses == [
        {"pp_address": "1 Example Street", "epc_address": "1 Example Street"}
    ]
|
||||
|
||||
|
||||
def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
|
||||
zip_path = tmp_path / "domestic-csv.zip"
|
||||
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
|
||||
|
|
|
|||
|
|
@ -304,7 +304,7 @@ def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
|
|||
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
|
||||
poi_counts=_by_postcode({}),
|
||||
noise=_by_postcode({}),
|
||||
school_proximity=_by_postcode({}),
|
||||
school_catchments=_by_postcode({}),
|
||||
conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
|
||||
tree_density=None,
|
||||
broadband=pl.LazyFrame(
|
||||
|
|
@ -362,7 +362,7 @@ def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
|
|||
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
|
||||
poi_counts=_by_postcode({}),
|
||||
noise=_by_postcode({}),
|
||||
school_proximity=_by_postcode({}),
|
||||
school_catchments=_by_postcode({}),
|
||||
conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
|
||||
tree_density=None,
|
||||
broadband=broadband,
|
||||
|
|
@ -1057,7 +1057,7 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
|
|||
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
|
||||
poi_counts=_by_postcode({}),
|
||||
noise=_by_postcode({}),
|
||||
school_proximity=_by_postcode({}),
|
||||
school_catchments=_by_postcode({}),
|
||||
conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
|
||||
tree_density=None,
|
||||
broadband=pl.LazyFrame(
|
||||
|
|
|
|||
|
|
@ -1,9 +1,11 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.transform.poi_proximity import (
|
||||
GREENSPACE_PARK_FUNCTIONS,
|
||||
POI_GROUPS_2KM,
|
||||
_build_poi_category_groups,
|
||||
_dynamic_poi_metric_renames,
|
||||
_greenspace_count_frame,
|
||||
_groceries_categories,
|
||||
)
|
||||
from pipeline.utils.poi_counts import count_pois_per_postcode
|
||||
|
|
@ -88,3 +90,84 @@ def test_dynamic_poi_metric_renames_support_park_count_options() -> None:
|
|||
"parks_2km": "Number of amenities (Park) within 2km",
|
||||
"parks_5km": "Number of amenities (Park) within 5km",
|
||||
}
|
||||
|
||||
|
||||
def test_groceries_categories_exclude_speciality_food_retail() -> None:
    """Bakeries, butchers, delis and off-licences are speciality retail and
    must be left out of the static groceries metric; Supermarket,
    Convenience Store, Greengrocer and brand categories (here Tesco) stay."""
    grocery_categories = [
        "Tesco",
        "Supermarket",
        "Convenience Store",
        "Greengrocer",
        "Bakery",
        "Butcher & Fishmonger",
        "Deli & Specialty",
        "Off-Licence",
    ]
    pois = pl.DataFrame(
        {
            "category": grocery_categories + ["Café"],
            "group": ["Groceries"] * 8 + ["Leisure"],
            "lat": [51.5] * 9,
            "lng": [-0.1] * 9,
        }
    )

    expected = ["Convenience Store", "Greengrocer", "Supermarket", "Tesco"]
    assert _groceries_categories(pois) == expected
|
||||
|
||||
|
||||
def test_park_group_excludes_playgrounds_and_play_space() -> None:
    # Only Public Park Or Garden and Playing Field (open recreation grounds)
    # make up the Park group; "Play Space" (playgrounds) is not listed.
    park_functions = ["Public Park Or Garden", "Playing Field"]
    assert GREENSPACE_PARK_FUNCTIONS == {"parks": park_functions}
|
||||
|
||||
|
||||
def test_greenspace_count_frame_collapses_to_one_row_per_site() -> None:
    # Input rows: three access points of site-a (which has a centroid), one
    # access point of site-b (no centroid), and one row with a null site_id.
    access_points = pl.DataFrame(
        {
            "lat": [51.50, 51.51, 51.52, 53.0, 54.0],
            "lng": [-0.10, -0.11, -0.12, -2.0, -3.0],
            "category": ["Public Park Or Garden"] * 3
            + ["Playing Field", "Public Park Or Garden"],
            "site_id": ["site-a", "site-a", "site-a", "site-b", None],
            "site_lat": [51.505, 51.505, 51.505, None, None],
            "site_lng": [-0.105, -0.105, -0.105, None, None],
        }
    )

    collapsed = _greenspace_count_frame(access_points).sort("lat")

    # site-a's three rows become one; site-b and the null-site row remain.
    assert collapsed.height == 3

    # With a centroid available, the site is represented by the centroid…
    first_site = collapsed.filter(pl.col("site_id") == "site-a")
    assert first_site["lat"].to_list() == [51.505]
    assert first_site["lng"].to_list() == [-0.105]

    # …and by its first access point otherwise.
    second_site = collapsed.filter(pl.col("site_id") == "site-b")
    assert second_site["lat"].to_list() == [53.0]
|
||||
|
||||
|
||||
def test_greenspace_count_frame_passes_legacy_parquet_through() -> None:
    # A frame without the site_id column (the older parquet layout) must be
    # returned unchanged rather than crash, keeping the access-point grain.
    columns = {
        "lat": [51.50, 51.51],
        "lng": [-0.10, -0.11],
        "category": ["Public Park Or Garden", "Play Space"],
    }
    legacy = pl.DataFrame(columns)

    passed_through = _greenspace_count_frame(legacy)
    assert passed_through.equals(legacy)
|
||||
|
|
|
|||
354
pipeline/transform/test_school_catchments.py
Normal file
354
pipeline/transform/test_school_catchments.py
Normal file
|
|
@ -0,0 +1,354 @@
|
|||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform.school_catchments import (
|
||||
capacity_fill_radii,
|
||||
children_per_postcode,
|
||||
classify_good_plus_schools,
|
||||
count_covering_catchments,
|
||||
equilibrium_cutoffs,
|
||||
phase_intakes,
|
||||
school_preference_bonuses,
|
||||
)
|
||||
|
||||
|
||||
def _school(phase, oeif, ungraded, urn=100000):
|
||||
return {
|
||||
"URN": urn,
|
||||
"Postcode": "AA1 1AA",
|
||||
"Ofsted phase": phase,
|
||||
"Latest OEIF overall effectiveness": oeif,
|
||||
"Ungraded inspection overall outcome": ungraded,
|
||||
}
|
||||
|
||||
|
||||
def _classify(rows):
    """Classify ``rows`` and return the set of ``(urn, category)`` pairs."""
    classified = classify_good_plus_schools(pl.DataFrame(rows))
    pairs = set()
    for record in classified.to_dicts():
        pairs.add((record["urn"], record["category"]))
    return pairs
|
||||
|
||||
|
||||
def test_legacy_oeif_grades_1_and_2_are_kept():
    # Graded result "1" is outstanding and "2" is good, in either phase.
    graded = [
        ("Primary", "1", 1),
        ("Primary", "2", 2),
        ("Secondary", "1", 3),
        ("Secondary", "2", 4),
    ]
    rows = [_school(phase, grade, None, urn) for phase, grade, urn in graded]

    expected = {
        (1, "outstanding_primary"),
        (2, "good_primary"),
        (3, "outstanding_secondary"),
        (4, "good_secondary"),
    }
    assert _classify(rows) == expected
|
||||
|
||||
|
||||
def test_grades_3_and_4_are_excluded():
    # Graded results below "2" never produce a good+ category.
    rows = [_school("Primary", grade, None) for grade in ("3", "4")]
    classified = _classify(rows)
    assert classified == set()
|
||||
|
||||
|
||||
def test_ungraded_remains_good_is_recovered_when_no_graded_result():
    # When the graded column is null or "Not judged", the ungraded outcome
    # decides the category instead.
    null_graded = _school("Primary", None, "School remains Good", 1)
    not_judged = _school("Secondary", "Not judged", "School remains Outstanding", 2)
    # An "(Improving)" suffix still counts as good+.
    improving = _school("Primary", None, "School remains Good (Improving) - S5 Next", 3)

    classified = _classify([null_graded, not_judged, improving])

    assert classified == {
        (1, "good_primary"),
        (2, "outstanding_secondary"),
        (3, "good_primary"),
    }
|
||||
|
||||
|
||||
def test_ungraded_concerns_are_not_good_plus():
    # A "(Concerns)" suffix flags issues warranting earlier re-inspection, so
    # such schools must NOT be classified as good+.
    outcomes = [
        ("Primary", "School remains Good (Concerns) - S5 Next", 1),
        ("Secondary", "School remains Outstanding (Concerns) - S5 Next", 2),
    ]
    rows = [_school(phase, None, outcome, urn) for phase, outcome, urn in outcomes]
    classified = _classify(rows)
    assert classified == set()
|
||||
|
||||
|
||||
def test_ungraded_non_good_outcomes_are_excluded():
    # Ungraded outcomes that do not say "remains Good/Outstanding" (or are
    # missing altogether) yield no category.
    outcomes = ("Some aspects not as strong", "Standards maintained", None)
    rows = [_school("Primary", None, outcome) for outcome in outcomes]
    classified = _classify(rows)
    assert classified == set()
|
||||
|
||||
|
||||
def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
    # The graded result wins: a real grade 3 is not promoted by an ungraded
    # "remains Good" outcome on the same row.
    conflicting = _school("Primary", "3", "School remains Good")
    classified = _classify([conflicting])
    assert classified == set()
|
||||
|
||||
|
||||
def test_non_primary_secondary_phases_excluded():
    # Good grades on phases other than Primary/Secondary are ignored.
    other_phases = [("Nursery", "1"), ("Not applicable", "2")]
    rows = [_school(phase, grade, None) for phase, grade in other_phases]
    classified = _classify(rows)
    assert classified == set()
|
||||
|
||||
|
||||
def _aged_school(phase, oeif, low, high, urn=100000):
|
||||
return {
|
||||
"URN": urn,
|
||||
"Postcode": "AA1 1AA",
|
||||
"Ofsted phase": phase,
|
||||
"Latest OEIF overall effectiveness": oeif,
|
||||
"Ungraded inspection overall outcome": None,
|
||||
"Statutory lowest age": low,
|
||||
"Statutory highest age": high,
|
||||
}
|
||||
|
||||
|
||||
def test_all_through_school_counts_toward_both_primary_and_secondary():
    # Ofsted labels an all-through school (ages 3-18) "Secondary", yet it also
    # teaches primary-age children, so it must appear under BOTH categories.
    all_through = _aged_school("Secondary", "2", 3, 18, 1)

    classified = _classify([all_through])

    assert classified == {(1, "good_primary"), (1, "good_secondary")}
|
||||
|
||||
|
||||
def test_age_ranges_assign_single_phase_for_standard_schools():
    # Standard 4-11 and 11-16 schools land in exactly one phase; a 9-13 middle
    # school straddles the boundary and is counted in both.
    primary_only = _aged_school("Primary", "1", 4, 11, 1)
    secondary_only = _aged_school("Secondary", "2", 11, 16, 2)
    middle = _aged_school("Secondary", "1", 9, 13, 3)

    classified = _classify([primary_only, secondary_only, middle])

    expected = {
        (1, "outstanding_primary"),
        (2, "good_secondary"),
        (3, "outstanding_primary"),
        (3, "outstanding_secondary"),
    }
    assert classified == expected
|
||||
|
||||
|
||||
def test_closed_schools_excluded_when_open_register_given():
    # Only URN 111 is listed in the open register passed to the classifier.
    listed_open = _aged_school("Primary", "1", 4, 11, 111)
    not_listed = _aged_school("Secondary", "2", 11, 16, 222)
    frame = pl.DataFrame([listed_open, not_listed])

    result = classify_good_plus_schools(frame, open_urns={111})

    pairs = set()
    for record in result.to_dicts():
        pairs.add((record["urn"], record["category"]))
    # URN 222 is absent from the open register, so it is dropped.
    assert pairs == {(111, "outstanding_primary")}
|
||||
|
||||
|
||||
def _gias_row(
|
||||
urn,
|
||||
type_group="Academies",
|
||||
age_range="4–11",
|
||||
pupils=210,
|
||||
capacity=None,
|
||||
admissions_policy=None,
|
||||
):
|
||||
return {
|
||||
"urn": urn,
|
||||
"name": f"School {urn}",
|
||||
"lat": 51.5,
|
||||
"lng": -0.1,
|
||||
"type_group": type_group,
|
||||
"age_range": age_range,
|
||||
"pupils": pupils,
|
||||
"capacity": capacity,
|
||||
"admissions_policy": admissions_policy,
|
||||
}
|
||||
|
||||
|
||||
def test_phase_intakes_prorates_fill_target_over_weighted_cohorts():
    schools = [
        # 4-11 covers cohorts 4..10, all 7 of them primary: the whole fill
        # target goes to primary.
        _gias_row(1, age_range="4–11", pupils=210),
        # 11-16 covers cohorts 11..15, all 5 of them secondary.
        _gias_row(2, age_range="11–16", pupils=500),
        # 3-11 covers cohorts 3..10; the nursery year weighs 0.5, so primary
        # receives 7 of the 7.5 cohort weights.
        _gias_row(3, age_range="3–11", pupils=240),
        # All-through 4-16 covers cohorts 4..15: 7/12 primary, 5/12 secondary.
        _gias_row(4, age_range="4–16", pupils=1200),
        # 11-18 covers cohorts 11..17; each sixth-form year weighs 0.6, so
        # secondary receives 5 of the 6.2 cohort weights.
        _gias_row(5, age_range="11–18", pupils=1240),
    ]

    intakes = phase_intakes(pl.DataFrame(schools)).sort("urn")

    primary = intakes["primary_intake"].to_list()
    secondary = intakes["secondary_intake"].to_list()
    assert primary == [210.0, 0.0, 224.0, 700.0, 0.0]
    assert secondary == [0.0, 500.0, 0.0, 500.0, 1000.0]
|
||||
|
||||
|
||||
def test_phase_intakes_excludes_non_state_and_selective_schools():
    schools = [
        _gias_row(1, type_group="Independent schools"),
        _gias_row(2, type_group="Special schools"),
        _gias_row(3, type_group="Welsh schools"),
        # Grammar-school intakes are test-based and region-wide, so a distance
        # catchment for them would be fabricated.
        _gias_row(4, admissions_policy="Selective"),
        _gias_row(5, pupils=None, capacity=300),
        _gias_row(6, pupils=None, capacity=None),  # no usable headcount
        _gias_row(7, age_range=None),  # no parsable cohorts
        # An over-full school keeps its demonstrated size.
        _gias_row(8, pupils=350, capacity=300),
        _gias_row(9, admissions_policy="Non-selective"),
    ]

    intakes = phase_intakes(pl.DataFrame(schools)).sort("urn")

    surviving_urns = intakes["urn"].to_list()
    primary = intakes["primary_intake"].to_list()
    assert surviving_urns == [5, 8, 9]
    assert primary == [300.0, 350.0, 210.0]
|
||||
|
||||
|
||||
def test_school_preference_bonuses_follow_derived_grade():
    # (graded result, ungraded outcome) per school; URNs are assigned 1..6 in
    # this order.
    inspections = [
        ("1", None),
        ("2", None),
        ("3", None),
        ("4", None),
        (None, "Some aspects not as strong"),  # unrated
        ("Not judged", "School remains Good"),
    ]
    rows = [
        _school("Primary", oeif, ungraded, urn)
        for urn, (oeif, ungraded) in enumerate(inspections, start=1)
    ]

    bonus_frame = school_preference_bonuses(
        pl.DataFrame(rows), bonus_outstanding_km=1.0, bonus_good_km=0.5
    )
    bonuses = dict(bonus_frame.iter_rows())

    assert bonuses == {1: 1.0, 2: 0.5, 3: -0.5, 4: -1.0, 5: 0.0, 6: 0.5}
|
||||
|
||||
|
||||
def test_children_per_postcode_prorates_bands_and_splits_lsoa_evenly():
    # Two postcodes share LSOA E01000001; the third has E01000002 to itself.
    postcode_columns = {
        "postcode": ["AA1 1AA", "AA1 1AB", "BB2 2BB"],
        "lat": [51.5, 51.5, 52.0],
        "lng": [-0.1, -0.1, -0.2],
        "lsoa21cd": ["E01000001", "E01000001", "E01000002"],
    }
    band_columns = {
        "lsoa21": ["E01000001", "E01000002"],
        "aged_0_4": [100, 30],
        "aged_5_9": [100, 10],
        "aged_10_14": [100, 20],
        "aged_15_19": [100, 40],
    }
    postcodes = pl.DataFrame(postcode_columns)
    lsoa_children = pl.DataFrame(band_columns)

    result = children_per_postcode(postcodes, lsoa_children).sort("postcode")

    # Primary 4-10 = 0.2*aged_0_4 + aged_5_9 + 0.2*aged_10_14: 140 shared by
    # the two-postcode LSOA, 20 for the single-postcode LSOA.
    primary = result["primary_children"].to_list()
    assert primary == [70.0, 70.0, 20.0]
    # Secondary 11-15 = 0.8*aged_10_14 + 0.2*aged_15_19: 100 shared by two; 24.
    secondary = result["secondary_children"].to_list()
    assert secondary == [50.0, 50.0, 24.0]
|
||||
|
||||
|
||||
def test_equilibrium_cutoff_tightens_to_marginal_admitted_distance():
    # A single 10-place school at the origin, with 5 children in each of three
    # postcodes 1km, 2km and 3km away. The nearest two postcodes fill it
    # exactly, so the cutoff lands on the marginal admitted child's distance
    # and the 3km postcode is shut out.
    school_xy = np.array([[0.0, 0.0]])
    places = np.array([10.0])
    bonuses = np.array([0.0])
    postcode_xy = np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]])
    children = np.array([5.0, 5.0, 5.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonuses, postcode_xy, children, tau_km=0.0
    )

    assert cutoffs.tolist() == [2.0]
|
||||
|
||||
|
||||
def test_equilibrium_rejected_demand_cascades_to_next_school():
    # School A (5 places) sits at the origin and school B (5 places) 10km away.
    # P1 (1km, 5 children) and P2 (1.5km, 5 children) both prefer A. A fills
    # from P1 alone and tightens its cutoff to 1km, which pushes P2 out to B.
    # B never exceeds its target, so its cutoff stays non-binding.
    school_xy = np.array([[0.0, 0.0], [10.0, 0.0]])
    places = np.array([5.0, 5.0])
    bonuses = np.array([0.0, 0.0])
    postcode_xy = np.array([[1.0, 0.0], [1.5, 0.0]])
    children = np.array([5.0, 5.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonuses, postcode_xy, children, tau_km=0.0
    )

    cutoff_a, cutoff_b = cutoffs[0], cutoffs[1]
    assert cutoff_a == 1.0
    assert np.isinf(cutoff_b)
|
||||
|
||||
|
||||
def test_equilibrium_preference_bonus_steers_demand_to_better_school():
    # Both schools are the same distance from the only postcode, but school A
    # carries a 0.5km bonus for its better rating, so every child picks A and
    # B attracts nobody.
    school_xy = np.array([[0.0, 0.0], [2.0, 0.0]])
    places = np.array([5.0, 5.0])
    bonuses = np.array([0.5, 0.0])
    postcode_xy = np.array([[1.0, 0.0]])
    children = np.array([10.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonuses, postcode_xy, children, tau_km=0.0
    )

    cutoff_a, cutoff_b = cutoffs[0], cutoffs[1]
    assert cutoff_a == 1.0
    assert np.isinf(cutoff_b)
|
||||
|
||||
|
||||
def test_equilibrium_logit_choice_smears_demand_across_schools():
    # With a positive temperature, demand is shared between schools instead of
    # all going to a single one, so a school that deterministic choice would
    # leave empty now receives applications too.
    school_xy = np.array([[0.0, 0.0], [2.0, 0.0]])
    places = np.array([4.0, 4.0])
    bonuses = np.array([0.0, 0.0])
    postcode_xy = np.array([[1.0, 0.0]])
    children = np.array([10.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonuses, postcode_xy, children, tau_km=1.0
    )

    # Both schools are 1km from the postcode (equal utility), so each draws
    # half of the 10 children. That exceeds both 4-place fill targets, hence
    # both cutoffs bind at the postcode's distance.
    assert cutoffs.tolist() == [1.0, 1.0]
|
||||
|
||||
|
||||
def test_capacity_fill_radii_covers_fill_target_population():
    # First school needs 6 children: the postcodes at 1km (5) and 2km (5)
    # cumulate past that target at 2km. Second school needs 1000, more than
    # exist inside the cap, so it keeps the cap as its radius.
    school_xy = np.array([[0.0, 0.0], [0.0, 0.0]])
    fill_targets = np.array([6.0, 1000.0])
    postcode_xy = np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]])
    children = np.array([5.0, 5.0, 5.0])

    radii = capacity_fill_radii(
        school_xy, fill_targets, postcode_xy, children, max_radius_km=25.0
    )

    assert radii.tolist() == [2.0, 25.0]
|
||||
|
||||
|
||||
def test_count_covering_catchments_respects_radius_and_validity():
    postcode_points = np.array([[0.0, 0.0], [3.0, 0.0], [10.0, 0.0], [0.5, 0.0]])
    valid_mask = np.array([True, True, True, False])
    school_points = np.array([[0.0, 0.0], [2.0, 0.0]])
    catchment_radii = np.array([4.0, 1.5])

    counts = count_covering_catchments(
        postcode_points, valid_mask, school_points, catchment_radii, 4
    )

    # Postcode 0 falls inside school 0 only (school 1 is 2km away, beyond its
    # 1.5km radius); postcode 1 falls inside both; postcode 2 inside neither;
    # postcode 3 is flagged invalid, so it scores 0 despite being close.
    expected = [1, 2, 0, 0]
    assert counts.tolist() == expected
|
||||
|
||||
|
||||
def test_count_covering_catchments_empty_schools():
    # With no schools at all, every postcode is covered by zero catchments.
    postcode_points = np.zeros((2, 2))
    valid_mask = np.array([True, True])
    no_schools = np.empty((0, 2))
    no_radii = np.empty(0)

    counts = count_covering_catchments(
        postcode_points, valid_mask, no_schools, no_radii, 2
    )

    assert counts.tolist() == [0, 0]
|
||||
|
|
@ -1,139 +0,0 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.transform.school_proximity import classify_good_plus_schools
|
||||
|
||||
|
||||
def _school(phase, oeif, ungraded, postcode="AA1 1AA"):
|
||||
return {
|
||||
"Postcode": postcode,
|
||||
"Ofsted phase": phase,
|
||||
"Latest OEIF overall effectiveness": oeif,
|
||||
"Ungraded inspection overall outcome": ungraded,
|
||||
}
|
||||
|
||||
|
||||
def _classify(rows):
|
||||
result = classify_good_plus_schools(pl.DataFrame(rows))
|
||||
return {(r["postcode"], r["category"]) for r in result.to_dicts()}
|
||||
|
||||
|
||||
def test_legacy_oeif_grades_1_and_2_are_kept():
|
||||
rows = [
|
||||
_school("Primary", "1", None, "AA1 1AA"),
|
||||
_school("Primary", "2", None, "AA1 1AB"),
|
||||
_school("Secondary", "1", None, "AA1 1AC"),
|
||||
_school("Secondary", "2", None, "AA1 1AD"),
|
||||
]
|
||||
assert _classify(rows) == {
|
||||
("AA1 1AA", "outstanding_primary"),
|
||||
("AA1 1AB", "good_primary"),
|
||||
("AA1 1AC", "outstanding_secondary"),
|
||||
("AA1 1AD", "good_secondary"),
|
||||
}
|
||||
|
||||
|
||||
def test_grades_3_and_4_are_excluded():
|
||||
rows = [_school("Primary", "3", None), _school("Primary", "4", None)]
|
||||
assert _classify(rows) == set()
|
||||
|
||||
|
||||
def test_ungraded_remains_good_is_recovered_when_no_graded_result():
|
||||
# Null and "Not judged" OEIF fall back to the ungraded outcome.
|
||||
rows = [
|
||||
_school("Primary", None, "School remains Good", "AA1 1AA"),
|
||||
_school("Secondary", "Not judged", "School remains Outstanding", "AA1 1AB"),
|
||||
# "(Improving)" is still good+ ...
|
||||
_school("Primary", None, "School remains Good (Improving) - S5 Next", "AA1 1AE"),
|
||||
]
|
||||
assert _classify(rows) == {
|
||||
("AA1 1AA", "good_primary"),
|
||||
("AA1 1AB", "outstanding_secondary"),
|
||||
("AA1 1AE", "good_primary"),
|
||||
}
|
||||
|
||||
|
||||
def test_ungraded_concerns_are_not_good_plus():
|
||||
# "(Concerns)" outcomes signal issues warranting earlier re-inspection and
|
||||
# must NOT be counted as good+ schools.
|
||||
rows = [
|
||||
_school("Primary", None, "School remains Good (Concerns) - S5 Next", "AA1 1AC"),
|
||||
_school(
|
||||
"Secondary",
|
||||
None,
|
||||
"School remains Outstanding (Concerns) - S5 Next",
|
||||
"AA1 1AD",
|
||||
),
|
||||
]
|
||||
assert _classify(rows) == set()
|
||||
|
||||
|
||||
def test_ungraded_non_good_outcomes_are_excluded():
|
||||
rows = [
|
||||
_school("Primary", None, "Some aspects not as strong"),
|
||||
_school("Primary", None, "Standards maintained"),
|
||||
_school("Primary", None, None),
|
||||
]
|
||||
assert _classify(rows) == set()
|
||||
|
||||
|
||||
def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
|
||||
# A real grade 3 must not be promoted by an ungraded "remains Good".
|
||||
rows = [_school("Primary", "3", "School remains Good")]
|
||||
assert _classify(rows) == set()
|
||||
|
||||
|
||||
def test_non_primary_secondary_phases_excluded():
|
||||
rows = [
|
||||
_school("Nursery", "1", None),
|
||||
_school("Not applicable", "2", None),
|
||||
]
|
||||
assert _classify(rows) == set()
|
||||
|
||||
|
||||
def _aged_school(phase, oeif, low, high, postcode="AA1 1AA"):
|
||||
return {
|
||||
"Postcode": postcode,
|
||||
"Ofsted phase": phase,
|
||||
"Latest OEIF overall effectiveness": oeif,
|
||||
"Ungraded inspection overall outcome": None,
|
||||
"URN": 100000,
|
||||
"Statutory lowest age": low,
|
||||
"Statutory highest age": high,
|
||||
}
|
||||
|
||||
|
||||
def test_all_through_school_counts_toward_both_primary_and_secondary():
|
||||
# An all-through school (age 3-18) is labelled "Secondary" by Ofsted phase but
|
||||
# serves primary-age children too, so it must count in BOTH metrics.
|
||||
rows = [_aged_school("Secondary", "2", 3, 18, "AA1 1AA")]
|
||||
assert _classify(rows) == {
|
||||
("AA1 1AA", "good_primary"),
|
||||
("AA1 1AA", "good_secondary"),
|
||||
}
|
||||
|
||||
|
||||
def test_age_ranges_assign_single_phase_for_standard_schools():
|
||||
rows = [
|
||||
_aged_school("Primary", "1", 4, 11, "AA1 1AA"), # primary only
|
||||
_aged_school("Secondary", "2", 11, 16, "AA1 1AB"), # secondary only
|
||||
_aged_school("Secondary", "1", 9, 13, "AA1 1AC"), # middle -> both
|
||||
]
|
||||
assert _classify(rows) == {
|
||||
("AA1 1AA", "outstanding_primary"),
|
||||
("AA1 1AB", "good_secondary"),
|
||||
("AA1 1AC", "outstanding_primary"),
|
||||
("AA1 1AC", "outstanding_secondary"),
|
||||
}
|
||||
|
||||
|
||||
def test_closed_schools_excluded_when_open_register_given():
|
||||
rows = [
|
||||
_aged_school("Primary", "1", 4, 11, "AA1 1AA"),
|
||||
_aged_school("Secondary", "2", 11, 16, "AA1 1AB"),
|
||||
]
|
||||
rows[0]["URN"] = 111
|
||||
rows[1]["URN"] = 222
|
||||
result = classify_good_plus_schools(pl.DataFrame(rows), open_urns={111})
|
||||
pairs = {(r["postcode"], r["category"]) for r in result.to_dicts()}
|
||||
# URN 222 is not in the open register, so it is dropped.
|
||||
assert pairs == {("AA1 1AA", "outstanding_primary")}
|
||||
|
|
@ -544,6 +544,142 @@ def test_transform_grocery_dedup_drops_only_grocery_aspect(tmp_path):
|
|||
assert n2_grocery.height == 1
|
||||
|
||||
|
||||
def test_transform_drops_miscategorised_tags(tmp_path):
    # Audit 2026-06-10: these tags polluted Entertainment (cycle-hire docks,
    # slipways, marinas), Gallery (public artwork), Pharmacy (herbalists,
    # alternative medicine), Hospital & Clinic (untyped healthcare/yes),
    # Tourist Attraction (fountains, courthouses) and Gym & Fitness (outdoor
    # apparatus). None of them may survive the transform.
    dropped = [
        "amenity/bicycle_rental",
        "amenity/boat_rental",
        "leisure/marina",
        "leisure/slipway",
        "tourism/artwork",
        "healthcare/yes",
        "healthcare/alternative",
        "shop/herbalist",
        "shop/health",
        "amenity/fountain",
        "amenity/courthouse",
        "leisure/fitness_station",
    ]
    count = len(dropped)
    ids = [f"n{i}" for i in range(count)]
    names = [f"POI {i}" for i in range(count)]
    raw = pl.DataFrame(
        {
            "id": ids,
            "name": names,
            "category": dropped,
            "lat": [51.50] * count,
            "lng": [-0.10] * count,
        }
    )
    inputs = _write_transform_inputs(tmp_path, raw)

    out = transform(**inputs).collect()

    survivors = out.filter(pl.col("id").is_in(ids))
    assert survivors.height == 0
|
||||
|
||||
|
||||
def test_transform_splits_hospital_and_clinic(tmp_path):
    # Hospitals and clinics get separate output categories; the combined
    # "Hospital & Clinic" label must not appear at all.
    source_tags = ["amenity/hospital", "amenity/clinic", "healthcare/clinic"]
    raw = pl.DataFrame(
        {
            "id": ["n1", "n2", "n3"],
            "name": ["St Thomas'", "Vale Surgery Annexe", "Drop-in Centre"],
            "category": source_tags,
            "lat": [51.50, 51.51, 51.52],
            "lng": [-0.10, -0.11, -0.12],
        }
    )
    inputs = _write_transform_inputs(tmp_path, raw)

    out = transform(**inputs).collect()

    expected_by_id = (("n1", "Hospital"), ("n2", "Clinic"), ("n3", "Clinic"))
    for poi_id, expected in expected_by_id:
        mapped = out.filter(pl.col("id") == poi_id)["category"].to_list()
        assert mapped == [expected]
    assert "Hospital & Clinic" not in out["category"].to_list()
|
||||
|
||||
|
||||
def test_transform_maps_chalet_to_hotel(tmp_path):
    # A holiday-let chalet is accommodation, so it belongs under "Hotel"
    # rather than with the tourist attractions.
    chalet = {
        "id": ["n1"],
        "name": ["Seaview Chalet"],
        "category": ["tourism/chalet"],
        "lat": [51.50],
        "lng": [-0.10],
    }
    raw = pl.DataFrame(chalet)
    inputs = _write_transform_inputs(tmp_path, raw)

    out = transform(**inputs).collect()

    mapped = out.filter(pl.col("id") == "n1")["category"].to_list()
    assert mapped == ["Hotel"]
|
||||
|
||||
|
||||
def test_transform_name_gates_track_horse_riding_fishing(tmp_path):
    # leisure/track, leisure/horse_riding and leisure/fishing are 83-84%
    # unnamed (anonymous tracks/gallops/fishing spots), so only rows with a
    # real name are kept, as a Sports Centre.
    unnamed_ids = ["n1", "n2"]
    named_ids = ["n3", "n4"]
    raw = pl.DataFrame(
        {
            "id": unnamed_ids + named_ids,
            "name": [None, "", "Herne Hill Velodrome", "Royal Mews Riding School"],
            "category": [
                "leisure/track",
                "leisure/fishing",
                "leisure/track",
                "leisure/horse_riding",
            ],
            "lat": [51.50, 51.51, 51.52, 51.53],
            "lng": [-0.10, -0.11, -0.12, -0.13],
        }
    )
    inputs = _write_transform_inputs(tmp_path, raw)

    out = transform(**inputs).collect()

    # Null and empty-string names are both treated as unnamed and dropped.
    assert out.filter(pl.col("id").is_in(unnamed_ids)).height == 0
    named = out.filter(pl.col("id").is_in(named_ids))
    assert named["category"].to_list() == ["Sports Centre", "Sports Centre"]
|
||||
|
||||
|
||||
def test_transform_passes_through_tram_metro_naptan_category(tmp_path):
    # NaPTAN now emits "Tram & Metro stop" (non-LU TMU/MET networks); the
    # category must pass through carrying the Public Transport group and its
    # own emoji.
    osm_rows = pl.DataFrame(
        {
            "id": ["n1"],
            "name": ["A Cafe"],
            "category": ["amenity/cafe"],
            "lat": [51.50],
            "lng": [-0.10],
        }
    )
    inputs = _write_transform_inputs(tmp_path, osm_rows)

    # Overwrite the NaPTAN input with one rail station and one tram stop.
    naptan_rows = pl.DataFrame(
        {
            "id": ["naptan-1", "naptan-2"],
            "name": ["Test Rail Station", "Weaste"],
            "category": ["Rail station", "Tram & Metro stop"],
            "lat": [51.51, 51.52],
            "lng": [-0.13, -0.14],
        }
    )
    naptan_rows.write_parquet(inputs["naptan_path"])

    out = transform(**inputs).collect()

    tram = out.filter(pl.col("category") == "Tram & Metro stop")
    assert tram.height == 1
    assert tram["group"].to_list() == ["Public Transport"]
    assert tram["emoji"].to_list() == ["🚊"]
|
||||
|
||||
|
||||
def test_transform_output_unique_per_id_category(tmp_path):
|
||||
# Soundness: the full transform() output has at most one row per
|
||||
# (id, category) overall, across every source.
|
||||
|
|
|
|||
|
|
@ -86,6 +86,28 @@ DROP_CATEGORIES = {
|
|||
"amenity/water_point",
|
||||
"amenity/watering_place",
|
||||
"amenity/weighbridge",
|
||||
# Boating/cycle-hire infrastructure formerly miscategorised as
|
||||
# "Entertainment" (46% of the bucket): cycle-hire dock stations, boat
|
||||
# ramps and moorings are not entertainment venues.
|
||||
"amenity/bicycle_rental",
|
||||
"amenity/boat_rental",
|
||||
"leisure/marina",
|
||||
"leisure/slipway",
|
||||
# Public art (statues, murals, village signs) formerly 93% of "Gallery".
|
||||
"tourism/artwork",
|
||||
# Outdoor exercise apparatus (pull-up bars, trim trails) formerly inflating
|
||||
# "Gym & Fitness".
|
||||
"leisure/fitness_station",
|
||||
# Untyped healthcare rows and non-pharmacy health shops formerly bucketed
|
||||
# under "Hospital & Clinic" / "Pharmacy".
|
||||
"healthcare/yes",
|
||||
"healthcare/alternative",
|
||||
"shop/herbalist",
|
||||
"shop/health",
|
||||
# Street fountains and courthouses formerly bucketed as
|
||||
# "Tourist Attraction".
|
||||
"amenity/fountain",
|
||||
"amenity/courthouse",
|
||||
# Niche amenities not useful for home buyers
|
||||
"amenity/animal_boarding",
|
||||
"amenity/animal_breeding",
|
||||
|
|
@ -373,10 +395,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"leisure/tanning_salon",
|
||||
"shop/amusements",
|
||||
"tourism/theme_park",
|
||||
"amenity/bicycle_rental",
|
||||
"amenity/boat_rental",
|
||||
"leisure/marina",
|
||||
"leisure/slipway",
|
||||
# bicycle_rental/boat_rental/marina/slipway used to live here and
|
||||
# made up ~46% of the bucket (cycle-hire docks, boat ramps); they
|
||||
# are infrastructure, not entertainment venues — see DROP_CATEGORIES.
|
||||
"leisure/hackerspace",
|
||||
"leisure/yes",
|
||||
],
|
||||
|
|
@ -699,7 +720,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"🏋️",
|
||||
[
|
||||
"leisure/fitness_centre",
|
||||
"leisure/fitness_station",
|
||||
# leisure/fitness_station (outdoor pull-up bars / trim-trail
|
||||
# apparatus, ~2.5k) is not a gym — see DROP_CATEGORIES.
|
||||
"amenity/dojo",
|
||||
"amenity/dancing_school",
|
||||
],
|
||||
|
|
@ -825,28 +847,37 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"amenity/pharmacy",
|
||||
"healthcare/pharmacy",
|
||||
"shop/chemist",
|
||||
"shop/herbalist",
|
||||
"shop/health",
|
||||
"healthcare/alternative",
|
||||
# healthcare/alternative, shop/herbalist and shop/health (homeopaths,
|
||||
# herbalists, generic "health" shops) are not dispensing pharmacies
|
||||
# — see DROP_CATEGORIES.
|
||||
],
|
||||
),
|
||||
# "Hospital & Clinic" used to be one bucket; an actual hospital and a small
|
||||
# clinic are very different amenities for a homebuyer, so they are split.
|
||||
(
|
||||
"Health",
|
||||
"Hospital",
|
||||
"🏥",
|
||||
[
|
||||
"amenity/hospital",
|
||||
"healthcare/hospital",
|
||||
],
|
||||
),
|
||||
(
|
||||
"Health",
|
||||
"Hospital & Clinic",
|
||||
"🏥",
|
||||
"Clinic",
|
||||
"🩺",
|
||||
[
|
||||
"amenity/hospital",
|
||||
"amenity/clinic",
|
||||
"amenity/health_centre",
|
||||
"healthcare/blood_donation",
|
||||
"healthcare/hospital",
|
||||
"healthcare/centre",
|
||||
"healthcare/clinic",
|
||||
"office/healthcare",
|
||||
"healthcare/laboratory",
|
||||
"healthcare/rehabilitation",
|
||||
"healthcare/vaccination_centre",
|
||||
"healthcare/yes",
|
||||
# healthcare/yes (untyped junk rows) is dropped — see DROP_CATEGORIES.
|
||||
],
|
||||
),
|
||||
(
|
||||
|
|
@ -917,7 +948,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"🖼️",
|
||||
[
|
||||
"tourism/gallery",
|
||||
"tourism/artwork",
|
||||
# tourism/artwork (statues, murals, village signs) was 93% of this
|
||||
# bucket and is not a visitable gallery — see DROP_CATEGORIES.
|
||||
],
|
||||
),
|
||||
(
|
||||
|
|
@ -961,9 +993,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
[
|
||||
"tourism/attraction",
|
||||
"tourism/aquarium",
|
||||
"amenity/fountain",
|
||||
"amenity/courthouse",
|
||||
"tourism/chalet",
|
||||
# amenity/fountain (street furniture) and amenity/courthouse are
|
||||
# dropped; tourism/chalet (holiday lets) moved to "Hotel".
|
||||
],
|
||||
),
|
||||
# Note: schools come from the GIAS register (see transform_gias_schools).
|
||||
|
|
@ -982,6 +1013,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"leisure/resort",
|
||||
"tourism/holiday_park",
|
||||
"tourism/self_catering",
|
||||
# Holiday-let chalets are accommodation, not tourist attractions
|
||||
# (where they previously sat).
|
||||
"tourism/chalet",
|
||||
],
|
||||
),
|
||||
(
|
||||
|
|
@ -1162,6 +1196,11 @@ REQUIRE_NAME_CATEGORIES = {
|
|||
"leisure/practice_pitch",
|
||||
"leisure/swimming_pool",
|
||||
"leisure/paddling_pool",
|
||||
# 83-84% unnamed: anonymous running tracks, private gallops/paddocks and
|
||||
# fishing spots; only named public facilities count as a Sports Centre.
|
||||
"leisure/track",
|
||||
"leisure/horse_riding",
|
||||
"leisure/fishing",
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -1181,6 +1220,7 @@ NAPTAN_EMOJIS: dict[str, str] = {
|
|||
"Bus station": "🚌",
|
||||
"Taxi rank": "🚕",
|
||||
"Tube station": "🚇",
|
||||
"Tram & Metro stop": "🚊",
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -1438,9 +1478,9 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
|
|||
(null/"Not judged", e.g. schools last seen under the post-2024 ungraded
|
||||
report-card framework) we fall back to "Ungraded inspection overall outcome"
|
||||
so genuinely good/outstanding schools aren't dropped — mirroring
|
||||
school_proximity.classify_good_plus_schools. Remaining nulls drop out."""
|
||||
school_catchments.classify_good_plus_schools. Remaining nulls drop out."""
|
||||
grade_col = pl.col("Latest OEIF overall effectiveness")
|
||||
# See school_proximity: the ungraded outcome carries "School remains Good"/
|
||||
# See school_catchments: the ungraded outcome carries "School remains Good"/
|
||||
# "School remains Outstanding" (with optional "(Concerns)"/"(Improving)"
|
||||
# suffixes) when the graded column is null/"Not judged".
|
||||
ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue