Rerun data pipelines

This commit is contained in:
Andras Schmelczer 2026-05-10 14:49:53 +01:00
parent 4c95815dc8
commit fc10381692
27 changed files with 2143 additions and 215 deletions

View file

@ -1,12 +1,32 @@
import argparse
import re
from pathlib import Path
import polars as pl
STREET_CRIME_CSV_RE = re.compile(r"^\d{4}-\d{2}-.+-street\.csv$")


def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
    """Locate the street-level crime CSVs beneath ``crime_dir``.

    Every ``*.csv`` found recursively is tested, by file name only, against
    ``STREET_CRIME_CSV_RE``.

    Returns:
        A pair ``(street_csvs, ignored_count)``: the matching paths in sorted
        order, and how many CSV files had a name that did not match.
    """
    street_csvs: list[Path] = []
    ignored_count = 0
    for candidate in sorted(crime_dir.rglob("*.csv")):
        if STREET_CRIME_CSV_RE.fullmatch(candidate.name):
            street_csvs.append(candidate)
        else:
            ignored_count += 1
    return street_csvs, ignored_count
def transform_crime(crime_dir: Path, output_path: Path) -> None:
csvs = sorted(crime_dir.rglob("*.csv"))
print(f"Found {len(csvs)} CSV files across {len(list(crime_dir.iterdir()))} months")
csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
if not csvs:
raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}")
month_count = len({path.parent.name for path in csvs})
print(
f"Found {len(csvs)} street crime CSV files across {month_count} months"
+ (
f" (ignored {ignored_csv_count} non-street CSVs)"
if ignored_csv_count
else ""
)
)
df = pl.scan_csv(
csvs,

View file

@ -1,6 +1,15 @@
import argparse
import polars as pl
import csv
import io
import tempfile
import zipfile
from pathlib import Path
import polars as pl
import pyarrow as pa
import pyarrow.csv as pa_csv
import pyarrow.parquet as pq
from ..utils import fuzzy_join_on_postcode
@ -8,12 +17,168 @@ pl.Config.set_tbl_cols(-1)
# Numeric rank for each energy-rating letter (A = 1 ... G = 7).
RATING_RANK = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
# NOTE(review): presumably a minimum sale price; its use is not visible in this section.
MIN_PRICE = 50_000

# Lower-case names of the EPC certificate columns this module reads. The zip
# path writes exactly these columns (all as strings) to the intermediate
# parquet file, in this order.
EPC_SOURCE_COLUMNS = [
    "address",
    "postcode",
    "current_energy_rating",
    "potential_energy_rating",
    "property_type",
    "built_form",
    "inspection_date",
    "total_floor_area",
    "number_habitable_rooms",
    "floor_height",
    "construction_age_band",
    "tenure",
]
def _normalise_csv_columns(columns: list[str]) -> list[str]:
return [column.strip().lower() for column in columns]
def _clean_string(column: str) -> pl.Expr:
    """Build an expression giving ``column`` as a trimmed string, with "" mapped to null."""
    trimmed = pl.col(column).cast(pl.String).str.strip_chars()
    is_blank = trimmed == ""
    return pl.when(is_blank).then(None).otherwise(trimmed)
def _clean_number(column: str, dtype: pl.DataType) -> pl.Expr:
    """Build an expression that cleans ``column`` as text, then casts it to ``dtype``.

    The cast is non-strict (``strict=False``).
    """
    cleaned_text = _clean_string(column)
    return cleaned_text.cast(dtype, strict=False)
def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
    """Project raw EPC certificate rows onto the cleaned columns used downstream.

    ``raw`` must expose the lower-case EPC source column names. Text columns
    are trimmed with blanks turned into null; postcode and both energy
    ratings are upper-cased; the numeric columns are cast non-strictly. Rows
    without an address are dropped, and a habitable-room count of 0 is
    replaced by null.
    """
    postcode = _clean_string("postcode").str.to_uppercase()
    current_rating = _clean_string("current_energy_rating").str.to_uppercase()
    potential_rating = _clean_string("potential_energy_rating").str.to_uppercase()

    projected = raw.select(
        _clean_string("address").alias("epc_address"),
        postcode.alias("epc_postcode"),
        current_rating.alias("current_energy_rating"),
        potential_rating.alias("potential_energy_rating"),
        _clean_string("property_type").alias("epc_property_type"),
        _clean_string("built_form").alias("built_form"),
        _clean_string("inspection_date").alias("inspection_date"),
        _clean_number("total_floor_area", pl.Float64).alias("total_floor_area"),
        _clean_number("number_habitable_rooms", pl.Int16).alias(
            "number_habitable_rooms"
        ),
        _clean_number("floor_height", pl.Float64).alias("floor_height"),
        _clean_string("construction_age_band").alias("construction_age_band"),
        _clean_string("tenure").alias("tenure"),
    )
    with_address = projected.filter(pl.col("epc_address").is_not_null())

    rooms = pl.col("number_habitable_rooms")
    rooms_or_null = pl.when(rooms == 0).then(None).otherwise(rooms)
    return with_address.with_columns(rooms_or_null.alias("number_habitable_rooms"))
def _certificate_member_names(zip_file: zipfile.ZipFile) -> list[str]:
return sorted(
name
for name in zip_file.namelist()
if not name.endswith("/")
and Path(name).name.lower().startswith("certificates")
and name.lower().endswith(".csv")
)
def _read_zip_csv_header(zip_file: zipfile.ZipFile, member_name: str) -> list[str]:
with zip_file.open(member_name) as member:
text = io.TextIOWrapper(member, encoding="utf-8-sig", newline="")
try:
return next(csv.reader(text))
except StopIteration as exc:
raise ValueError(f"EPC CSV member is empty: {member_name}") from exc
def _source_columns_for_header(header: list[str]) -> list[str]:
    """Map each EPC source column to its spelling in ``header``.

    Header names are matched after normalisation (trimmed, lower-cased). The
    result follows the order of ``EPC_SOURCE_COLUMNS``; a column absent from
    the header is returned under its normalised name.
    """
    spelling_by_normalised: dict[str, str] = {}
    for original, normalised in zip(header, _normalise_csv_columns(header)):
        spelling_by_normalised[normalised] = original
    return [spelling_by_normalised.get(wanted, wanted) for wanted in EPC_SOURCE_COLUMNS]
def _zip_certificates_to_parquet(zip_path: Path, output_path: Path) -> None:
    """Stream every certificate CSV inside ``zip_path`` into one parquet file.

    Each certificate member is read in batches with all EPC source columns
    typed as strings (columns missing from a member are still included), the
    batch columns are renamed to ``EPC_SOURCE_COLUMNS`` and appended to
    ``output_path``.

    Raises:
        ValueError: If ``zip_path`` is not a readable zip archive, or it
            contains no certificate CSV members.
    """
    string_schema = pa.schema((name, pa.string()) for name in EPC_SOURCE_COLUMNS)
    # The writer's context manager closes the file on success and on error.
    with pq.ParquetWriter(
        output_path, schema=string_schema, compression="zstd"
    ) as writer:
        try:
            archive = zipfile.ZipFile(zip_path)
        except zipfile.BadZipFile as exc:
            raise ValueError(
                f"{zip_path} is not a readable EPC zip archive; re-download "
                "domestic-csv.zip and try again"
            ) from exc

        with archive:
            certificate_members = _certificate_member_names(archive)
            if not certificate_members:
                raise ValueError(f"No certificate CSV files found in {zip_path}")

            for member_name in certificate_members:
                print(f"Reading EPC certificates from {member_name}")
                header = _read_zip_csv_header(archive, member_name)
                source_columns = _source_columns_for_header(header)
                convert_options = pa_csv.ConvertOptions(
                    include_columns=source_columns,
                    include_missing_columns=True,
                    column_types=dict.fromkeys(source_columns, pa.string()),
                    strings_can_be_null=True,
                )
                read_options = pa_csv.ReadOptions(block_size=64 * 1024 * 1024)

                with archive.open(member_name) as member:
                    reader = pa_csv.open_csv(
                        member,
                        read_options=read_options,
                        convert_options=convert_options,
                    )
                    while True:
                        try:
                            batch = reader.read_next_batch()
                        except StopIteration:
                            break
                        if batch.num_rows > 0:
                            writer.write_batch(
                                batch.rename_columns(EPC_SOURCE_COLUMNS)
                            )
def _scan_epc_certificates(epc_path: Path, temp_dir: Path) -> pl.LazyFrame:
    """Open EPC certificates lazily from either a CSV file or a zip archive.

    A ``.zip`` input (suffix compared case-insensitively) is first converted
    to ``epc-certificates.parquet`` inside ``temp_dir``; anything else is
    scanned as a CSV with every column read as a string and header names
    normalised. Either way the cleaned column projection is returned.
    """
    is_zip = epc_path.suffix.lower() == ".zip"
    if not is_zip:
        csv_frame = pl.scan_csv(
            epc_path,
            infer_schema=False,
            with_column_names=_normalise_csv_columns,
        )
        return _select_epc_columns(csv_frame)

    parquet_path = temp_dir / "epc-certificates.parquet"
    _zip_certificates_to_parquet(epc_path, parquet_path)
    return _select_epc_columns(pl.scan_parquet(parquet_path))
def main():
parser = argparse.ArgumentParser(description="Fuzzy join EPC and Price Paid data")
parser.add_argument(
"--epc", type=Path, required=True, help="EPC certificates CSV file"
"--epc", type=Path, required=True, help="EPC certificates CSV file or zip"
)
parser.add_argument(
"--price-paid", type=Path, required=True, help="Price paid parquet file"
@ -23,74 +188,56 @@ def main():
)
args = parser.parse_args()
epc_base = (
pl.scan_csv(args.epc)
.select(
pl.col("ADDRESS").alias("epc_address"),
"POSTCODE",
"CURRENT_ENERGY_RATING",
"POTENTIAL_ENERGY_RATING",
pl.col("PROPERTY_TYPE").alias("epc_property_type"),
"BUILT_FORM",
"INSPECTION_DATE",
"TOTAL_FLOOR_AREA",
"NUMBER_HABITABLE_ROOMS",
"FLOOR_HEIGHT",
"CONSTRUCTION_AGE_BAND",
"TENURE",
)
.filter(pl.col("epc_address").is_not_null())
.with_columns(
pl.when(pl.col("NUMBER_HABITABLE_ROOMS") == 0)
.then(None)
.otherwise(pl.col("NUMBER_HABITABLE_ROOMS"))
.alias("NUMBER_HABITABLE_ROOMS"),
)
)
with tempfile.TemporaryDirectory(prefix="epc_certificates_") as tmpdir:
_run(args.epc, args.price_paid, args.output, Path(tmpdir))
def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Path):
epc_base = _scan_epc_certificates(epc_path, temp_dir)
# Dedup fork: keep latest certificate per property (existing logic)
epc = (
epc_base.sort("INSPECTION_DATE", descending=True)
.group_by("epc_address", "POSTCODE")
epc_base.sort("inspection_date", descending=True)
.group_by("epc_address", "epc_postcode")
.first()
.drop("TENURE")
.drop("tenure")
)
# Events fork: detect renovation events between consecutive certificates
# Collect eagerly because .over() window functions don't work in streaming
# engine (fuzzy_join.py:50 uses sink_parquet which requires streaming).
events = (
epc_base.sort("INSPECTION_DATE")
epc_base.sort("inspection_date")
.with_columns(
pl.col("CURRENT_ENERGY_RATING")
pl.col("current_energy_rating")
.replace_strict(RATING_RANK, default=None, return_dtype=pl.Int32)
.alias("_rating_rank"),
)
.with_columns(
pl.col("NUMBER_HABITABLE_ROOMS")
pl.col("number_habitable_rooms")
.shift(1)
.over("epc_address", "POSTCODE")
.over("epc_address", "epc_postcode")
.alias("_prev_rooms"),
pl.col("TOTAL_FLOOR_AREA")
pl.col("total_floor_area")
.shift(1)
.over("epc_address", "POSTCODE")
.over("epc_address", "epc_postcode")
.alias("_prev_area"),
pl.col("_rating_rank")
.shift(1)
.over("epc_address", "POSTCODE")
.over("epc_address", "epc_postcode")
.alias("_prev_rating_rank"),
)
.with_columns(
pl.when(
pl.col("NUMBER_HABITABLE_ROOMS").is_not_null()
pl.col("number_habitable_rooms").is_not_null()
& pl.col("_prev_rooms").is_not_null()
& (pl.col("NUMBER_HABITABLE_ROOMS") != pl.col("_prev_rooms"))
& (pl.col("number_habitable_rooms") != pl.col("_prev_rooms"))
)
.then(pl.lit("Remodelling"))
.when(
pl.col("TOTAL_FLOOR_AREA").is_not_null()
pl.col("total_floor_area").is_not_null()
& pl.col("_prev_area").is_not_null()
& (pl.col("TOTAL_FLOOR_AREA") > pl.col("_prev_area"))
& (pl.col("total_floor_area") > pl.col("_prev_area"))
)
.then(pl.lit("Extension"))
.when(
@ -104,13 +251,13 @@ def main():
)
.filter(pl.col("_event").is_not_null())
.with_columns(
pl.col("INSPECTION_DATE")
pl.col("inspection_date")
.cast(pl.String)
.str.slice(0, 4)
.cast(pl.Int32)
.alias("_event_year"),
)
.group_by("epc_address", "POSTCODE")
.group_by("epc_address", "epc_postcode")
.agg(
pl.struct(
pl.col("_event_year").alias("year"),
@ -128,8 +275,8 @@ def main():
# Social tenure fork: flag properties that were ever social housing
social_tenure = (
epc_base.filter(pl.col("TENURE").str.to_lowercase().str.contains("social"))
.select("epc_address", "POSTCODE")
epc_base.filter(pl.col("tenure").str.to_lowercase().str.contains("social"))
.select("epc_address", "epc_postcode")
.unique()
.with_columns(pl.lit("Yes").alias("was_council_house"))
.collect()
@ -140,12 +287,12 @@ def main():
epc = (
epc.join(
events.lazy(),
on=["epc_address", "POSTCODE"],
on=["epc_address", "epc_postcode"],
how="left",
)
.join(
social_tenure.lazy(),
on=["epc_address", "POSTCODE"],
on=["epc_address", "epc_postcode"],
how="left",
)
.with_columns(
@ -167,7 +314,7 @@ def main():
duration_map = {"F": "Freehold", "L": "Leasehold"}
price_paid = (
pl.scan_parquet(args.price_paid)
pl.scan_parquet(price_paid_path)
.select(
"price",
"date_of_transfer",
@ -219,9 +366,9 @@ def main():
left_address_col="pp_address",
right_address_col="epc_address",
left_postcode_col="postcode",
right_postcode_col="POSTCODE",
right_postcode_col="epc_postcode",
)
.drop("POSTCODE")
.drop("epc_postcode")
.collect(engine="streaming")
)
@ -236,7 +383,7 @@ def main():
# For new-builds (old_new == "Y"), use the first transaction date year as
# the exact construction date; otherwise fall back to the EPC age band.
epc_band_year = (
pl.col("CONSTRUCTION_AGE_BAND")
pl.col("construction_age_band")
.str.replace("England and Wales: ", "")
.str.replace(" onwards", "")
.str.extract(r"(\d{4})", 1)
@ -251,7 +398,7 @@ def main():
pl.when(is_new_build & transfer_year.is_not_null())
.then(transfer_year)
.otherwise(epc_band_year)
.alias("CONSTRUCTION_AGE_BAND"),
.alias("construction_age_band"),
pl.when(is_new_build & transfer_year.is_not_null())
.then(pl.lit(0, dtype=pl.UInt8))
.when(epc_band_year.is_not_null())
@ -263,8 +410,8 @@ def main():
joined = joined.rename({col: col.lower() for col in joined.columns})
print(joined.head())
joined.write_parquet(args.output)
print(f"Wrote {args.output}")
joined.write_parquet(output_path)
print(f"Wrote {output_path}")
if __name__ == "__main__":

View file

@ -7,6 +7,15 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
MIN_FLOOR_AREA_M2 = 10
# IoD score columns that are replaced by a 0-100 "less deprived" percentile
# (via _less_deprived_percentile_expr) when the IoD table is loaded.
_IOD_PERCENTILE_COLUMNS = [
    "Education, Skills and Training Score",
    "Income Score (rate)",
    "Employment Score (rate)",
    "Health Deprivation and Disability Score",
    "Indoors Sub-domain Score",
    "Outdoors Sub-domain Score",
]
_AREA_COLUMNS = [
"Postcode",
@ -76,6 +85,24 @@ _AREA_COLUMNS = [
]
def _less_deprived_percentile_expr(column: str) -> pl.Expr:
    """Convert an IoD deprivation score to a 0-100 less-deprived percentile.

    Nulls stay null. The lowest score maps to exactly 100.0 and the highest to
    exactly 0.0; values in between are placed by their descending average
    rank, scaled to 0-100 and rounded to one decimal place. The result keeps
    the name ``column``.
    """
    score = pl.col(column)
    populated = score.count()
    rank_from_top = score.rank("average", descending=True)
    scaled = ((rank_from_top - 1) / (populated - 1) * 100).round(1)

    percentile = (
        pl.when(score.is_null())
        .then(None)
        .when(score == score.min())
        .then(100.0)
        .when(score == score.max())
        .then(0.0)
        .when(populated > 1)
        .then(scaled)
        .otherwise(100.0)
    )
    return percentile.alias(column)
def _build(
epc_pp_path: Path,
arcgis_path: Path,
@ -134,20 +161,11 @@ def _build(
)
wide = wide.join(arcgis, on="postcode", how="left")
iod = pl.scan_parquet(iod_path)
iod = pl.scan_parquet(iod_path).with_columns(
*(_less_deprived_percentile_expr(c) for c in _IOD_PERCENTILE_COLUMNS)
)
wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
# Invert deprivation scores so that higher values = less deprived (better)
iod_score_cols = [
"Education, Skills and Training Score",
"Income Score (rate)",
"Employment Score (rate)",
"Health Deprivation and Disability Score",
"Indoors Sub-domain Score",
"Outdoors Sub-domain Score",
]
wide = wide.with_columns(*(pl.col(c).max() - pl.col(c) for c in iod_score_cols))
ethnicity = pl.scan_parquet(ethnicity_path)
wide = wide.join(
ethnicity,

View file

@ -1,6 +1,8 @@
"""Compute POI proximity counts and distances per postcode from ArcGIS + filtered POIs."""
import argparse
import re
import unicodedata
from pathlib import Path
import polars as pl
@ -15,9 +17,25 @@ POI_GROUPS_2KM = {
"groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
}
# Groups for which to compute distance to nearest POI (from filtered POIs).
# Keep `train_tube` for the existing backend feature; the individual POI
# distance filters below power the frontend dropdown.
DISTANCE_GROUPS = {
    # Combined group covering both station categories.
    "train_tube": ["Tube station", "Rail station"],
    # Any grocery outlet: the generic store categories plus two brand categories.
    "grocery_store": [
        "Greengrocer",
        "Supermarket",
        "Convenience Store",
        "Waitrose",
        "Tesco",
    ],
    # Single-category groups.
    "tube_station": ["Tube station"],
    "rail_station": ["Rail station"],
    "waitrose": ["Waitrose"],
    "tesco": ["Tesco"],
    "cafe": ["Café"],
    "pub": ["Pub"],
    "restaurant": ["Restaurant"],
}
# OS Open Greenspace function types used for park counts and distance calculation.
@ -27,6 +45,69 @@ GREENSPACE_PARK_FUNCTIONS = {
"parks": ["Public Park Or Garden", "Playing Field", "Play Space"],
}
# A category in a count-threshold group needs strictly more than this many
# POIs to get its own dynamic filter.
GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
# POI groups whose categories always get a dynamic filter.
DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure"}
# POI groups whose categories only qualify above GROCERY_DYNAMIC_FILTER_MIN_POIS.
DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS = {"Groceries"}
def _poi_category_slug(category: str) -> str:
ascii_text = (
unicodedata.normalize("NFKD", category)
.encode("ascii", "ignore")
.decode("ascii")
.lower()
)
slug = re.sub(r"[^a-z0-9]+", "_", ascii_text).strip("_")
return slug or "poi"
def _build_poi_category_groups(
    pois: pl.DataFrame,
) -> tuple[dict[str, list[str]], dict[str, str]]:
    """Build one proximity group for each POI category selected for filters.

    A category is selected when its POI group is in
    ``DYNAMIC_FILTER_ALL_GROUPS``, or when its POI group is in
    ``DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS`` and that (group, category) pair
    has more than ``GROCERY_DYNAMIC_FILTER_MIN_POIS`` rows.

    Args:
        pois: POI rows with at least ``group`` and ``category`` columns.

    Returns:
        ``(groups, display_names)``. ``groups`` maps a unique ``poi_<slug>``
        key to a one-element list holding the category; ``display_names``
        maps the same key to the category name.

    Raises:
        ValueError: If ``pois`` has no ``group`` column.
    """
    if "group" not in pois.columns:
        raise ValueError("POI dataframe must include a 'group' column")

    always_included = pl.col("group").is_in(list(DYNAMIC_FILTER_ALL_GROUPS))
    included_when_common = pl.col("group").is_in(
        list(DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS)
    ) & (pl.col("len") > GROCERY_DYNAMIC_FILTER_MIN_POIS)

    categories = (
        pois.group_by("group", "category")
        .len()
        .filter(always_included | included_when_common)
        .select("category")
        # The same category name can qualify under more than one group; keep
        # it once so two keys never share a display name.
        .unique()
        .sort("category")
        .to_series()
        .to_list()
    )

    groups: dict[str, list[str]] = {}
    display_names: dict[str, str] = {}
    for category in categories:
        if not isinstance(category, str) or not category:
            continue
        base_slug = f"poi_{_poi_category_slug(category)}"
        # Different categories can share a slug (e.g. with and without an
        # accent). Probe for the first free key so a numbered suffix can never
        # overwrite a key that was already assigned.
        group_key = base_slug
        suffix = 2
        while group_key in groups:
            group_key = f"{base_slug}_{suffix}"
            suffix += 1
        groups[group_key] = [category]
        display_names[group_key] = category

    return groups, display_names
def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
renames: dict[str, str] = {}
for group_key, category in display_names.items():
renames[f"{group_key}_nearest_km"] = f"Distance to nearest {category} POI (km)"
renames[f"{group_key}_2km"] = f"Number of {category} POIs within 2km"
renames[f"{group_key}_5km"] = f"Number of {category} POIs within 5km"
return renames
def main():
parser = argparse.ArgumentParser(
@ -56,12 +137,35 @@ def main():
)
pois = pl.read_parquet(args.pois)
poi_category_groups, poi_display_names = _build_poi_category_groups(pois)
# Count amenity POIs within 2km
counts_2km = count_pois_per_postcode(
postcodes, pois, groups=POI_GROUPS_2KM, radius_km=2
)
# Dynamic POI filters: nearest distance plus counts within 2km and 5km for
# the selected public transport, grocery, and leisure categories.
dynamic_counts_2km = count_pois_per_postcode(
postcodes, pois, groups=poi_category_groups, radius_km=2
)
dynamic_counts_5km = count_pois_per_postcode(
postcodes, pois, groups=poi_category_groups, radius_km=5
)
dynamic_distances = min_distance_per_postcode(
postcodes, pois, groups=poi_category_groups
)
dynamic_renames = _dynamic_poi_metric_renames(poi_display_names)
dynamic_counts_2km = dynamic_counts_2km.rename(
{k: v for k, v in dynamic_renames.items() if k in dynamic_counts_2km.columns}
)
dynamic_counts_5km = dynamic_counts_5km.rename(
{k: v for k, v in dynamic_renames.items() if k in dynamic_counts_5km.columns}
)
dynamic_distances = dynamic_distances.rename(
{k: v for k, v in dynamic_renames.items() if k in dynamic_distances.columns}
)
# Distance to nearest train/tube station (from filtered POIs)
distances = min_distance_per_postcode(postcodes, pois, groups=DISTANCE_GROUPS)
@ -77,6 +181,9 @@ def main():
# Join all results on postcode
result = (
counts_2km.join(distances, on="postcode")
.join(dynamic_counts_2km, on="postcode")
.join(dynamic_counts_5km, on="postcode")
.join(dynamic_distances, on="postcode")
.join(park_counts_1km, on="postcode")
.join(park_distances, on="postcode")
)

View file

@ -0,0 +1,47 @@
import polars as pl
from pipeline.transform.crime import find_street_crime_csvs, transform_crime
def test_find_street_crime_csvs_ignores_archive_sidecars(tmp_path):
    """Only the ``*-street.csv`` file is returned; the other CSVs are counted as ignored."""
    root = tmp_path / "crime"
    january = root / "2024-01"
    january.mkdir(parents=True)

    expected = january / "2024-01-test-force-street.csv"
    sidecars = (
        january / "2024-01-test-force-outcomes.csv",
        january / "2024-01-test-force-stop-and-search.csv",
        root / "notes.csv",
    )
    for path in (expected, *sidecars):
        path.touch()

    found, ignored_count = find_street_crime_csvs(root)

    assert found == [expected]
    assert ignored_count == 3
def test_transform_crime_reads_only_street_crime_csvs(tmp_path):
    """``transform_crime`` must read the street CSV and skip the outcomes sidecar.

    The fixture has two burglaries in one LSOA plus a robbery with no LSOA
    code; the expected output is a single row for that LSOA.
    """
    crime_dir = tmp_path / "crime"
    month_dir = crime_dir / "2024-01"
    month_dir.mkdir(parents=True)
    (month_dir / "2024-01-test-force-street.csv").write_text(
        "\n".join(
            [
                "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context",
                "1,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
                "2,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
                "3,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,,No LSOA,Robbery,Under investigation,",
            ]
        )
        + "\n"
    )
    # Sidecar with a different schema, stored in the same month directory.
    (month_dir / "2024-01-test-force-outcomes.csv").write_text(
        "Crime ID,Month,Reported by,Outcome type\n1,2024-01,Test Force,Charged\n"
    )
    output = tmp_path / "crime.parquet"

    transform_crime(crime_dir, output)

    result = pl.read_parquet(output).to_dicts()
    assert result == [{"LSOA code": "E01000001", "Burglary (avg/yr)": 2.0}]

View file

@ -0,0 +1,174 @@
import csv
import io
import zipfile
from datetime import date
from pathlib import Path
import polars as pl
from pipeline.transform.join_epc_pp import (
EPC_SOURCE_COLUMNS,
_run,
_scan_epc_certificates,
)
def _write_csv(path: Path, fieldnames: list[str], rows: list[dict[str, str]]) -> None:
with path.open("w", newline="") as file:
writer = csv.DictWriter(file, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(rows)
def _row(**overrides: str) -> dict[str, str]:
row = {
"address": "1 Example Street",
"postcode": " aa1 1aa ",
"current_energy_rating": "c",
"potential_energy_rating": "b",
"property_type": "House",
"built_form": "Mid-Terrace",
"inspection_date": "2024-01-02",
"total_floor_area": "84.5",
"number_habitable_rooms": "5",
"floor_height": "2.4",
"construction_age_band": "England and Wales: 1950-1966",
"tenure": "owner-occupied",
}
row.update(overrides)
return row
def test_scan_epc_certificates_supports_legacy_uppercase_csv(tmp_path: Path):
    """A plain CSV with upper-case headers is read and cleaned like any other input."""
    csv_path = tmp_path / "certificates.csv"
    fieldnames = [column.upper() for column in EPC_SOURCE_COLUMNS]
    row = {column.upper(): value for column, value in _row().items()}
    # A zero room count is expected to come back as null.
    row["NUMBER_HABITABLE_ROOMS"] = "0"
    _write_csv(csv_path, fieldnames, [row])

    df = _scan_epc_certificates(csv_path, tmp_path).collect()

    # Postcode and ratings are trimmed and upper-cased; numeric columns are parsed.
    assert df.to_dicts() == [
        {
            "epc_address": "1 Example Street",
            "epc_postcode": "AA1 1AA",
            "current_energy_rating": "C",
            "potential_energy_rating": "B",
            "epc_property_type": "House",
            "built_form": "Mid-Terrace",
            "inspection_date": "2024-01-02",
            "total_floor_area": 84.5,
            "number_habitable_rooms": None,
            "floor_height": 2.4,
            "construction_age_band": "England and Wales: 1950-1966",
            "tenure": "owner-occupied",
        }
    ]
def test_scan_epc_certificates_supports_domestic_zip(tmp_path: Path):
    """Certificate CSVs inside a zip (top-level and nested) are all read; other members are not."""
    zip_path = tmp_path / "domestic-csv.zip"
    rows_2023 = [_row(address="2 Example Street", inspection_date="2023-03-04")]
    rows_2024 = [
        _row(
            address="3 Example Street",
            postcode="BB2 2BB",
            inspection_date="2024-05-06",
            total_floor_area="",
            tenure="Rented (social)",
        )
    ]
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        for member_name, rows in [
            ("certificates-2023.csv", rows_2023),
            ("nested/certificates-2024.csv", rows_2024),
        ]:
            csv_text = [",".join(EPC_SOURCE_COLUMNS)]
            csv_text.extend(
                ",".join(row[column] for column in EPC_SOURCE_COLUMNS) for row in rows
            )
            archive.writestr(member_name, "\n".join(csv_text) + "\n")
        # Not a certificates file, so its row must not show up in the result.
        archive.writestr("recommendations-2024.csv", "address,postcode\nignored,X\n")

    df = _scan_epc_certificates(zip_path, tmp_path).sort("inspection_date").collect()

    # The blank floor area in the 2024 row is expected to be null.
    assert df.select("epc_address", "epc_postcode", "total_floor_area").to_dicts() == [
        {
            "epc_address": "2 Example Street",
            "epc_postcode": "AA1 1AA",
            "total_floor_area": 84.5,
        },
        {
            "epc_address": "3 Example Street",
            "epc_postcode": "BB2 2BB",
            "total_floor_area": None,
        },
    ]
    assert df.get_column("tenure").to_list() == ["owner-occupied", "Rented (social)"]
    assert df.schema["number_habitable_rooms"] == pl.Int16
def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
    """End-to-end check of ``_run`` with a zipped EPC input and one price-paid sale.

    Two certificates exist for the same address: an older socially rented one
    and a newer owner-occupied one with a larger floor area.
    """
    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        csv_buffer = io.StringIO()
        writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
        writer.writeheader()
        writer.writerows(
            [
                _row(
                    current_energy_rating="d",
                    inspection_date="2023-01-01",
                    total_floor_area="80",
                    tenure="Rented (social)",
                ),
                _row(
                    current_energy_rating="c",
                    inspection_date="2024-01-01",
                    total_floor_area="85",
                    tenure="owner-occupied",
                ),
            ]
        )
        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())

    price_paid_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(
        {
            "price": [250_000],
            "date_of_transfer": [date(2024, 2, 3)],
            "property_type": ["T"],
            "postcode": ["AA1 1AA"],
            "paon": ["1"],
            "saon": [None],
            "street": ["Example Street"],
            "locality": [None],
            "town_city": ["Exampletown"],
            "duration": ["F"],
            "old_new": ["N"],
        }
    ).write_parquet(price_paid_path)
    output_path = tmp_path / "epc-pp.parquet"

    _run(zip_path, price_paid_path, output_path, tmp_path)

    df = pl.read_parquet(output_path)
    assert df.height == 1
    # The latest certificate supplies rating and area; the earlier social
    # tenure still sets the council-house flag.
    assert df.select(
        "epc_address",
        "current_energy_rating",
        "total_floor_area",
        "construction_age_band",
        "was_council_house",
    ).to_dicts() == [
        {
            "epc_address": "1 Example Street",
            "current_energy_rating": "C",
            "total_floor_area": 85.0,
            "construction_age_band": 1950,
            "was_council_house": "Yes",
        }
    ]
    # Exactly one renovation event is expected between the two certificates.
    assert df.get_column("renovation_history").list.len().to_list() == [1]

View file

@ -0,0 +1,33 @@
import polars as pl
from pipeline.transform.merge import (
_is_dynamic_poi_metric_column,
_less_deprived_percentile_expr,
)
def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
    """The lowest score maps to 100, the highest to 0, and null stays null."""
    column = "Income Score (rate)"
    frame = pl.DataFrame({column: [1.0, 2.0, 3.0, None]})

    percentiles = (
        frame.lazy().with_columns(_less_deprived_percentile_expr(column)).collect()
    )

    assert percentiles[column].to_list() == [100.0, 50.0, 0.0, None]
def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
    """Tied minimum and maximum scores land exactly on 100 and 0."""
    column = "Income Score (rate)"
    frame = pl.DataFrame({column: [1.0, 1.0, 2.0, 3.0, 3.0]})

    percentiles = (
        frame.lazy().with_columns(_less_deprived_percentile_expr(column)).collect()
    )

    assert percentiles[column].to_list() == [100.0, 100.0, 50.0, 0.0, 0.0]
def test_dynamic_poi_metric_columns_are_area_level() -> None:
    """Dynamic POI metric names are recognised; a fixed-group count name is not."""
    dynamic_columns = (
        "Distance to nearest Cafe POI (km)",
        "Number of Cafe POIs within 2km",
        "Number of Cafe POIs within 5km",
    )
    for name in dynamic_columns:
        assert _is_dynamic_poi_metric_column(name)

    assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")

View file

@ -0,0 +1,41 @@
import polars as pl
from pipeline.transform.poi_proximity import _build_poi_category_groups
def test_dynamic_poi_groups_include_requested_categories_only() -> None:
    """Only transport, leisure and sufficiently common grocery categories get groups."""
    pois = pl.DataFrame(
        {
            "group": (
                ["Public Transport"] * 2
                + ["Leisure"] * 2
                + ["Groceries"] * 101
                + ["Groceries"] * 100
                + ["Education"] * 200
                + ["Health"] * 200
            ),
            "category": (
                ["Rail station", "Bus stop"]
                + ["Café", "Restaurant"]
                # 101 rows is just above the grocery threshold, 100 is not.
                + ["Tesco"] * 101
                + ["Waitrose"] * 100
                + ["School"] * 200
                + ["Pharmacy"] * 200
            ),
            "lat": [51.5] * 605,
            "lng": [-0.1] * 605,
        }
    )

    groups, display_names = _build_poi_category_groups(pois)

    assert set(display_names.values()) == {
        "Bus stop",
        "Café",
        "Rail station",
        "Restaurant",
        "Tesco",
    }
    assert "poi_waitrose" not in groups
    assert "poi_school" not in groups
    assert "poi_pharmacy" not in groups

View file

@ -79,6 +79,33 @@ def test_transform_grocery_retail_points_keeps_fascia_icon_category():
]
def test_transform_grocery_retail_points_accepts_base_fascias():
    """Rows whose fascia is the retailer's own name, or a listed variant, are transformed."""
    raw = pl.DataFrame(
        {
            "id": [101, 102, 103, 104],
            "retailer": ["Aldi", "Asda", "Booths", "Whole Foods Market"],
            "fascia": ["Aldi", "Asda Superstore", "Booths", "Whole Foods Market"],
            "store_name": [
                "Aldi Test",
                "Asda Test Superstore",
                "Booths Test",
                "Whole Foods Test",
            ],
            "long_wgs": [-0.141, -0.142, -0.143, -0.144],
            "lat_wgs": [51.515, 51.516, 51.517, 51.518],
        }
    )

    pois = transform_grocery_retail_points(raw)

    # "category" follows the retailer, "icon_category" follows the fascia.
    assert pois.select("category", "icon_category").to_dicts() == [
        {"category": "Aldi", "icon_category": "Aldi"},
        {"category": "Asda", "icon_category": "Asda Superstore"},
        {"category": "Booths", "icon_category": "Booths"},
        {"category": "Whole Foods Market", "icon_category": "Whole Foods Market"},
    ]
def test_transform_grocery_retail_points_drops_invalid_rows():
raw = pl.DataFrame(
{

View file

@ -1078,19 +1078,40 @@ COOP_RETAILERS = {
}
# Display name for each raw grocery retailer name; this is the lookup table
# used by normalize_grocery_retailer. Every entry of COOP_RETAILERS is shown
# as "Co-op".
GROCERY_RETAILER_DISPLAY_NAMES: dict[str, str] = {
    "Aldi": "Aldi",
    "Asda": "Asda",
    "Booths": "Booths",
    "Budgens": "Budgens",
    "Centra": "Centra",
    "Cook": "COOK",
    "Costco": "Costco",
    "Dunnes Stores": "Dunnes Stores",
    "Farmfoods": "Farmfoods",
    "Heron": "Heron Foods",
    "Iceland": "Iceland",
    "Lidl": "Lidl",
    "Makro": "Makro",
    "Marks and Spencer": "M&S",
    "Morrisons": "Morrisons",
    "Planet Organic": "Planet Organic",
    "Sainsburys": "Sainsbury's",
    "Spar": "Spar",
    "Tesco": "Tesco",
    "Waitrose": "Waitrose",
    "Whole Foods Market": "Whole Foods Market",
    **{retailer: "Co-op" for retailer in COOP_RETAILERS},
}
GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
**GROCERY_RETAILER_DISPLAY_NAMES,
"Aldi Local": "Aldi",
"Asda Express": "Asda Express",
"Asda Living": "Asda Living",
"Asda PFS": "Asda PFS",
"Asda Supercentre": "Asda Supercentre",
"Asda Supermarket": "Asda Supermarket",
"Asda Superstore": "Asda Superstore",
"Cooltrader": "Heron Foods",
"Co-op Food": "Co-op",
"Cook": "COOK",
@ -1112,6 +1133,7 @@ GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
"Marks and Spencer Travel SF": "M&S Food",
"Morrisons Daily": "Morrisons Daily",
"Morrisons Select": "Morrisons",
"Sainsbury's Local": "Sainsbury's Local",
"Sainsburys": "Sainsbury's",
"Sainsburys Local": "Sainsbury's Local",
"Spar PFS": "Spar",
@ -1128,12 +1150,18 @@ GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
def normalize_grocery_retailer(retailer: str | None) -> str:
    """Return the display name for a grocery retailer.

    Args:
        retailer: Raw retailer name, or ``None``.

    Returns:
        ``""`` when ``retailer`` is ``None``; otherwise its entry in
        ``GROCERY_RETAILER_DISPLAY_NAMES``.

    Raises:
        ValueError: If ``retailer`` has no entry in
            ``GROCERY_RETAILER_DISPLAY_NAMES``.
    """
    if retailer is None:
        return ""
    # Unknown retailers fail loudly rather than passing through unchanged.
    display_name = GROCERY_RETAILER_DISPLAY_NAMES.get(retailer)
    if display_name is None:
        raise ValueError(f"Missing grocery retailer display name for {retailer!r}")
    return display_name
def normalize_grocery_icon_category(fascia: str | None, retailer: str | None) -> str:
    """Return the icon category for a grocery store.

    Args:
        fascia: Raw fascia name; when empty or ``None`` the retailer is used.
        retailer: Raw retailer name, or ``None``.

    Returns:
        The fascia's entry in ``GROCERY_FASCIA_ICON_NAMES`` when a fascia is
        given, otherwise ``normalize_grocery_retailer(retailer)``.

    Raises:
        ValueError: If a fascia is given but has no entry in
            ``GROCERY_FASCIA_ICON_NAMES``.
    """
    if fascia:
        # Unknown fascias fail loudly rather than falling back to another name.
        icon_name = GROCERY_FASCIA_ICON_NAMES.get(fascia)
        if icon_name is None:
            raise ValueError(f"Missing grocery fascia icon name for {fascia!r}")
        return icon_name
    return normalize_grocery_retailer(retailer)