Rerun data pipelines
This commit is contained in:
parent
4c95815dc8
commit
fc10381692
27 changed files with 2143 additions and 215 deletions
|
|
@ -1,12 +1,32 @@
|
|||
import argparse
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
|
||||
# File names of the form "YYYY-MM-<anything>-street.csv".
STREET_CRIME_CSV_RE = re.compile(r"^\d{4}-\d{2}-.+-street\.csv$")


def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
    """Split the CSVs under ``crime_dir`` into street-crime files and the rest.

    Args:
        crime_dir: Directory searched recursively for ``*.csv`` paths.

    Returns:
        A pair of (sorted paths whose file name matches
        ``STREET_CRIME_CSV_RE``, number of other ``*.csv`` paths found).
    """
    street_csvs: list[Path] = []
    ignored = 0
    for candidate in sorted(crime_dir.rglob("*.csv")):
        # Only the file name is matched; the parent directories are irrelevant.
        if STREET_CRIME_CSV_RE.fullmatch(candidate.name):
            street_csvs.append(candidate)
        else:
            ignored += 1
    return street_csvs, ignored
|
||||
|
||||
|
||||
def transform_crime(crime_dir: Path, output_path: Path) -> None:
|
||||
csvs = sorted(crime_dir.rglob("*.csv"))
|
||||
print(f"Found {len(csvs)} CSV files across {len(list(crime_dir.iterdir()))} months")
|
||||
csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
|
||||
if not csvs:
|
||||
raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}")
|
||||
|
||||
month_count = len({path.parent.name for path in csvs})
|
||||
print(
|
||||
f"Found {len(csvs)} street crime CSV files across {month_count} months"
|
||||
+ (
|
||||
f" (ignored {ignored_csv_count} non-street CSVs)"
|
||||
if ignored_csv_count
|
||||
else ""
|
||||
)
|
||||
)
|
||||
|
||||
df = pl.scan_csv(
|
||||
csvs,
|
||||
|
|
|
|||
|
|
@ -1,6 +1,15 @@
|
|||
import argparse
|
||||
import polars as pl
|
||||
import csv
|
||||
import io
|
||||
import tempfile
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
import pyarrow as pa
|
||||
import pyarrow.csv as pa_csv
|
||||
import pyarrow.parquet as pq
|
||||
|
||||
from ..utils import fuzzy_join_on_postcode
|
||||
|
||||
|
||||
|
|
@ -8,12 +17,168 @@ pl.Config.set_tbl_cols(-1)
|
|||
|
||||
# Energy rating letters ranked from 1 ("A") to 7 ("G").
RATING_RANK = {letter: rank for rank, letter in enumerate("ABCDEFG", start=1)}

MIN_PRICE = 50_000

# Normalised (lower-case) names of the EPC certificate columns that are read,
# in the order they are written to the intermediate parquet file.
EPC_SOURCE_COLUMNS = [
    "address",
    "postcode",
    "current_energy_rating",
    "potential_energy_rating",
    "property_type",
    "built_form",
    "inspection_date",
    "total_floor_area",
    "number_habitable_rooms",
    "floor_height",
    "construction_age_band",
    "tenure",
]
|
||||
|
||||
|
||||
def _normalise_csv_columns(columns: list[str]) -> list[str]:
|
||||
return [column.strip().lower() for column in columns]
|
||||
|
||||
|
||||
def _clean_string(column: str) -> pl.Expr:
    """Build an expression that trims ``column`` and maps empty strings to null.

    The column is cast to ``pl.String`` before surrounding characters are
    stripped with ``str.strip_chars()``.
    """
    trimmed = pl.col(column).cast(pl.String).str.strip_chars()
    is_blank = trimmed == ""
    return pl.when(is_blank).then(None).otherwise(trimmed)
|
||||
|
||||
|
||||
def _clean_number(column: str, dtype: pl.DataType) -> pl.Expr:
    """Build an expression that cleans ``column`` as text, then casts it to ``dtype``.

    The cast is non-strict (``strict=False``), so it is requested not to raise
    on values that cannot be converted.
    """
    cleaned_text = _clean_string(column)
    return cleaned_text.cast(dtype, strict=False)
|
||||
|
||||
|
||||
def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
    """Project raw EPC certificates onto the cleaned columns used downstream.

    ``raw`` must expose the lower-case EPC source column names. The result
    renames ``address``/``postcode``/``property_type`` to ``epc_address``/
    ``epc_postcode``/``epc_property_type``, upper-cases the postcode and both
    energy ratings, casts the numeric columns, drops rows without an address,
    and replaces a habitable-room count of 0 with null.
    """
    text_passthrough = ("built_form", "inspection_date")
    selected = [
        _clean_string("address").alias("epc_address"),
        _clean_string("postcode").str.to_uppercase().alias("epc_postcode"),
        _clean_string("current_energy_rating")
        .str.to_uppercase()
        .alias("current_energy_rating"),
        _clean_string("potential_energy_rating")
        .str.to_uppercase()
        .alias("potential_energy_rating"),
        _clean_string("property_type").alias("epc_property_type"),
        *(_clean_string(name).alias(name) for name in text_passthrough),
        _clean_number("total_floor_area", pl.Float64).alias("total_floor_area"),
        _clean_number("number_habitable_rooms", pl.Int16).alias(
            "number_habitable_rooms"
        ),
        _clean_number("floor_height", pl.Float64).alias("floor_height"),
        _clean_string("construction_age_band").alias("construction_age_band"),
        _clean_string("tenure").alias("tenure"),
    ]

    rooms = pl.col("number_habitable_rooms")
    # A recorded room count of 0 is stored as null instead.
    zero_rooms_as_null = (
        pl.when(rooms == 0).then(None).otherwise(rooms).alias("number_habitable_rooms")
    )

    with_address = raw.select(selected).filter(pl.col("epc_address").is_not_null())
    return with_address.with_columns(zero_rooms_as_null)
|
||||
|
||||
|
||||
def _certificate_member_names(zip_file: zipfile.ZipFile) -> list[str]:
|
||||
return sorted(
|
||||
name
|
||||
for name in zip_file.namelist()
|
||||
if not name.endswith("/")
|
||||
and Path(name).name.lower().startswith("certificates")
|
||||
and name.lower().endswith(".csv")
|
||||
)
|
||||
|
||||
|
||||
def _read_zip_csv_header(zip_file: zipfile.ZipFile, member_name: str) -> list[str]:
|
||||
with zip_file.open(member_name) as member:
|
||||
text = io.TextIOWrapper(member, encoding="utf-8-sig", newline="")
|
||||
try:
|
||||
return next(csv.reader(text))
|
||||
except StopIteration as exc:
|
||||
raise ValueError(f"EPC CSV member is empty: {member_name}") from exc
|
||||
|
||||
|
||||
def _source_columns_for_header(header: list[str]) -> list[str]:
    """Map each EPC source column to its spelling in ``header``.

    Header names are compared after normalisation (stripped, lower-cased).
    When several header names normalise to the same value the last one wins;
    a column absent from the header is returned under its normalised name.
    """
    original_by_normalised = dict(zip(_normalise_csv_columns(header), header))
    return [
        original_by_normalised.get(wanted, wanted) for wanted in EPC_SOURCE_COLUMNS
    ]
|
||||
|
||||
|
||||
def _zip_certificates_to_parquet(zip_path: Path, output_path: Path) -> None:
    """Stream every certificate CSV inside an EPC zip into one parquet file.

    Only the columns in ``EPC_SOURCE_COLUMNS`` are kept, all read as strings
    and written with zstd compression. The archive is opened and checked for
    certificate members before the parquet writer is created, so an unusable
    archive does not leave an empty file at ``output_path``.

    Args:
        zip_path: Path of the EPC zip archive.
        output_path: Destination parquet path.

    Raises:
        ValueError: If ``zip_path`` is not a readable zip archive, contains no
            certificate CSV members, or one of those members is empty.
    """
    try:
        zip_file = zipfile.ZipFile(zip_path)
    except zipfile.BadZipFile as exc:
        raise ValueError(
            f"{zip_path} is not a readable EPC zip archive; re-download "
            "domestic-csv.zip and try again"
        ) from exc

    with zip_file:
        member_names = _certificate_member_names(zip_file)
        if not member_names:
            raise ValueError(f"No certificate CSV files found in {zip_path}")

        schema = pa.schema([(column, pa.string()) for column in EPC_SOURCE_COLUMNS])
        # Identical for every member, so build it once.
        read_options = pa_csv.ReadOptions(block_size=64 * 1024 * 1024)

        writer = pq.ParquetWriter(output_path, schema=schema, compression="zstd")
        try:
            for member_name in member_names:
                print(f"Reading EPC certificates from {member_name}")
                # Header spelling can differ per member, so resolve the
                # source names for each file separately.
                source_columns = _source_columns_for_header(
                    _read_zip_csv_header(zip_file, member_name)
                )
                convert_options = pa_csv.ConvertOptions(
                    include_columns=source_columns,
                    include_missing_columns=True,
                    column_types={
                        source_column: pa.string() for source_column in source_columns
                    },
                    strings_can_be_null=True,
                )

                with zip_file.open(member_name) as member:
                    reader = pa_csv.open_csv(
                        member,
                        read_options=read_options,
                        convert_options=convert_options,
                    )
                    while True:
                        try:
                            batch = reader.read_next_batch()
                        except StopIteration:
                            break

                        if batch.num_rows == 0:
                            continue

                        # Columns are requested in EPC_SOURCE_COLUMNS order
                        # via include_columns, so a positional rename maps
                        # them onto the normalised names.
                        writer.write_batch(batch.rename_columns(EPC_SOURCE_COLUMNS))
        finally:
            writer.close()
|
||||
|
||||
|
||||
def _scan_epc_certificates(epc_path: Path, temp_dir: Path) -> pl.LazyFrame:
    """Return a cleaned lazy frame of EPC certificates from a CSV file or a zip.

    A path whose suffix is ".zip" (case-insensitive) is first converted to
    ``temp_dir / "epc-certificates.parquet"`` and scanned from there. Any other
    path is scanned as CSV without schema inference, with normalised column
    names.
    """
    if epc_path.suffix.lower() != ".zip":
        csv_scan = pl.scan_csv(
            epc_path,
            infer_schema=False,
            with_column_names=_normalise_csv_columns,
        )
        return _select_epc_columns(csv_scan)

    parquet_path = temp_dir / "epc-certificates.parquet"
    _zip_certificates_to_parquet(epc_path, parquet_path)
    return _select_epc_columns(pl.scan_parquet(parquet_path))
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Fuzzy join EPC and Price Paid data")
|
||||
parser.add_argument(
|
||||
"--epc", type=Path, required=True, help="EPC certificates CSV file"
|
||||
"--epc", type=Path, required=True, help="EPC certificates CSV file or zip"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--price-paid", type=Path, required=True, help="Price paid parquet file"
|
||||
|
|
@ -23,74 +188,56 @@ def main():
|
|||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
epc_base = (
|
||||
pl.scan_csv(args.epc)
|
||||
.select(
|
||||
pl.col("ADDRESS").alias("epc_address"),
|
||||
"POSTCODE",
|
||||
"CURRENT_ENERGY_RATING",
|
||||
"POTENTIAL_ENERGY_RATING",
|
||||
pl.col("PROPERTY_TYPE").alias("epc_property_type"),
|
||||
"BUILT_FORM",
|
||||
"INSPECTION_DATE",
|
||||
"TOTAL_FLOOR_AREA",
|
||||
"NUMBER_HABITABLE_ROOMS",
|
||||
"FLOOR_HEIGHT",
|
||||
"CONSTRUCTION_AGE_BAND",
|
||||
"TENURE",
|
||||
)
|
||||
.filter(pl.col("epc_address").is_not_null())
|
||||
.with_columns(
|
||||
pl.when(pl.col("NUMBER_HABITABLE_ROOMS") == 0)
|
||||
.then(None)
|
||||
.otherwise(pl.col("NUMBER_HABITABLE_ROOMS"))
|
||||
.alias("NUMBER_HABITABLE_ROOMS"),
|
||||
)
|
||||
)
|
||||
with tempfile.TemporaryDirectory(prefix="epc_certificates_") as tmpdir:
|
||||
_run(args.epc, args.price_paid, args.output, Path(tmpdir))
|
||||
|
||||
|
||||
def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Path):
|
||||
epc_base = _scan_epc_certificates(epc_path, temp_dir)
|
||||
|
||||
# Dedup fork: keep latest certificate per property (existing logic)
|
||||
epc = (
|
||||
epc_base.sort("INSPECTION_DATE", descending=True)
|
||||
.group_by("epc_address", "POSTCODE")
|
||||
epc_base.sort("inspection_date", descending=True)
|
||||
.group_by("epc_address", "epc_postcode")
|
||||
.first()
|
||||
.drop("TENURE")
|
||||
.drop("tenure")
|
||||
)
|
||||
|
||||
# Events fork: detect renovation events between consecutive certificates
|
||||
# Collect eagerly because .over() window functions don't work in streaming
|
||||
# engine (fuzzy_join.py:50 uses sink_parquet which requires streaming).
|
||||
events = (
|
||||
epc_base.sort("INSPECTION_DATE")
|
||||
epc_base.sort("inspection_date")
|
||||
.with_columns(
|
||||
pl.col("CURRENT_ENERGY_RATING")
|
||||
pl.col("current_energy_rating")
|
||||
.replace_strict(RATING_RANK, default=None, return_dtype=pl.Int32)
|
||||
.alias("_rating_rank"),
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("NUMBER_HABITABLE_ROOMS")
|
||||
pl.col("number_habitable_rooms")
|
||||
.shift(1)
|
||||
.over("epc_address", "POSTCODE")
|
||||
.over("epc_address", "epc_postcode")
|
||||
.alias("_prev_rooms"),
|
||||
pl.col("TOTAL_FLOOR_AREA")
|
||||
pl.col("total_floor_area")
|
||||
.shift(1)
|
||||
.over("epc_address", "POSTCODE")
|
||||
.over("epc_address", "epc_postcode")
|
||||
.alias("_prev_area"),
|
||||
pl.col("_rating_rank")
|
||||
.shift(1)
|
||||
.over("epc_address", "POSTCODE")
|
||||
.over("epc_address", "epc_postcode")
|
||||
.alias("_prev_rating_rank"),
|
||||
)
|
||||
.with_columns(
|
||||
pl.when(
|
||||
pl.col("NUMBER_HABITABLE_ROOMS").is_not_null()
|
||||
pl.col("number_habitable_rooms").is_not_null()
|
||||
& pl.col("_prev_rooms").is_not_null()
|
||||
& (pl.col("NUMBER_HABITABLE_ROOMS") != pl.col("_prev_rooms"))
|
||||
& (pl.col("number_habitable_rooms") != pl.col("_prev_rooms"))
|
||||
)
|
||||
.then(pl.lit("Remodelling"))
|
||||
.when(
|
||||
pl.col("TOTAL_FLOOR_AREA").is_not_null()
|
||||
pl.col("total_floor_area").is_not_null()
|
||||
& pl.col("_prev_area").is_not_null()
|
||||
& (pl.col("TOTAL_FLOOR_AREA") > pl.col("_prev_area"))
|
||||
& (pl.col("total_floor_area") > pl.col("_prev_area"))
|
||||
)
|
||||
.then(pl.lit("Extension"))
|
||||
.when(
|
||||
|
|
@ -104,13 +251,13 @@ def main():
|
|||
)
|
||||
.filter(pl.col("_event").is_not_null())
|
||||
.with_columns(
|
||||
pl.col("INSPECTION_DATE")
|
||||
pl.col("inspection_date")
|
||||
.cast(pl.String)
|
||||
.str.slice(0, 4)
|
||||
.cast(pl.Int32)
|
||||
.alias("_event_year"),
|
||||
)
|
||||
.group_by("epc_address", "POSTCODE")
|
||||
.group_by("epc_address", "epc_postcode")
|
||||
.agg(
|
||||
pl.struct(
|
||||
pl.col("_event_year").alias("year"),
|
||||
|
|
@ -128,8 +275,8 @@ def main():
|
|||
|
||||
# Social tenure fork: flag properties that were ever social housing
|
||||
social_tenure = (
|
||||
epc_base.filter(pl.col("TENURE").str.to_lowercase().str.contains("social"))
|
||||
.select("epc_address", "POSTCODE")
|
||||
epc_base.filter(pl.col("tenure").str.to_lowercase().str.contains("social"))
|
||||
.select("epc_address", "epc_postcode")
|
||||
.unique()
|
||||
.with_columns(pl.lit("Yes").alias("was_council_house"))
|
||||
.collect()
|
||||
|
|
@ -140,12 +287,12 @@ def main():
|
|||
epc = (
|
||||
epc.join(
|
||||
events.lazy(),
|
||||
on=["epc_address", "POSTCODE"],
|
||||
on=["epc_address", "epc_postcode"],
|
||||
how="left",
|
||||
)
|
||||
.join(
|
||||
social_tenure.lazy(),
|
||||
on=["epc_address", "POSTCODE"],
|
||||
on=["epc_address", "epc_postcode"],
|
||||
how="left",
|
||||
)
|
||||
.with_columns(
|
||||
|
|
@ -167,7 +314,7 @@ def main():
|
|||
duration_map = {"F": "Freehold", "L": "Leasehold"}
|
||||
|
||||
price_paid = (
|
||||
pl.scan_parquet(args.price_paid)
|
||||
pl.scan_parquet(price_paid_path)
|
||||
.select(
|
||||
"price",
|
||||
"date_of_transfer",
|
||||
|
|
@ -219,9 +366,9 @@ def main():
|
|||
left_address_col="pp_address",
|
||||
right_address_col="epc_address",
|
||||
left_postcode_col="postcode",
|
||||
right_postcode_col="POSTCODE",
|
||||
right_postcode_col="epc_postcode",
|
||||
)
|
||||
.drop("POSTCODE")
|
||||
.drop("epc_postcode")
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
|
||||
|
|
@ -236,7 +383,7 @@ def main():
|
|||
# For new-builds (old_new == "Y"), use the first transaction date year as
|
||||
# the exact construction date; otherwise fall back to the EPC age band.
|
||||
epc_band_year = (
|
||||
pl.col("CONSTRUCTION_AGE_BAND")
|
||||
pl.col("construction_age_band")
|
||||
.str.replace("England and Wales: ", "")
|
||||
.str.replace(" onwards", "")
|
||||
.str.extract(r"(\d{4})", 1)
|
||||
|
|
@ -251,7 +398,7 @@ def main():
|
|||
pl.when(is_new_build & transfer_year.is_not_null())
|
||||
.then(transfer_year)
|
||||
.otherwise(epc_band_year)
|
||||
.alias("CONSTRUCTION_AGE_BAND"),
|
||||
.alias("construction_age_band"),
|
||||
pl.when(is_new_build & transfer_year.is_not_null())
|
||||
.then(pl.lit(0, dtype=pl.UInt8))
|
||||
.when(epc_band_year.is_not_null())
|
||||
|
|
@ -263,8 +410,8 @@ def main():
|
|||
joined = joined.rename({col: col.lower() for col in joined.columns})
|
||||
|
||||
print(joined.head())
|
||||
joined.write_parquet(args.output)
|
||||
print(f"Wrote {args.output}")
|
||||
joined.write_parquet(output_path)
|
||||
print(f"Wrote {output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -7,6 +7,15 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
|
|||
|
||||
MIN_FLOOR_AREA_M2 = 10
|
||||
|
||||
_IOD_PERCENTILE_COLUMNS = [
|
||||
"Education, Skills and Training Score",
|
||||
"Income Score (rate)",
|
||||
"Employment Score (rate)",
|
||||
"Health Deprivation and Disability Score",
|
||||
"Indoors Sub-domain Score",
|
||||
"Outdoors Sub-domain Score",
|
||||
]
|
||||
|
||||
|
||||
_AREA_COLUMNS = [
|
||||
"Postcode",
|
||||
|
|
@ -76,6 +85,24 @@ _AREA_COLUMNS = [
|
|||
]
|
||||
|
||||
|
||||
def _less_deprived_percentile_expr(column: str) -> pl.Expr:
    """Convert an IoD deprivation score to a 0-100 less-deprived percentile.

    Nulls stay null. The smallest score maps to exactly 100.0 and the largest
    to exactly 0.0; values in between are placed by their average descending
    rank among the non-null scores, rounded to one decimal. The result keeps
    the name ``column``.
    """
    score = pl.col(column)
    populated = score.count()
    rank_from_highest = score.rank("average", descending=True)
    scaled = ((rank_from_highest - 1) / (populated - 1) * 100).round(1)

    return (
        pl.when(score.is_null())
        .then(None)
        # Pin the endpoints so ties at either extreme do not land on averaged
        # ranks.
        .when(score == score.min())
        .then(100.0)
        .when(score == score.max())
        .then(0.0)
        .when(populated > 1)
        .then(scaled)
        .otherwise(100.0)
        .alias(column)
    )
|
||||
|
||||
|
||||
def _build(
|
||||
epc_pp_path: Path,
|
||||
arcgis_path: Path,
|
||||
|
|
@ -134,20 +161,11 @@ def _build(
|
|||
)
|
||||
wide = wide.join(arcgis, on="postcode", how="left")
|
||||
|
||||
iod = pl.scan_parquet(iod_path)
|
||||
iod = pl.scan_parquet(iod_path).with_columns(
|
||||
*(_less_deprived_percentile_expr(c) for c in _IOD_PERCENTILE_COLUMNS)
|
||||
)
|
||||
wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
|
||||
|
||||
# Invert deprivation scores so that higher values = less deprived (better)
|
||||
iod_score_cols = [
|
||||
"Education, Skills and Training Score",
|
||||
"Income Score (rate)",
|
||||
"Employment Score (rate)",
|
||||
"Health Deprivation and Disability Score",
|
||||
"Indoors Sub-domain Score",
|
||||
"Outdoors Sub-domain Score",
|
||||
]
|
||||
wide = wide.with_columns(*(pl.col(c).max() - pl.col(c) for c in iod_score_cols))
|
||||
|
||||
ethnicity = pl.scan_parquet(ethnicity_path)
|
||||
wide = wide.join(
|
||||
ethnicity,
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
"""Compute POI proximity counts and distances per postcode from ArcGIS + filtered POIs."""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
|
|
@ -15,9 +17,25 @@ POI_GROUPS_2KM = {
|
|||
"groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
|
||||
}
|
||||
|
||||
# Groups for which to compute distance to nearest POI (from filtered POIs)
|
||||
# Groups for which to compute distance to nearest POI (from filtered POIs).
|
||||
# Keep `train_tube` for the existing backend feature; the individual POI
|
||||
# distance filters below power the frontend dropdown.
|
||||
# Maps each distance group key to the POI categories it covers.
DISTANCE_GROUPS = {
    # Combined groups.
    "train_tube": ["Tube station", "Rail station"],
    "grocery_store": [
        "Greengrocer",
        "Supermarket",
        "Convenience Store",
        "Waitrose",
        "Tesco",
    ],
    # Single-category groups.
    "tube_station": ["Tube station"],
    "rail_station": ["Rail station"],
    "waitrose": ["Waitrose"],
    "tesco": ["Tesco"],
    "cafe": ["Café"],
    "pub": ["Pub"],
    "restaurant": ["Restaurant"],
}
|
||||
|
||||
# OS Open Greenspace function types used for park counts and distance calculation.
|
||||
|
|
@ -27,6 +45,69 @@ GREENSPACE_PARK_FUNCTIONS = {
|
|||
"parks": ["Public Park Or Garden", "Playing Field", "Play Space"],
|
||||
}
|
||||
|
||||
# A category in a count-threshold group needs strictly more POIs than this.
GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
# POI groups whose categories all get a dynamic filter.
DYNAMIC_FILTER_ALL_GROUPS = {"Leisure", "Public Transport"}
# POI groups whose categories get a dynamic filter only above the threshold.
DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS = {"Groceries"}
|
||||
|
||||
|
||||
def _poi_category_slug(category: str) -> str:
|
||||
ascii_text = (
|
||||
unicodedata.normalize("NFKD", category)
|
||||
.encode("ascii", "ignore")
|
||||
.decode("ascii")
|
||||
.lower()
|
||||
)
|
||||
slug = re.sub(r"[^a-z0-9]+", "_", ascii_text).strip("_")
|
||||
return slug or "poi"
|
||||
|
||||
|
||||
def _build_poi_category_groups(
    pois: pl.DataFrame,
) -> tuple[dict[str, list[str]], dict[str, str]]:
    """Build one proximity group for each POI category selected for filters.

    A category is selected when its group is in ``DYNAMIC_FILTER_ALL_GROUPS``,
    or when its group is in ``DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS`` and the
    (group, category) pair has more than ``GROCERY_DYNAMIC_FILTER_MIN_POIS``
    rows. Null, non-string and empty categories are skipped.

    Args:
        pois: POI rows with ``group`` and ``category`` columns.

    Returns:
        ``(groups, display_names)``. ``groups`` maps a unique key of the form
        ``poi_<slug>`` (with a ``_<n>`` suffix on slug collisions) to a
        one-element list holding the category; ``display_names`` maps the same
        key to the category name.

    Raises:
        ValueError: If ``pois`` has no ``group`` column.
    """
    if "group" not in pois.columns:
        raise ValueError("POI dataframe must include a 'group' column")

    in_keep_all_group = pl.col("group").is_in(list(DYNAMIC_FILTER_ALL_GROUPS))
    in_threshold_group = pl.col("group").is_in(
        list(DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS)
    )
    above_threshold = pl.col("len") > GROCERY_DYNAMIC_FILTER_MIN_POIS

    categories = (
        pois.group_by("group", "category")
        .len()
        .filter(in_keep_all_group | (in_threshold_group & above_threshold))
        .select("category")
        # A category can qualify under more than one group; keep it once so it
        # does not receive a second, suffixed key for the same POIs.
        .unique()
        .sort("category")
        .to_series()
        .to_list()
    )

    groups: dict[str, list[str]] = {}
    display_names: dict[str, str] = {}

    for category in categories:
        if not isinstance(category, str) or not category:
            continue
        base_slug = f"poi_{_poi_category_slug(category)}"
        # Probe for a free key: a suffixed key such as "poi_x_2" may already be
        # taken by a different category whose own slug is "x_2".
        group_key = base_slug
        suffix = 1
        while group_key in groups:
            suffix += 1
            group_key = f"{base_slug}_{suffix}"
        groups[group_key] = [category]
        display_names[group_key] = category

    return groups, display_names
|
||||
|
||||
|
||||
def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
|
||||
renames: dict[str, str] = {}
|
||||
for group_key, category in display_names.items():
|
||||
renames[f"{group_key}_nearest_km"] = f"Distance to nearest {category} POI (km)"
|
||||
renames[f"{group_key}_2km"] = f"Number of {category} POIs within 2km"
|
||||
renames[f"{group_key}_5km"] = f"Number of {category} POIs within 5km"
|
||||
return renames
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
|
|
@ -56,12 +137,35 @@ def main():
|
|||
)
|
||||
|
||||
pois = pl.read_parquet(args.pois)
|
||||
poi_category_groups, poi_display_names = _build_poi_category_groups(pois)
|
||||
|
||||
# Count amenity POIs within 2km
|
||||
counts_2km = count_pois_per_postcode(
|
||||
postcodes, pois, groups=POI_GROUPS_2KM, radius_km=2
|
||||
)
|
||||
|
||||
# Dynamic POI filters: nearest distance plus counts within 2km and 5km for
|
||||
# the selected public transport, grocery, and leisure categories.
|
||||
dynamic_counts_2km = count_pois_per_postcode(
|
||||
postcodes, pois, groups=poi_category_groups, radius_km=2
|
||||
)
|
||||
dynamic_counts_5km = count_pois_per_postcode(
|
||||
postcodes, pois, groups=poi_category_groups, radius_km=5
|
||||
)
|
||||
dynamic_distances = min_distance_per_postcode(
|
||||
postcodes, pois, groups=poi_category_groups
|
||||
)
|
||||
dynamic_renames = _dynamic_poi_metric_renames(poi_display_names)
|
||||
dynamic_counts_2km = dynamic_counts_2km.rename(
|
||||
{k: v for k, v in dynamic_renames.items() if k in dynamic_counts_2km.columns}
|
||||
)
|
||||
dynamic_counts_5km = dynamic_counts_5km.rename(
|
||||
{k: v for k, v in dynamic_renames.items() if k in dynamic_counts_5km.columns}
|
||||
)
|
||||
dynamic_distances = dynamic_distances.rename(
|
||||
{k: v for k, v in dynamic_renames.items() if k in dynamic_distances.columns}
|
||||
)
|
||||
|
||||
# Distance to nearest train/tube station (from filtered POIs)
|
||||
distances = min_distance_per_postcode(postcodes, pois, groups=DISTANCE_GROUPS)
|
||||
|
||||
|
|
@ -77,6 +181,9 @@ def main():
|
|||
# Join all results on postcode
|
||||
result = (
|
||||
counts_2km.join(distances, on="postcode")
|
||||
.join(dynamic_counts_2km, on="postcode")
|
||||
.join(dynamic_counts_5km, on="postcode")
|
||||
.join(dynamic_distances, on="postcode")
|
||||
.join(park_counts_1km, on="postcode")
|
||||
.join(park_distances, on="postcode")
|
||||
)
|
||||
|
|
|
|||
47
pipeline/transform/test_crime.py
Normal file
47
pipeline/transform/test_crime.py
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.transform.crime import find_street_crime_csvs, transform_crime
|
||||
|
||||
|
||||
def test_find_street_crime_csvs_ignores_archive_sidecars(tmp_path):
    """Only the ``*-street.csv`` file is returned; the other CSVs are counted."""
    crime_dir = tmp_path / "crime"
    month_dir = crime_dir / "2024-01"
    month_dir.mkdir(parents=True)

    street = month_dir / "2024-01-test-force-street.csv"
    sidecars = [
        month_dir / "2024-01-test-force-outcomes.csv",
        month_dir / "2024-01-test-force-stop-and-search.csv",
        crime_dir / "notes.csv",
    ]
    for path in [street, *sidecars]:
        path.touch()

    csvs, ignored_count = find_street_crime_csvs(crime_dir)

    assert csvs == [street]
    assert ignored_count == 3
|
||||
|
||||
|
||||
def test_transform_crime_reads_only_street_crime_csvs(tmp_path):
    """``transform_crime`` succeeds when an outcomes CSV sits beside the street CSV."""
    crime_dir = tmp_path / "crime"
    month_dir = crime_dir / "2024-01"
    month_dir.mkdir(parents=True)

    street_lines = [
        "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context",
        "1,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
        "2,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
        "3,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,,No LSOA,Robbery,Under investigation,",
    ]
    street_csv = month_dir / "2024-01-test-force-street.csv"
    street_csv.write_text("\n".join(street_lines) + "\n")

    # Sidecar with a different schema.
    outcomes_csv = month_dir / "2024-01-test-force-outcomes.csv"
    outcomes_csv.write_text(
        "Crime ID,Month,Reported by,Outcome type\n1,2024-01,Test Force,Charged\n"
    )

    output = tmp_path / "crime.parquet"
    transform_crime(crime_dir, output)

    rows = pl.read_parquet(output).to_dicts()

    assert rows == [{"LSOA code": "E01000001", "Burglary (avg/yr)": 2.0}]
|
||||
174
pipeline/transform/test_join_epc_pp.py
Normal file
174
pipeline/transform/test_join_epc_pp.py
Normal file
|
|
@ -0,0 +1,174 @@
|
|||
import csv
|
||||
import io
|
||||
import zipfile
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform.join_epc_pp import (
|
||||
EPC_SOURCE_COLUMNS,
|
||||
_run,
|
||||
_scan_epc_certificates,
|
||||
)
|
||||
|
||||
|
||||
def _write_csv(path: Path, fieldnames: list[str], rows: list[dict[str, str]]) -> None:
|
||||
with path.open("w", newline="") as file:
|
||||
writer = csv.DictWriter(file, fieldnames=fieldnames)
|
||||
writer.writeheader()
|
||||
writer.writerows(rows)
|
||||
|
||||
|
||||
def _row(**overrides: str) -> dict[str, str]:
|
||||
row = {
|
||||
"address": "1 Example Street",
|
||||
"postcode": " aa1 1aa ",
|
||||
"current_energy_rating": "c",
|
||||
"potential_energy_rating": "b",
|
||||
"property_type": "House",
|
||||
"built_form": "Mid-Terrace",
|
||||
"inspection_date": "2024-01-02",
|
||||
"total_floor_area": "84.5",
|
||||
"number_habitable_rooms": "5",
|
||||
"floor_height": "2.4",
|
||||
"construction_age_band": "England and Wales: 1950-1966",
|
||||
"tenure": "owner-occupied",
|
||||
}
|
||||
row.update(overrides)
|
||||
return row
|
||||
|
||||
|
||||
def test_scan_epc_certificates_supports_legacy_uppercase_csv(tmp_path: Path):
    """A CSV with upper-case headers is read, cleaned and typed as expected."""
    csv_path = tmp_path / "certificates.csv"
    upper_fieldnames = [column.upper() for column in EPC_SOURCE_COLUMNS]
    # A room count of "0" is expected to come back as None.
    upper_row = {
        column.upper(): value
        for column, value in _row(number_habitable_rooms="0").items()
    }
    _write_csv(csv_path, upper_fieldnames, [upper_row])

    df = _scan_epc_certificates(csv_path, tmp_path).collect()

    expected = {
        "epc_address": "1 Example Street",
        "epc_postcode": "AA1 1AA",
        "current_energy_rating": "C",
        "potential_energy_rating": "B",
        "epc_property_type": "House",
        "built_form": "Mid-Terrace",
        "inspection_date": "2024-01-02",
        "total_floor_area": 84.5,
        "number_habitable_rooms": None,
        "floor_height": 2.4,
        "construction_age_band": "England and Wales: 1950-1966",
        "tenure": "owner-occupied",
    }
    assert df.to_dicts() == [expected]
|
||||
|
||||
|
||||
def test_scan_epc_certificates_supports_domestic_zip(tmp_path: Path):
    """Certificate members of a zip are combined; other members are ignored."""
    members = {
        "certificates-2023.csv": [
            _row(address="2 Example Street", inspection_date="2023-03-04"),
        ],
        "nested/certificates-2024.csv": [
            _row(
                address="3 Example Street",
                postcode="BB2 2BB",
                inspection_date="2024-05-06",
                total_floor_area="",
                tenure="Rented (social)",
            ),
        ],
    }
    header_line = ",".join(EPC_SOURCE_COLUMNS)

    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        for member_name, rows in members.items():
            lines = [header_line]
            for row in rows:
                lines.append(",".join(row[column] for column in EPC_SOURCE_COLUMNS))
            archive.writestr(member_name, "\n".join(lines) + "\n")
        # Does not start with "certificates", so it must not be read.
        archive.writestr("recommendations-2024.csv", "address,postcode\nignored,X\n")

    df = _scan_epc_certificates(zip_path, tmp_path).sort("inspection_date").collect()

    summary = df.select("epc_address", "epc_postcode", "total_floor_area").to_dicts()
    assert summary == [
        {
            "epc_address": "2 Example Street",
            "epc_postcode": "AA1 1AA",
            "total_floor_area": 84.5,
        },
        {
            "epc_address": "3 Example Street",
            "epc_postcode": "BB2 2BB",
            "total_floor_area": None,
        },
    ]
    assert df.get_column("tenure").to_list() == ["owner-occupied", "Rented (social)"]
    assert df.schema["number_habitable_rooms"] == pl.Int16
|
||||
|
||||
|
||||
def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
    """End-to-end: two certificates for one address join to a single sale."""
    certificates = [
        _row(
            current_energy_rating="d",
            inspection_date="2023-01-01",
            total_floor_area="80",
            tenure="Rented (social)",
        ),
        _row(
            current_energy_rating="c",
            inspection_date="2024-01-01",
            total_floor_area="85",
            tenure="owner-occupied",
        ),
    ]
    csv_buffer = io.StringIO()
    writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
    writer.writeheader()
    writer.writerows(certificates)

    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())

    sale = {
        "price": [250_000],
        "date_of_transfer": [date(2024, 2, 3)],
        "property_type": ["T"],
        "postcode": ["AA1 1AA"],
        "paon": ["1"],
        "saon": [None],
        "street": ["Example Street"],
        "locality": [None],
        "town_city": ["Exampletown"],
        "duration": ["F"],
        "old_new": ["N"],
    }
    price_paid_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(sale).write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    df = pl.read_parquet(output_path)

    assert df.height == 1
    checked = df.select(
        "epc_address",
        "current_energy_rating",
        "total_floor_area",
        "construction_age_band",
        "was_council_house",
    ).to_dicts()
    assert checked == [
        {
            "epc_address": "1 Example Street",
            "current_energy_rating": "C",
            "total_floor_area": 85.0,
            "construction_age_band": 1950,
            "was_council_house": "Yes",
        }
    ]
    assert df.get_column("renovation_history").list.len().to_list() == [1]
|
||||
33
pipeline/transform/test_merge.py
Normal file
33
pipeline/transform/test_merge.py
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.transform.merge import (
|
||||
_is_dynamic_poi_metric_column,
|
||||
_less_deprived_percentile_expr,
|
||||
)
|
||||
|
||||
|
||||
def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
    """Lowest score maps to 100, highest to 0, and nulls stay null."""
    column = "Income Score (rate)"
    frame = pl.DataFrame({column: [1.0, 2.0, 3.0, None]})

    percentile = _less_deprived_percentile_expr(column)
    result = frame.lazy().with_columns(percentile).collect()

    assert result[column].to_list() == [100.0, 50.0, 0.0, None]
|
||||
|
||||
|
||||
def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
    """Tied minimum and maximum scores map to exactly 100 and 0."""
    column = "Income Score (rate)"
    frame = pl.DataFrame({column: [1.0, 1.0, 2.0, 3.0, 3.0]})

    percentile = _less_deprived_percentile_expr(column)
    result = frame.lazy().with_columns(percentile).collect()

    assert result[column].to_list() == [100.0, 100.0, 50.0, 0.0, 0.0]
|
||||
|
||||
|
||||
def test_dynamic_poi_metric_columns_are_area_level() -> None:
    """Dynamic POI display names are recognised; a fixed metric name is not."""
    dynamic_names = [
        "Distance to nearest Cafe POI (km)",
        "Number of Cafe POIs within 2km",
        "Number of Cafe POIs within 5km",
    ]
    for name in dynamic_names:
        assert _is_dynamic_poi_metric_column(name)

    assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")
|
||||
41
pipeline/transform/test_poi_proximity.py
Normal file
41
pipeline/transform/test_poi_proximity.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.transform.poi_proximity import _build_poi_category_groups
|
||||
|
||||
|
||||
def test_dynamic_poi_groups_include_requested_categories_only() -> None:
    """Only the transport, leisure and Tesco categories come back as POI groups."""
    # (group, category, number of rows), in the order the rows appear in the frame.
    # NOTE(review): Tesco (101 rows) is expected while Waitrose (100 rows) is not,
    # which looks like a row-count threshold in the builder -- confirm there.
    layout = [
        ("Public Transport", "Rail station", 1),
        ("Public Transport", "Bus stop", 1),
        ("Leisure", "Café", 1),
        ("Leisure", "Restaurant", 1),
        ("Groceries", "Tesco", 101),
        ("Groceries", "Waitrose", 100),
        ("Education", "School", 200),
        ("Health", "Pharmacy", 200),
    ]
    group_column: list[str] = []
    category_column: list[str] = []
    for group_name, category_name, row_count in layout:
        group_column += [group_name] * row_count
        category_column += [category_name] * row_count

    total_rows = len(group_column)  # 605 rows in total
    pois = pl.DataFrame(
        {
            "group": group_column,
            "category": category_column,
            "lat": [51.5] * total_rows,
            "lng": [-0.1] * total_rows,
        }
    )

    groups, display_names = _build_poi_category_groups(pois)

    expected_display_names = {
        "Bus stop",
        "Café",
        "Rail station",
        "Restaurant",
        "Tesco",
    }
    assert set(display_names.values()) == expected_display_names
    assert "poi_waitrose" not in groups
    assert "poi_school" not in groups
    assert "poi_pharmacy" not in groups
|
||||
|
|
@ -79,6 +79,33 @@ def test_transform_grocery_retail_points_keeps_fascia_icon_category():
|
|||
]
|
||||
|
||||
|
||||
def test_transform_grocery_retail_points_accepts_base_fascias():
    """Rows whose fascia is the retailer's base name (or a known variant) transform cleanly."""
    source_rows = pl.DataFrame(
        {
            "id": [101, 102, 103, 104],
            "retailer": ["Aldi", "Asda", "Booths", "Whole Foods Market"],
            "fascia": ["Aldi", "Asda Superstore", "Booths", "Whole Foods Market"],
            "store_name": [
                "Aldi Test",
                "Asda Test Superstore",
                "Booths Test",
                "Whole Foods Test",
            ],
            "long_wgs": [-0.141, -0.142, -0.143, -0.144],
            "lat_wgs": [51.515, 51.516, 51.517, 51.518],
        }
    )
    # category follows the retailer; icon_category follows the fascia.
    expected = [
        {"category": "Aldi", "icon_category": "Aldi"},
        {"category": "Asda", "icon_category": "Asda Superstore"},
        {"category": "Booths", "icon_category": "Booths"},
        {"category": "Whole Foods Market", "icon_category": "Whole Foods Market"},
    ]

    transformed = transform_grocery_retail_points(source_rows)
    observed = transformed.select("category", "icon_category").to_dicts()

    assert observed == expected
|
||||
|
||||
|
||||
def test_transform_grocery_retail_points_drops_invalid_rows():
|
||||
raw = pl.DataFrame(
|
||||
{
|
||||
|
|
|
|||
|
|
@ -1078,19 +1078,40 @@ COOP_RETAILERS = {
|
|||
}
|
||||
|
||||
# Retailer name -> display name. Every member of COOP_RETAILERS is folded into
# the single "Co-op" label; those entries are merged last, so they take
# precedence over any identical key listed explicitly here.
GROCERY_RETAILER_DISPLAY_NAMES: dict[str, str] = {
    "Aldi": "Aldi",
    "Asda": "Asda",
    "Booths": "Booths",
    "Budgens": "Budgens",
    "Centra": "Centra",
    "Cook": "COOK",
    "Costco": "Costco",
    "Dunnes Stores": "Dunnes Stores",
    "Farmfoods": "Farmfoods",
    "Heron": "Heron Foods",
    "Iceland": "Iceland",
    "Lidl": "Lidl",
    "Makro": "Makro",
    "Marks and Spencer": "M&S",
    "Morrisons": "Morrisons",
    "Planet Organic": "Planet Organic",
    "Sainsburys": "Sainsbury's",
    "Spar": "Spar",
    "Tesco": "Tesco",
    "Waitrose": "Waitrose",
    "Whole Foods Market": "Whole Foods Market",
} | dict.fromkeys(COOP_RETAILERS, "Co-op")
|
||||
|
||||
|
||||
GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
|
||||
**GROCERY_RETAILER_DISPLAY_NAMES,
|
||||
"Aldi Local": "Aldi",
|
||||
"Asda Express": "Asda Express",
|
||||
"Asda Living": "Asda Living",
|
||||
"Asda PFS": "Asda PFS",
|
||||
"Asda Supercentre": "Asda Supercentre",
|
||||
"Asda Supermarket": "Asda Supermarket",
|
||||
"Asda Superstore": "Asda Superstore",
|
||||
"Cooltrader": "Heron Foods",
|
||||
"Co-op Food": "Co-op",
|
||||
"Cook": "COOK",
|
||||
|
|
@ -1112,6 +1133,7 @@ GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
|
|||
"Marks and Spencer Travel SF": "M&S Food",
|
||||
"Morrisons Daily": "Morrisons Daily",
|
||||
"Morrisons Select": "Morrisons",
|
||||
"Sainsbury's Local": "Sainsbury's Local",
|
||||
"Sainsburys": "Sainsbury's",
|
||||
"Sainsburys Local": "Sainsbury's Local",
|
||||
"Spar PFS": "Spar",
|
||||
|
|
@ -1128,12 +1150,18 @@ GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
|
|||
def normalize_grocery_retailer(retailer: str | None) -> str:
|
||||
if retailer is None:
|
||||
return ""
|
||||
return GROCERY_RETAILER_DISPLAY_NAMES.get(retailer, retailer)
|
||||
display_name = GROCERY_RETAILER_DISPLAY_NAMES.get(retailer)
|
||||
if display_name is None:
|
||||
raise ValueError(f"Missing grocery retailer display name for {retailer!r}")
|
||||
return display_name
|
||||
|
||||
|
||||
def normalize_grocery_icon_category(fascia: str | None, retailer: str | None) -> str:
    """Return the icon category for a store, preferring its fascia over its retailer.

    Args:
        fascia: Fascia name to look up in ``GROCERY_FASCIA_ICON_NAMES``. ``None``
            or an empty string means "no fascia".
        retailer: Retailer name used as the fallback when there is no fascia.

    Returns:
        The icon name mapped to *fascia* when one is given; otherwise the result
        of ``normalize_grocery_retailer(retailer)``.

    Raises:
        ValueError: If a non-empty *fascia* has no entry in
            ``GROCERY_FASCIA_ICON_NAMES``.
    """
    if fascia:
        # Strict lookup: an unmapped fascia is an error rather than being
        # passed on to the retailer mapping.
        icon_name = GROCERY_FASCIA_ICON_NAMES.get(fascia)
        if icon_name is None:
            raise ValueError(f"Missing grocery fascia icon name for {fascia!r}")
        return icon_name
    return normalize_grocery_retailer(retailer)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue