LGTM
This commit is contained in:
parent
a8165249a4
commit
a4103b0896
64 changed files with 5376 additions and 3832 deletions
|
|
@ -17,16 +17,17 @@ PARTY_MAP = {
|
|||
"Reform UK": "Reform UK",
|
||||
"Green Party": "Green",
|
||||
}
|
||||
PARTY_GROUPS = [
|
||||
"Labour",
|
||||
"Conservative",
|
||||
"Liberal Democrat",
|
||||
"Reform UK",
|
||||
"Green",
|
||||
"Other parties",
|
||||
]
|
||||
|
||||
|
||||
def download_and_convert(output_path: Path) -> None:
|
||||
print("Downloading 2024 General Election results...")
|
||||
response = httpx.get(URL, follow_redirects=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
|
||||
df = pl.read_csv(response.content)
|
||||
print(f"Raw shape: {df.shape}")
|
||||
|
||||
def _convert_results(df: pl.DataFrame) -> pl.DataFrame:
|
||||
# Filter to England only (constituency codes starting with E14)
|
||||
df = df.filter(pl.col("Constituency geographic code").str.starts_with("E14"))
|
||||
|
||||
|
|
@ -70,9 +71,27 @@ def download_and_convert(output_path: Path) -> None:
|
|||
# Rename columns to "% Party" format
|
||||
rename_map = {col: f"% {col}" for col in party_pct.columns if col != "pcon"}
|
||||
party_pct = party_pct.rename(rename_map)
|
||||
for party in PARTY_GROUPS:
|
||||
col = f"% {party}"
|
||||
if col not in party_pct.columns:
|
||||
party_pct = party_pct.with_columns(pl.lit(0.0).alias(col))
|
||||
party_pct = party_pct.with_columns(
|
||||
[pl.col(f"% {party}").fill_null(0.0) for party in PARTY_GROUPS]
|
||||
)
|
||||
|
||||
# Join turnout with party vote shares
|
||||
result = turnout.join(party_pct, on="pcon", how="left")
|
||||
return turnout.join(party_pct, on="pcon", how="left")
|
||||
|
||||
|
||||
def download_and_convert(output_path: Path) -> None:
|
||||
print("Downloading 2024 General Election results...")
|
||||
response = httpx.get(URL, follow_redirects=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
|
||||
df = pl.read_csv(response.content)
|
||||
print(f"Raw shape: {df.shape}")
|
||||
|
||||
result = _convert_results(df)
|
||||
|
||||
print(f"Constituencies: {result.height}")
|
||||
print(f"Columns: {result.columns}")
|
||||
|
|
|
|||
|
|
@ -9,15 +9,32 @@ pl.Config.set_tbl_cols(-1)
|
|||
|
||||
URL = "https://www.ethnicity-facts-figures.service.gov.uk/uk-population-by-ethnicity/national-and-regional-populations/regional-ethnic-diversity/latest/downloads/population-by-ethnicity-and-local-authority-2021.csv"
|
||||
|
||||
GEOGRAPHY_CODE_REPLACEMENTS = {
|
||||
# 2023 Cumberland unitary authority
|
||||
"E07000026": "E06000063", # Allerdale
|
||||
"E07000028": "E06000063", # Carlisle
|
||||
"E07000029": "E06000063", # Copeland
|
||||
# 2023 Westmorland and Furness unitary authority
|
||||
"E07000027": "E06000064", # Barrow-in-Furness
|
||||
"E07000030": "E06000064", # Eden
|
||||
"E07000031": "E06000064", # South Lakeland
|
||||
# 2023 North Yorkshire unitary authority
|
||||
"E07000163": "E06000065", # Craven
|
||||
"E07000164": "E06000065", # Hambleton
|
||||
"E07000165": "E06000065", # Harrogate
|
||||
"E07000166": "E06000065", # Richmondshire
|
||||
"E07000167": "E06000065", # Ryedale
|
||||
"E07000168": "E06000065", # Scarborough
|
||||
"E07000169": "E06000065", # Selby
|
||||
# 2023 Somerset unitary authority
|
||||
"E07000187": "E06000066", # Mendip
|
||||
"E07000188": "E06000066", # Sedgemoor
|
||||
"E07000189": "E06000066", # South Somerset
|
||||
"E07000246": "E06000066", # Somerset West and Taunton
|
||||
}
|
||||
|
||||
def download_and_convert(output_path: Path) -> None:
|
||||
print("Downloading ethnicity data...")
|
||||
response = httpx.get(URL, follow_redirects=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
|
||||
df = pl.read_csv(response.content)
|
||||
print(f"Raw shape: {df.head(100)}")
|
||||
|
||||
def _ethnicity_percentages(df: pl.DataFrame) -> pl.DataFrame:
|
||||
# Use the detailed 19+1 breakdown to get sub-categories for Asian ethnicity,
|
||||
# then aggregate back to the broad groups plus South Asian / East Asian split.
|
||||
detailed = df.filter(
|
||||
|
|
@ -55,11 +72,20 @@ def download_and_convert(output_path: Path) -> None:
|
|||
|
||||
detailed = detailed.with_columns(
|
||||
pl.col("Ethnicity").replace_strict(group_map).alias("group"),
|
||||
pl.col("Geography_code")
|
||||
.replace(GEOGRAPHY_CODE_REPLACEMENTS)
|
||||
.alias("output_geography_code"),
|
||||
pl.col("Ethnic Population").cast(pl.Float64, strict=False).alias("_population"),
|
||||
)
|
||||
|
||||
# Sum percentages within each group per local authority (keep full precision)
|
||||
grouped = detailed.group_by("Geography_code", "group").agg(pl.col("Value1").sum())
|
||||
wide = grouped.pivot(on="group", index="Geography_code", values="Value1")
|
||||
# Sum counts, not rounded percentages, so old districts can be safely
|
||||
# recombined into their current unitary authorities.
|
||||
grouped = detailed.group_by("output_geography_code", "group").agg(
|
||||
pl.col("_population").sum()
|
||||
)
|
||||
wide = grouped.pivot(
|
||||
on="group", index="output_geography_code", values="_population"
|
||||
).rename({"output_geography_code": "Geography_code"})
|
||||
|
||||
# Normalize so each row sums to exactly 100%, then round using largest-remainder
|
||||
# method to preserve the sum. Independent rounding of 6 values can drift ±0.3.
|
||||
|
|
@ -89,6 +115,18 @@ def download_and_convert(output_path: Path) -> None:
|
|||
# Rename columns to be descriptive
|
||||
rename_map = {col: f"% {col}" for col in wide.columns if col != "Geography_code"}
|
||||
wide = wide.rename(rename_map)
|
||||
return wide
|
||||
|
||||
|
||||
def download_and_convert(output_path: Path) -> None:
|
||||
print("Downloading ethnicity data...")
|
||||
response = httpx.get(URL, follow_redirects=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
|
||||
df = pl.read_csv(response.content)
|
||||
print(f"Raw shape: {df.head(100)}")
|
||||
|
||||
wide = _ethnicity_percentages(df)
|
||||
|
||||
print(f"Output shape: {wide.shape}")
|
||||
print(f"Columns: {wide.columns}")
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
"""Download Defra Round 4 (2022) strategic noise data for England.
|
||||
|
||||
Downloads modelled noise levels (road, rail, airport) as GeoTIFF rasters via
|
||||
WCS, then samples noise values at postcode centroids. Outputs a parquet file
|
||||
with postcode-level noise in dB for each source.
|
||||
WCS, then samples the local maximum around each postcode representative point.
|
||||
Outputs a parquet file with postcode-level noise in dB for each source.
|
||||
|
||||
Uses 100km tiles (~42 per source) to balance request size vs count. The server
|
||||
times out on tiles larger than ~150km at 100m resolution.
|
||||
Uses smaller 20km tiles at native 10m resolution so values are not understated
|
||||
by a single coarse centroid sample.
|
||||
|
||||
Data source: Defra Strategic Noise Mapping Round 4 (2022)
|
||||
- Lden = day-evening-night 24h weighted average (the EU standard metric)
|
||||
|
|
@ -17,6 +17,7 @@ endpoint is broken for that coverage).
|
|||
"""
|
||||
|
||||
import argparse
|
||||
import math
|
||||
import tempfile
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
|
@ -27,8 +28,8 @@ import numpy as np
|
|||
import polars as pl
|
||||
import rasterio
|
||||
from pyproj import Transformer
|
||||
from rasterio.merge import merge
|
||||
from rasterio.transform import rowcol
|
||||
from scipy.ndimage import maximum_filter
|
||||
|
||||
# Noise sources:
|
||||
# (label, column_name, WCS base URL, coverage ID, WCS version, allow_missing_tiles)
|
||||
|
|
@ -67,8 +68,9 @@ BNG_MAX_E = 660_000
|
|||
BNG_MIN_N = 0
|
||||
BNG_MAX_N = 660_000
|
||||
|
||||
# Tile size in metres (100km balances request size vs count; 300km causes 504s)
|
||||
TILE_SIZE = 100_000
|
||||
# Tile size in metres. At 10m resolution, 20km tiles are ~4M pixels each,
|
||||
# small enough to process one at a time without mosaicking all England.
|
||||
TILE_SIZE = 20_000
|
||||
|
||||
# Max concurrent tile downloads
|
||||
MAX_WORKERS = 4
|
||||
|
|
@ -76,19 +78,27 @@ MAX_WORKERS = 4
|
|||
# Native raster resolution (10m grid)
|
||||
NATIVE_RESOLUTION = 10
|
||||
|
||||
# Request pixel resolution in metres (100m is sufficient for postcode-level data
|
||||
# and keeps download size ~100x smaller than native 10m)
|
||||
RESOLUTION = 100
|
||||
# Request pixel resolution in metres.
|
||||
RESOLUTION = NATIVE_RESOLUTION
|
||||
|
||||
# The pipeline has postcode representative points rather than complete unit
|
||||
# polygons here. Use a small local footprint and take the maximum 10m cell so
|
||||
# postcode-level noise is not understated by centroid rounding.
|
||||
POSTCODE_NOISE_RADIUS_M = 50
|
||||
|
||||
# Retry/split behaviour for slow Defra WCS requests. Some 100km eastern tiles
|
||||
# intermittently return 504s; smaller fallback requests usually succeed.
|
||||
MAX_RETRIES = 3
|
||||
RETRY_BACKOFF_SECONDS = 5
|
||||
MIN_TILE_SIZE = 25_000
|
||||
MIN_TILE_SIZE = 5_000
|
||||
|
||||
type Tile = tuple[int, int, int, int]
|
||||
|
||||
|
||||
class NoGeoTiffError(RuntimeError):
|
||||
"""Raised when WCS returns an XML/HTML exception or another non-raster body."""
|
||||
|
||||
|
||||
def _wcs_get_coverage_url(
|
||||
wcs_base: str,
|
||||
coverage_id: str,
|
||||
|
|
@ -144,8 +154,8 @@ def _fetch_tile_bytes(
|
|||
max_e: int,
|
||||
max_n: int,
|
||||
wcs_version: str = "1.0.0",
|
||||
) -> bytes | None:
|
||||
"""Fetch one WCS tile. Returns None when the server reports no GeoTIFF."""
|
||||
) -> bytes:
|
||||
"""Fetch one WCS tile."""
|
||||
url = _wcs_get_coverage_url(
|
||||
wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
|
||||
)
|
||||
|
|
@ -154,7 +164,11 @@ def _fetch_tile_bytes(
|
|||
resp.raise_for_status()
|
||||
|
||||
if not _looks_like_tiff(resp):
|
||||
return None
|
||||
content_type = resp.headers.get("content-type", "<missing>")
|
||||
body_preview = resp.text[:200].replace("\n", " ")
|
||||
raise NoGeoTiffError(
|
||||
f"WCS returned non-GeoTIFF response ({content_type}): {body_preview}"
|
||||
)
|
||||
return resp.content
|
||||
|
||||
|
||||
|
|
@ -200,11 +214,14 @@ def _download_tile(
|
|||
content = _fetch_tile_bytes(
|
||||
wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
|
||||
)
|
||||
if content is None:
|
||||
return [], []
|
||||
tile_path.write_bytes(content)
|
||||
return [tile_path], []
|
||||
except (httpx.HTTPStatusError, httpx.TimeoutException, httpx.ConnectError) as e:
|
||||
except (
|
||||
NoGeoTiffError,
|
||||
httpx.HTTPStatusError,
|
||||
httpx.TimeoutException,
|
||||
httpx.ConnectError,
|
||||
) as e:
|
||||
last_error = e
|
||||
if attempt < MAX_RETRIES:
|
||||
sleep_for = RETRY_BACKOFF_SECONDS * attempt
|
||||
|
|
@ -323,35 +340,62 @@ def sample_noise_at_postcodes(
|
|||
label: str,
|
||||
col_name: str,
|
||||
) -> pl.Series:
|
||||
"""Sample noise values from merged tiles at given BNG coordinates."""
|
||||
print(f"[{label}] Merging {len(tile_paths)} tiles...")
|
||||
datasets = [rasterio.open(p) for p in tile_paths]
|
||||
raster_nodata = datasets[0].nodata
|
||||
mosaic, mosaic_transform = merge(datasets)
|
||||
for ds in datasets:
|
||||
ds.close()
|
||||
|
||||
noise_grid = mosaic[0]
|
||||
|
||||
print(f"[{label}] Sampling noise values at postcode centroids...")
|
||||
rows, cols = rowcol(mosaic_transform, easting, northing)
|
||||
rows = np.asarray(rows)
|
||||
cols = np.asarray(cols)
|
||||
|
||||
h, w = noise_grid.shape
|
||||
in_bounds = (rows >= 0) & (rows < h) & (cols >= 0) & (cols < w)
|
||||
|
||||
"""Sample max noise values from 10m tiles around postcode representative points."""
|
||||
print(f"[{label}] Sampling max noise values from {len(tile_paths)} tiles...")
|
||||
noise_db = np.full(len(easting), np.nan, dtype=np.float32)
|
||||
valid_rows = rows[in_bounds]
|
||||
valid_cols = cols[in_bounds]
|
||||
sampled = noise_grid[valid_rows, valid_cols].astype(np.float32)
|
||||
radius_cells = max(0, math.ceil(POSTCODE_NOISE_RADIUS_M / RESOLUTION))
|
||||
filter_size = radius_cells * 2 + 1
|
||||
|
||||
# Mark nodata and zero (unmapped areas) as NaN.
|
||||
# Road/rail use nodata=-96, airport uses nodata=3.4e38.
|
||||
if raster_nodata is not None:
|
||||
sampled[np.isclose(sampled, np.float32(raster_nodata), rtol=1e-5)] = np.nan
|
||||
sampled[sampled == 0] = np.nan
|
||||
noise_db[in_bounds] = sampled
|
||||
for path in tile_paths:
|
||||
with rasterio.open(path) as dataset:
|
||||
bounds = dataset.bounds
|
||||
candidate_mask = (
|
||||
(easting >= bounds.left - POSTCODE_NOISE_RADIUS_M)
|
||||
& (easting <= bounds.right + POSTCODE_NOISE_RADIUS_M)
|
||||
& (northing >= bounds.bottom - POSTCODE_NOISE_RADIUS_M)
|
||||
& (northing <= bounds.top + POSTCODE_NOISE_RADIUS_M)
|
||||
)
|
||||
candidate_indices = np.flatnonzero(candidate_mask)
|
||||
if len(candidate_indices) == 0:
|
||||
continue
|
||||
|
||||
grid = dataset.read(1).astype(np.float32, copy=False)
|
||||
invalid = ~np.isfinite(grid) | (grid == 0)
|
||||
if dataset.nodata is not None:
|
||||
invalid |= np.isclose(
|
||||
grid, np.float32(dataset.nodata), rtol=1e-5, atol=1e-5
|
||||
)
|
||||
grid = grid.copy()
|
||||
grid[invalid] = -np.inf
|
||||
if filter_size > 1:
|
||||
grid = maximum_filter(
|
||||
grid, size=filter_size, mode="constant", cval=-np.inf
|
||||
)
|
||||
|
||||
rows, cols = rowcol(
|
||||
dataset.transform,
|
||||
easting[candidate_indices],
|
||||
northing[candidate_indices],
|
||||
)
|
||||
rows = np.asarray(rows)
|
||||
cols = np.asarray(cols)
|
||||
h, w = grid.shape
|
||||
in_bounds = (rows >= 0) & (rows < h) & (cols >= 0) & (cols < w)
|
||||
if not np.any(in_bounds):
|
||||
continue
|
||||
|
||||
sampled_indices = candidate_indices[in_bounds]
|
||||
sampled = grid[rows[in_bounds], cols[in_bounds]]
|
||||
valid = sampled != -np.inf
|
||||
if not np.any(valid):
|
||||
continue
|
||||
|
||||
sampled_indices = sampled_indices[valid]
|
||||
sampled = sampled[valid]
|
||||
existing = noise_db[sampled_indices]
|
||||
noise_db[sampled_indices] = np.where(
|
||||
np.isnan(existing), sampled, np.maximum(existing, sampled)
|
||||
)
|
||||
|
||||
valid_count = int(np.sum(~np.isnan(noise_db)))
|
||||
print(
|
||||
|
|
|
|||
|
|
@ -4,7 +4,9 @@ from tempfile import mkdtemp
|
|||
|
||||
import osmium
|
||||
import polars as pl
|
||||
from shapely import make_valid
|
||||
from shapely.geometry import Point
|
||||
from shapely.wkb import loads as load_wkb
|
||||
from tqdm import tqdm
|
||||
|
||||
from pipeline.utils.england_geometry import (
|
||||
|
|
@ -31,6 +33,21 @@ POI_TAG_KEYS: list[str] = [
|
|||
"tourism",
|
||||
"public_transport",
|
||||
]
|
||||
AREA_BUILDING_CATEGORIES = {"building/church", "building/university"}
|
||||
|
||||
|
||||
def _representative_lat_lon(geom, england_polygon) -> tuple[float, float] | None:
|
||||
if geom.is_empty:
|
||||
return None
|
||||
if not geom.is_valid:
|
||||
geom = make_valid(geom)
|
||||
if geom.is_empty:
|
||||
return None
|
||||
point = geom.representative_point()
|
||||
lat, lon = point.y, point.x
|
||||
if not england_polygon.contains(Point(lon, lat)):
|
||||
return None
|
||||
return lat, lon
|
||||
|
||||
|
||||
class POIHandler(osmium.SimpleHandler):
|
||||
|
|
@ -42,6 +59,7 @@ class POIHandler(osmium.SimpleHandler):
|
|||
self.poi_count = 0
|
||||
self._progress = progress
|
||||
self._england = england_polygon
|
||||
self._wkb_factory = osmium.geom.WKBFactory()
|
||||
|
||||
def _in_england(self, lat: float, lon: float) -> bool:
|
||||
# Fast bbox pre-filter, then precise polygon check
|
||||
|
|
@ -52,8 +70,18 @@ class POIHandler(osmium.SimpleHandler):
|
|||
return False
|
||||
return self._england.contains(Point(lon, lat))
|
||||
|
||||
def _match_tags(self, tags: osmium.osm.TagList) -> list[str]:
|
||||
return [f"{key}/{tags[key]}" for key in POI_TAG_KEYS if key in tags]
|
||||
def _match_tags(
|
||||
self, tags: osmium.osm.TagList, *, polygonal: bool = False
|
||||
) -> list[str]:
|
||||
categories = [f"{key}/{tags[key]}" for key in POI_TAG_KEYS if key in tags]
|
||||
if not polygonal:
|
||||
return categories
|
||||
return [
|
||||
category
|
||||
for category in categories
|
||||
if not category.startswith("building/")
|
||||
or category in AREA_BUILDING_CATEGORIES
|
||||
]
|
||||
|
||||
def _get_name(self, tags: osmium.osm.TagList) -> str:
|
||||
return tags.get("name:en", tags.get("name", ""))
|
||||
|
|
@ -89,6 +117,13 @@ class POIHandler(osmium.SimpleHandler):
|
|||
if len(self._batch) >= BATCH_SIZE:
|
||||
self._flush_batch()
|
||||
|
||||
def _point_from_area(self, area: osmium.osm.Area) -> tuple[float, float] | None:
|
||||
try:
|
||||
geom = load_wkb(self._wkb_factory.create_multipolygon(area), hex=True)
|
||||
except Exception:
|
||||
return None
|
||||
return _representative_lat_lon(geom, self._england)
|
||||
|
||||
def _tick(self) -> None:
|
||||
self._progress.update(1)
|
||||
|
||||
|
|
@ -103,6 +138,18 @@ class POIHandler(osmium.SimpleHandler):
|
|||
for category in categories:
|
||||
self._add_poi(f"n{n.id}", n.tags, category, lat, lon)
|
||||
|
||||
def area(self, a: osmium.osm.Area) -> None:
|
||||
self._tick()
|
||||
categories = self._match_tags(a.tags, polygonal=True)
|
||||
if not categories:
|
||||
return
|
||||
point = self._point_from_area(a)
|
||||
if point is None:
|
||||
return
|
||||
lat, lon = point
|
||||
for category in categories:
|
||||
self._add_poi(f"a{a.id}", a.tags, category, lat, lon)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
|
|
|
|||
|
|
@ -20,21 +20,27 @@ URL = "https://www.ons.gov.uk/file?uri=/economy/inflationandpriceindices/dataset
|
|||
# Local authority district codes in England
|
||||
LA_PREFIXES = ("E06", "E07", "E08", "E09")
|
||||
|
||||
# ONS PIPR uses newer ONS codes for the 2026 South Yorkshire boundary/code
|
||||
# update while IoD 2025 still carries the predecessor codes. Duplicate rows
|
||||
# under the IoD codes so downstream joins are complete without inventing rents.
|
||||
AREA_CODE_ALIASES = {
|
||||
"E08000038": "E08000016", # Barnsley
|
||||
"E08000039": "E08000019", # Sheffield
|
||||
}
|
||||
|
||||
def convert_to_parquet(xlsx_path: Path, parquet_path: Path) -> None:
|
||||
print("Reading PIPR Excel file (Table 1)...")
|
||||
|
||||
def _latest_rents_long(df: pl.DataFrame) -> pl.DataFrame:
|
||||
# Table 1 layout: row 0 = title, row 1 = column headers, row 2+ = data.
|
||||
# 40 columns in repeating blocks of 4 (index, monthly change, annual change,
|
||||
# rental price) for each category. Rental price columns (0-indexed):
|
||||
# 7 = All categories, 11 = One bed, 15 = Two bed, 19 = Three bed,
|
||||
# 23 = Four or more bed
|
||||
df = pl.read_excel(xlsx_path, sheet_name="Table 1", has_header=False)
|
||||
df = df.slice(2) # Skip title and header rows
|
||||
|
||||
df = df.select(
|
||||
pl.col("column_1").alias("time_period"),
|
||||
pl.col("column_2").alias("area_code"),
|
||||
pl.col("column_3").alias("area_name"),
|
||||
pl.col("column_12").cast(pl.Float32, strict=False).alias("rent_1bed"),
|
||||
pl.col("column_16").cast(pl.Float32, strict=False).alias("rent_2bed"),
|
||||
pl.col("column_20").cast(pl.Float32, strict=False).alias("rent_3bed"),
|
||||
|
|
@ -65,12 +71,30 @@ def convert_to_parquet(xlsx_path: Path, parquet_path: Path) -> None:
|
|||
frames.append(
|
||||
df.select(
|
||||
pl.col("area_code"),
|
||||
pl.col("area_name"),
|
||||
pl.col(col).alias("mean_monthly_rent"),
|
||||
pl.lit(bedrooms).cast(pl.UInt8).alias("bedrooms"),
|
||||
)
|
||||
)
|
||||
|
||||
combined = pl.concat(frames)
|
||||
alias_rows = []
|
||||
for source_code, alias_code in AREA_CODE_ALIASES.items():
|
||||
alias_rows.append(
|
||||
combined.filter(pl.col("area_code") == source_code).with_columns(
|
||||
pl.lit(alias_code).alias("area_code")
|
||||
)
|
||||
)
|
||||
if alias_rows:
|
||||
combined = pl.concat([combined, *alias_rows])
|
||||
|
||||
return combined.unique(["area_code", "bedrooms"], keep="first")
|
||||
|
||||
|
||||
def convert_to_parquet(xlsx_path: Path, parquet_path: Path) -> None:
|
||||
print("Reading PIPR Excel file (Table 1)...")
|
||||
raw = pl.read_excel(xlsx_path, sheet_name="Table 1", has_header=False)
|
||||
combined = _latest_rents_long(raw)
|
||||
|
||||
print(f"Combined: {combined.shape}")
|
||||
print(f"Non-null rents: {combined['mean_monthly_rent'].drop_nulls().len()}")
|
||||
|
|
|
|||
22
pipeline/download/test_election_results.py
Normal file
22
pipeline/download/test_election_results.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.download.election_results import _convert_results
|
||||
|
||||
|
||||
def test_convert_results_fills_parties_that_did_not_stand_with_zero():
|
||||
raw = pl.DataFrame(
|
||||
{
|
||||
"Constituency geographic code": ["E14000001", "E14000001"],
|
||||
"Main party name": ["Labour", "Conservative"],
|
||||
"Candidate result position": [1, 2],
|
||||
"Election valid vote count": [1000, 1000],
|
||||
"Electorate": [2000, 2000],
|
||||
"Candidate vote count": [600, 400],
|
||||
}
|
||||
)
|
||||
|
||||
result = _convert_results(raw)
|
||||
|
||||
assert result.select("% Labour", "% Conservative", "% Reform UK").to_dicts() == [
|
||||
{"% Labour": 60.0, "% Conservative": 40.0, "% Reform UK": 0.0}
|
||||
]
|
||||
37
pipeline/download/test_ethnicity.py
Normal file
37
pipeline/download/test_ethnicity.py
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.download.ethnicity import _ethnicity_percentages
|
||||
|
||||
|
||||
def test_ethnicity_percentages_recombines_predecessor_lads_by_population():
|
||||
rows = []
|
||||
for code, white, indian in [
|
||||
("E07000026", 80, 20),
|
||||
("E07000028", 10, 90),
|
||||
]:
|
||||
total = white + indian
|
||||
rows.extend(
|
||||
[
|
||||
{
|
||||
"Geography_code": code,
|
||||
"Ethnicity_type": "ONS 2021 19+1",
|
||||
"Ethnicity": "White British",
|
||||
"Ethnic Population": white,
|
||||
"Value1": white / total * 100,
|
||||
},
|
||||
{
|
||||
"Geography_code": code,
|
||||
"Ethnicity_type": "ONS 2021 19+1",
|
||||
"Ethnicity": "Indian",
|
||||
"Ethnic Population": indian,
|
||||
"Value1": indian / total * 100,
|
||||
},
|
||||
]
|
||||
)
|
||||
|
||||
result = _ethnicity_percentages(pl.DataFrame(rows))
|
||||
|
||||
cumberland = result.filter(pl.col("Geography_code") == "E06000063")
|
||||
assert cumberland.select("% White", "% South Asian").to_dicts() == [
|
||||
{"% White": 45.0, "% South Asian": 55.0}
|
||||
]
|
||||
|
|
@ -1,5 +1,8 @@
|
|||
import httpx
|
||||
import numpy as np
|
||||
import pytest
|
||||
import rasterio
|
||||
from rasterio.transform import from_origin
|
||||
|
||||
from pipeline.download import noise
|
||||
|
||||
|
|
@ -50,6 +53,21 @@ def test_download_tile_reports_unsplittable_failure(monkeypatch, tmp_path):
|
|||
assert failures == [(0, 0, 100, 100)]
|
||||
|
||||
|
||||
def test_download_tile_treats_non_tiff_response_as_failure(monkeypatch, tmp_path):
|
||||
monkeypatch.setattr(noise, "MAX_RETRIES", 1)
|
||||
monkeypatch.setattr(noise, "MIN_TILE_SIZE", 100)
|
||||
|
||||
def fake_fetch_tile_bytes(*args, **kwargs):
|
||||
raise noise.NoGeoTiffError("xml exception")
|
||||
|
||||
monkeypatch.setattr(noise, "_fetch_tile_bytes", fake_fetch_tile_bytes)
|
||||
|
||||
paths, failures = noise._download_tile("base", "coverage", 0, 0, 100, 100, tmp_path)
|
||||
|
||||
assert paths == []
|
||||
assert failures == [(0, 0, 100, 100)]
|
||||
|
||||
|
||||
def test_download_raster_tolerates_missing_tiles_when_allowed(monkeypatch, tmp_path):
|
||||
monkeypatch.setattr(noise, "BNG_MIN_E", 0)
|
||||
monkeypatch.setattr(noise, "BNG_MAX_E", 100)
|
||||
|
|
@ -87,3 +105,42 @@ def test_download_raster_raises_on_missing_strict_tiles(monkeypatch, tmp_path):
|
|||
|
||||
with pytest.raises(RuntimeError, match=r"\[Road\] Failed to download"):
|
||||
noise.download_raster(tmp_path, "base", "coverage", "Road")
|
||||
|
||||
|
||||
def test_sample_noise_at_postcodes_uses_local_maximum(monkeypatch, tmp_path):
|
||||
monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 15)
|
||||
monkeypatch.setattr(noise, "RESOLUTION", 10)
|
||||
tile_path = tmp_path / "noise.tif"
|
||||
data = np.array(
|
||||
[
|
||||
[0, 0, 0, 0, 0],
|
||||
[0, 70, 0, 0, 0],
|
||||
[0, 0, 55, 0, 0],
|
||||
[0, 0, 0, 0, 0],
|
||||
[0, 0, 0, 0, 0],
|
||||
],
|
||||
dtype=np.float32,
|
||||
)
|
||||
with rasterio.open(
|
||||
tile_path,
|
||||
"w",
|
||||
driver="GTiff",
|
||||
height=data.shape[0],
|
||||
width=data.shape[1],
|
||||
count=1,
|
||||
dtype=data.dtype,
|
||||
crs="EPSG:27700",
|
||||
transform=from_origin(0, 50, 10, 10),
|
||||
nodata=0,
|
||||
) as dataset:
|
||||
dataset.write(data, 1)
|
||||
|
||||
result = noise.sample_noise_at_postcodes(
|
||||
[tile_path],
|
||||
easting=np.array([25.0]),
|
||||
northing=np.array([25.0]),
|
||||
label="Road",
|
||||
col_name="road_noise_lden_db",
|
||||
)
|
||||
|
||||
assert result.to_list() == [70.0]
|
||||
|
|
|
|||
15
pipeline/download/test_pois.py
Normal file
15
pipeline/download/test_pois.py
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
from shapely.geometry import Polygon, box
|
||||
|
||||
from pipeline.download.pois import _representative_lat_lon
|
||||
|
||||
|
||||
def test_representative_lat_lon_uses_point_inside_polygon():
|
||||
england = box(-1, 50, 1, 52)
|
||||
poi_area = Polygon([(-0.1, 51.5), (0.1, 51.5), (0.1, 51.6), (-0.1, 51.6)])
|
||||
|
||||
lat_lon = _representative_lat_lon(poi_area, england)
|
||||
|
||||
assert lat_lon is not None
|
||||
lat, lon = lat_lon
|
||||
assert 51.5 <= lat <= 51.6
|
||||
assert -0.1 <= lon <= 0.1
|
||||
24
pipeline/download/test_rental_prices.py
Normal file
24
pipeline/download/test_rental_prices.py
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.download.rental_prices import _latest_rents_long
|
||||
|
||||
|
||||
def test_latest_rents_long_adds_iod_alias_codes_for_south_yorkshire():
|
||||
raw = pl.DataFrame(
|
||||
{
|
||||
"column_1": ["title", "header", "2026-02-01 00:00:00"],
|
||||
"column_2": ["", "", "E08000038"],
|
||||
"column_3": ["", "", "Barnsley"],
|
||||
"column_12": ["", "", "486"],
|
||||
"column_16": ["", "", "595"],
|
||||
"column_20": ["", "", "705"],
|
||||
"column_24": ["", "", "900"],
|
||||
}
|
||||
)
|
||||
|
||||
result = _latest_rents_long(raw).filter(pl.col("bedrooms") == 1).sort("area_code")
|
||||
|
||||
assert result.select("area_code", "mean_monthly_rent").to_dicts() == [
|
||||
{"area_code": "E08000016", "mean_monthly_rent": 486.0},
|
||||
{"area_code": "E08000038", "mean_monthly_rent": 486.0},
|
||||
]
|
||||
|
|
@ -5,6 +5,7 @@ from pathlib import Path
|
|||
import polars as pl
|
||||
|
||||
STREET_CRIME_CSV_RE = re.compile(r"^\d{4}-\d{2}-.+-street\.csv$")
|
||||
MONTH_RE = r"^\d{4}-\d{2}$"
|
||||
|
||||
|
||||
def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
|
||||
|
|
@ -37,16 +38,45 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
|
|||
},
|
||||
).select("LSOA code", "Crime type", "Month")
|
||||
|
||||
# Extract year, count crimes per LSOA / year / crime type
|
||||
valid_month_expr = pl.col("Month").str.contains(MONTH_RE)
|
||||
valid_months = (
|
||||
df.filter(valid_month_expr)
|
||||
.select("Month")
|
||||
.unique()
|
||||
.collect(engine="streaming")["Month"]
|
||||
.sort()
|
||||
.to_list()
|
||||
)
|
||||
if not valid_months:
|
||||
raise ValueError(f"No valid crime months found in {crime_dir}")
|
||||
|
||||
valid_month_count = len(valid_months)
|
||||
print(
|
||||
f"Using {valid_month_count} valid data months "
|
||||
f"({valid_months[0]} to {valid_months[-1]})"
|
||||
)
|
||||
|
||||
# Count monthly incidents, then annualise over every valid month in the dataset.
|
||||
yearly_counts = (
|
||||
df.filter(pl.col("LSOA code").is_not_null() & (pl.col("LSOA code") != ""))
|
||||
.with_columns(pl.col("Month").str.slice(0, 4).alias("year"))
|
||||
.group_by("LSOA code", "year", "Crime type")
|
||||
df.filter(
|
||||
valid_month_expr
|
||||
& pl.col("LSOA code").is_not_null()
|
||||
& (pl.col("LSOA code") != "")
|
||||
& pl.col("Crime type").is_not_null()
|
||||
& (pl.col("Crime type") != "")
|
||||
)
|
||||
.group_by("LSOA code", "Month", "Crime type")
|
||||
.agg(pl.len().alias("count"))
|
||||
.group_by("LSOA code", "Crime type")
|
||||
.agg(pl.col("count").mean().round(1).alias("yearly_avg"))
|
||||
.agg(
|
||||
(pl.col("count").sum() / pl.lit(valid_month_count) * 12)
|
||||
.round(1)
|
||||
.alias("yearly_avg")
|
||||
)
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
if yearly_counts.is_empty():
|
||||
raise ValueError(f"No valid crime rows found in {crime_dir}")
|
||||
|
||||
print(f"Crime types: {sorted(yearly_counts['Crime type'].unique().to_list())}")
|
||||
|
||||
|
|
|
|||
94
pipeline/transform/price_estimation/test_knn.py
Normal file
94
pipeline/transform/price_estimation/test_knn.py
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
from datetime import date
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform.price_estimation.estimate import guarded_blend_estimates
|
||||
from pipeline.transform.price_estimation.knn import build_knn_pool, knn_median_psm
|
||||
from pipeline.transform.price_estimation.utils import TYPE_GROUPS, type_group_expr
|
||||
|
||||
|
||||
def _flat_index() -> pl.DataFrame:
|
||||
return pl.DataFrame(
|
||||
{
|
||||
"sector": ["AA1 1", "AA1 1"],
|
||||
"type_group": ["Detached", "All"],
|
||||
"year": [2026, 2026],
|
||||
"log_index": [0.0, 0.0],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def test_knn_excludes_same_sale_and_uses_stable_comparables():
|
||||
sale_date = date(2026, 1, 1)
|
||||
rows = [
|
||||
{
|
||||
"Postcode": "AA1 1AA",
|
||||
"Property type": "Detached",
|
||||
"lat": 51.5000,
|
||||
"lon": -0.1000,
|
||||
"Total floor area (sqm)": 80.0,
|
||||
"Last known price": 900_000.0,
|
||||
"Date of last transaction": sale_date,
|
||||
}
|
||||
]
|
||||
rows.extend(
|
||||
{
|
||||
"Postcode": "AA1 1AA",
|
||||
"Property type": "Detached",
|
||||
"lat": 51.5001 + i * 0.00001,
|
||||
"lon": -0.1001,
|
||||
"Total floor area (sqm)": 20.0,
|
||||
"Last known price": 900_000.0,
|
||||
"Date of last transaction": sale_date,
|
||||
}
|
||||
for i in range(5)
|
||||
)
|
||||
rows.extend(
|
||||
{
|
||||
"Postcode": f"AA1 1B{i}",
|
||||
"Property type": "Detached",
|
||||
"lat": 51.5010 + i * 0.00001,
|
||||
"lon": -0.1010,
|
||||
"Total floor area (sqm)": 80.0,
|
||||
"Last known price": 200_000.0,
|
||||
"Date of last transaction": sale_date,
|
||||
}
|
||||
for i in range(5)
|
||||
)
|
||||
df = pl.DataFrame(rows)
|
||||
|
||||
trees = build_knn_pool(df.lazy(), _flat_index(), 2026.0)
|
||||
psm = knn_median_psm(
|
||||
trees,
|
||||
lat=np.array([51.5000]),
|
||||
lon=np.array([-0.1000]),
|
||||
type_groups=np.array(["Detached"]),
|
||||
postcodes=np.array(["AA1 1AA"]),
|
||||
last_prices=np.array([900_000.0]),
|
||||
last_sale_dates=np.array(
|
||||
[sale_date.toordinal() - date(1970, 1, 1).toordinal()]
|
||||
),
|
||||
)
|
||||
|
||||
assert psm[0] == 2_500.0
|
||||
|
||||
|
||||
def test_guarded_blend_routes_unstable_knn_to_index_and_caps_uplift():
|
||||
blended = guarded_blend_estimates(
|
||||
index_est=np.array([120_000.0, 1_000_000.0]),
|
||||
knn_est=np.array([5_000_000.0, 1_000_000.0]),
|
||||
last_prices=np.array([100_000.0, 100_000.0]),
|
||||
)
|
||||
|
||||
assert blended[0] == 120_000.0
|
||||
assert blended[1] == 600_000.0
|
||||
|
||||
|
||||
def test_bungalow_is_not_a_dead_price_index_type_group():
|
||||
df = pl.DataFrame({"Property type": ["Bungalow", "Other"]}).with_columns(
|
||||
type_group_expr()
|
||||
)
|
||||
|
||||
assert "Bungalow" not in TYPE_GROUPS
|
||||
assert df["type_group"].to_list() == [None, None]
|
||||
|
|
@ -44,4 +44,76 @@ def test_transform_crime_reads_only_street_crime_csvs(tmp_path):
|
|||
|
||||
result = pl.read_parquet(output).to_dicts()
|
||||
|
||||
assert result == [{"LSOA code": "E01000001", "Burglary (avg/yr)": 2.0}]
|
||||
assert result == [{"LSOA code": "E01000001", "Burglary (avg/yr)": 24.0}]
|
||||
|
||||
|
||||
def test_transform_crime_annualises_over_all_valid_months(tmp_path):
|
||||
crime_dir = tmp_path / "crime"
|
||||
jan_dir = crime_dir / "2024-01"
|
||||
feb_dir = crime_dir / "2024-02"
|
||||
jan_dir.mkdir(parents=True)
|
||||
feb_dir.mkdir(parents=True)
|
||||
|
||||
header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
|
||||
(jan_dir / "2024-01-test-force-street.csv").write_text(
|
||||
"\n".join(
|
||||
[
|
||||
header,
|
||||
"1,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
|
||||
"2,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
|
||||
"3,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000002,Other LSOA,Robbery,Under investigation,",
|
||||
]
|
||||
)
|
||||
+ "\n"
|
||||
)
|
||||
(feb_dir / "2024-02-test-force-street.csv").write_text(
|
||||
"\n".join(
|
||||
[
|
||||
header,
|
||||
"4,2024-02,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000002,Other LSOA,Robbery,Under investigation,",
|
||||
]
|
||||
)
|
||||
+ "\n"
|
||||
)
|
||||
|
||||
output = tmp_path / "crime.parquet"
|
||||
transform_crime(crime_dir, output)
|
||||
|
||||
result = pl.read_parquet(output).sort("LSOA code").to_dicts()
|
||||
|
||||
assert result == [
|
||||
{
|
||||
"LSOA code": "E01000001",
|
||||
"Burglary (avg/yr)": 12.0,
|
||||
"Robbery (avg/yr)": 0.0,
|
||||
},
|
||||
{
|
||||
"LSOA code": "E01000002",
|
||||
"Burglary (avg/yr)": 0.0,
|
||||
"Robbery (avg/yr)": 12.0,
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def test_transform_crime_fails_without_valid_months(tmp_path):
|
||||
crime_dir = tmp_path / "crime"
|
||||
month_dir = crime_dir / "2024-01"
|
||||
month_dir.mkdir(parents=True)
|
||||
(month_dir / "2024-01-test-force-street.csv").write_text(
|
||||
"\n".join(
|
||||
[
|
||||
"Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context",
|
||||
"1,,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
|
||||
]
|
||||
)
|
||||
+ "\n"
|
||||
)
|
||||
|
||||
output = tmp_path / "crime.parquet"
|
||||
|
||||
try:
|
||||
transform_crime(crime_dir, output)
|
||||
except ValueError as exc:
|
||||
assert "No valid crime months" in str(exc)
|
||||
else:
|
||||
raise AssertionError("Expected ValueError")
|
||||
|
|
|
|||
|
|
@ -136,17 +136,17 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
|
|||
price_paid_path = tmp_path / "price-paid.parquet"
|
||||
pl.DataFrame(
|
||||
{
|
||||
"price": [250_000],
|
||||
"date_of_transfer": [date(2024, 2, 3)],
|
||||
"property_type": ["T"],
|
||||
"postcode": ["AA1 1AA"],
|
||||
"paon": ["1"],
|
||||
"saon": [None],
|
||||
"street": ["Example Street"],
|
||||
"locality": [None],
|
||||
"town_city": ["Exampletown"],
|
||||
"duration": ["F"],
|
||||
"old_new": ["N"],
|
||||
"price": [200_000, 250_000],
|
||||
"date_of_transfer": [date(2020, 2, 3), date(2024, 2, 3)],
|
||||
"property_type": ["T", "T"],
|
||||
"postcode": ["AA1 1AA", "AA1 1AA"],
|
||||
"paon": ["1", "1"],
|
||||
"saon": [None, None],
|
||||
"street": ["Example-Street", "Example Street"],
|
||||
"locality": [None, None],
|
||||
"town_city": ["Exampletown", "Exampletown"],
|
||||
"duration": ["F", "F"],
|
||||
"old_new": ["N", "N"],
|
||||
}
|
||||
).write_parquet(price_paid_path)
|
||||
|
||||
|
|
@ -172,3 +172,85 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
|
|||
}
|
||||
]
|
||||
assert df.get_column("renovation_history").list.len().to_list() == [1]
|
||||
assert df.get_column("historical_prices").list.len().to_list() == [2]
|
||||
|
||||
|
||||
def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
|
||||
zip_path = tmp_path / "domestic-csv.zip"
|
||||
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
|
||||
csv_buffer = io.StringIO()
|
||||
writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
|
||||
writer.writeheader()
|
||||
writer.writerow(_row())
|
||||
archive.writestr("certificates-2024.csv", csv_buffer.getvalue())
|
||||
|
||||
price_paid_path = tmp_path / "price-paid.parquet"
|
||||
pl.DataFrame(
|
||||
{
|
||||
"price": [250_000, 300_000],
|
||||
"date_of_transfer": [date(2024, 2, 3), date(2024, 2, 4)],
|
||||
"property_type": ["T", "T"],
|
||||
"postcode": ["AA1 1AA", ""],
|
||||
"paon": ["1", "2"],
|
||||
"saon": [None, None],
|
||||
"street": ["Example Street", "Example Street"],
|
||||
"locality": [None, None],
|
||||
"town_city": ["Exampletown", "Exampletown"],
|
||||
"duration": ["F", "F"],
|
||||
"old_new": ["N", "N"],
|
||||
}
|
||||
).write_parquet(price_paid_path)
|
||||
|
||||
output_path = tmp_path / "epc-pp.parquet"
|
||||
_run(zip_path, price_paid_path, output_path, tmp_path)
|
||||
|
||||
df = pl.read_parquet(output_path)
|
||||
|
||||
assert df["postcode"].to_list() == ["AA1 1AA"]
|
||||
|
||||
|
||||
def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path):
|
||||
zip_path = tmp_path / "domestic-csv.zip"
|
||||
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
|
||||
csv_buffer = io.StringIO()
|
||||
writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
|
||||
writer.writeheader()
|
||||
writer.writerow(_row(address="1 Totally Different Road"))
|
||||
archive.writestr("certificates-2024.csv", csv_buffer.getvalue())
|
||||
|
||||
price_paid_path = tmp_path / "price-paid.parquet"
|
||||
pl.DataFrame(
|
||||
{
|
||||
"price": [250_000],
|
||||
"date_of_transfer": [date(2024, 2, 3)],
|
||||
"property_type": ["T"],
|
||||
"postcode": ["AA1 1AA"],
|
||||
"paon": ["1"],
|
||||
"saon": [None],
|
||||
"street": ["Example Street"],
|
||||
"locality": [None],
|
||||
"town_city": ["Exampletown"],
|
||||
"duration": ["F"],
|
||||
"old_new": ["N"],
|
||||
}
|
||||
).write_parquet(price_paid_path)
|
||||
|
||||
output_path = tmp_path / "epc-pp.parquet"
|
||||
_run(zip_path, price_paid_path, output_path, tmp_path)
|
||||
|
||||
df = pl.read_parquet(output_path)
|
||||
|
||||
assert df.height == 1
|
||||
assert df.select(
|
||||
"pp_address",
|
||||
"epc_address",
|
||||
"total_floor_area",
|
||||
"current_energy_rating",
|
||||
).to_dicts() == [
|
||||
{
|
||||
"pp_address": "1 Example Street",
|
||||
"epc_address": None,
|
||||
"total_floor_area": None,
|
||||
"current_energy_rating": None,
|
||||
}
|
||||
]
|
||||
|
|
|
|||
|
|
@ -1,8 +1,14 @@
|
|||
import polars as pl
|
||||
import pytest
|
||||
|
||||
from pipeline.transform.merge import (
|
||||
_AREA_COLUMNS,
|
||||
TREE_DENSITY_FEATURE,
|
||||
_is_dynamic_poi_metric_column,
|
||||
_less_deprived_percentile_expr,
|
||||
_tree_density_by_postcode,
|
||||
_validate_lad_source_coverage,
|
||||
_validate_property_postcodes,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -36,3 +42,103 @@ def test_dynamic_poi_metric_columns_are_area_level() -> None:
|
|||
assert _is_dynamic_poi_metric_column("Number of amenities (Cafe) within 2km")
|
||||
assert _is_dynamic_poi_metric_column("Number of amenities (Cafe) within 5km")
|
||||
assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")
|
||||
|
||||
|
||||
def test_country_code_is_kept_in_postcode_area_columns() -> None:
|
||||
assert "ctry25cd" in _AREA_COLUMNS
|
||||
|
||||
|
||||
def test_validate_property_postcodes_rejects_blank_rows() -> None:
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
"Postcode": ["AA1 1AA", ""],
|
||||
"Address per Property Register": ["1 Example Street", "2 Example Street"],
|
||||
"Last known price": [100_000, 200_000],
|
||||
}
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match="Property rows missing a postcode"):
|
||||
_validate_property_postcodes(df)
|
||||
|
||||
|
||||
def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
|
||||
tmp_path,
|
||||
) -> None:
|
||||
iod_path = tmp_path / "iod.parquet"
|
||||
ethnicity_path = tmp_path / "ethnicity.parquet"
|
||||
rental_path = tmp_path / "rental.parquet"
|
||||
pl.DataFrame(
|
||||
{
|
||||
"Local Authority District code (2024)": [
|
||||
"E08000016",
|
||||
"E06000053",
|
||||
"E09000001",
|
||||
],
|
||||
"Local Authority District name (2024)": [
|
||||
"Barnsley",
|
||||
"Isles of Scilly",
|
||||
"City of London",
|
||||
],
|
||||
}
|
||||
).write_parquet(iod_path)
|
||||
pl.DataFrame(
|
||||
{"Geography_code": ["E08000016", "E06000053", "E09000001"]}
|
||||
).write_parquet(ethnicity_path)
|
||||
pl.DataFrame({"area_code": ["E08000016"], "bedrooms": [1]}).write_parquet(
|
||||
rental_path
|
||||
)
|
||||
|
||||
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
|
||||
|
||||
|
||||
def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
|
||||
iod_path = tmp_path / "iod.parquet"
|
||||
ethnicity_path = tmp_path / "ethnicity.parquet"
|
||||
rental_path = tmp_path / "rental.parquet"
|
||||
pl.DataFrame(
|
||||
{
|
||||
"Local Authority District code (2024)": ["E08000016"],
|
||||
"Local Authority District name (2024)": ["Barnsley"],
|
||||
}
|
||||
).write_parquet(iod_path)
|
||||
pl.DataFrame({"Geography_code": ["E08000016"]}).write_parquet(ethnicity_path)
|
||||
pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]}).write_parquet(
|
||||
rental_path
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match="Rental data is missing"):
|
||||
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
|
||||
|
||||
|
||||
def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
|
||||
path = tmp_path / "tree_density_by_postcode.parquet"
|
||||
pl.DataFrame(
|
||||
{
|
||||
"postcode": ["AB1 2CD", "EF3 4GH"],
|
||||
"Tree canopy density percentile within 50m": [12.5, 99.0],
|
||||
}
|
||||
).write_parquet(path)
|
||||
|
||||
result = _tree_density_by_postcode(path).collect().sort("postcode")
|
||||
|
||||
assert result.columns == ["postcode", TREE_DENSITY_FEATURE]
|
||||
assert result[TREE_DENSITY_FEATURE].to_list() == [12.5, 99.0]
|
||||
assert result.schema[TREE_DENSITY_FEATURE] == pl.Float32
|
||||
|
||||
|
||||
def test_tree_density_by_postcode_requires_postcode_and_density_columns(
|
||||
tmp_path,
|
||||
) -> None:
|
||||
path = tmp_path / "tree_density_by_postcode.parquet"
|
||||
pl.DataFrame({"postcode": ["AB1 2CD"], "unrelated": [1.0]}).write_parquet(path)
|
||||
|
||||
with pytest.raises(ValueError, match="must contain column"):
|
||||
_tree_density_by_postcode(path)
|
||||
|
||||
missing_postcode_path = tmp_path / "missing_postcode.parquet"
|
||||
pl.DataFrame({"Tree canopy density percentile within 50m": [12.5]}).write_parquet(
|
||||
missing_postcode_path
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match="missing required column: postcode"):
|
||||
_tree_density_by_postcode(missing_postcode_path)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
from .download import download, extract_zip
|
||||
from .fuzzy_join import fuzzy_join_on_postcode
|
||||
from .fuzzy_join import (
|
||||
fuzzy_join_on_postcode,
|
||||
normalize_address_key,
|
||||
normalize_postcode_key,
|
||||
)
|
||||
from .haversine import haversine_km, haversine_km_expr
|
||||
from .poi_counts import count_pois_per_postcode
|
||||
from .postcode_mapping import build_postcode_mapping
|
||||
|
|
@ -8,6 +12,8 @@ __all__ = [
|
|||
"download",
|
||||
"extract_zip",
|
||||
"fuzzy_join_on_postcode",
|
||||
"normalize_address_key",
|
||||
"normalize_postcode_key",
|
||||
"haversine_km",
|
||||
"haversine_km_expr",
|
||||
"count_pois_per_postcode",
|
||||
|
|
|
|||
|
|
@ -10,15 +10,31 @@ from thefuzz import fuzz
|
|||
from tqdm import tqdm
|
||||
|
||||
_NUMBER_RE = re.compile(r"\d+")
|
||||
_POSTCODE_RE = r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"
|
||||
MIN_FUZZY_SCORE = 60
|
||||
|
||||
|
||||
def _normalize(s: pl.Expr) -> pl.Expr:
|
||||
return (
|
||||
s.str.to_uppercase()
|
||||
.str.replace_all(r"[,.\-]", " ")
|
||||
def normalize_address_key(s: pl.Expr) -> pl.Expr:
|
||||
normalized = (
|
||||
s.cast(pl.String)
|
||||
.str.to_uppercase()
|
||||
.str.replace_all(r"[^0-9A-Z]+", " ")
|
||||
.str.replace_all(r"\s+", " ")
|
||||
.str.strip_chars()
|
||||
)
|
||||
return pl.when(normalized.str.contains(r"[A-Z]")).then(normalized).otherwise(None)
|
||||
|
||||
|
||||
def normalize_postcode_key(s: pl.Expr) -> pl.Expr:
|
||||
normalized = (
|
||||
s.cast(pl.String)
|
||||
.str.to_uppercase()
|
||||
.str.replace_all(r"[^A-Z0-9]+", "")
|
||||
.str.strip_chars()
|
||||
)
|
||||
return (
|
||||
pl.when(normalized.str.contains(_POSTCODE_RE)).then(normalized).otherwise(None)
|
||||
)
|
||||
|
||||
|
||||
def fuzzy_join_on_postcode(
|
||||
|
|
@ -28,6 +44,7 @@ def fuzzy_join_on_postcode(
|
|||
right_address_col: str,
|
||||
left_postcode_col: str,
|
||||
right_postcode_col: str,
|
||||
min_score: int = MIN_FUZZY_SCORE,
|
||||
) -> pl.LazyFrame:
|
||||
"""Fuzzy join two LazyFrames by matching addresses within postcode buckets.
|
||||
|
||||
|
|
@ -54,11 +71,10 @@ def fuzzy_join_on_postcode(
|
|||
pl.scan_parquet(left_path)
|
||||
.select(
|
||||
"_left_idx",
|
||||
_normalize(pl.col(left_address_col)).alias("_left_address"),
|
||||
pl.col(left_postcode_col)
|
||||
.str.strip_chars()
|
||||
.str.to_uppercase()
|
||||
.alias("_left_postcode"),
|
||||
normalize_address_key(pl.col(left_address_col)).alias("_left_address"),
|
||||
normalize_postcode_key(pl.col(left_postcode_col)).alias(
|
||||
"_left_postcode"
|
||||
),
|
||||
)
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
|
|
@ -67,11 +83,12 @@ def fuzzy_join_on_postcode(
|
|||
pl.scan_parquet(right_path)
|
||||
.select(
|
||||
"_right_idx",
|
||||
_normalize(pl.col(right_address_col)).alias("_right_address"),
|
||||
pl.col(right_postcode_col)
|
||||
.str.strip_chars()
|
||||
.str.to_uppercase()
|
||||
.alias("_right_postcode"),
|
||||
normalize_address_key(pl.col(right_address_col)).alias(
|
||||
"_right_address"
|
||||
),
|
||||
normalize_postcode_key(pl.col(right_postcode_col)).alias(
|
||||
"_right_postcode"
|
||||
),
|
||||
)
|
||||
.unique(subset=["_right_address", "_right_postcode"], keep="first")
|
||||
.collect(engine="streaming")
|
||||
|
|
@ -101,7 +118,7 @@ def fuzzy_join_on_postcode(
|
|||
|
||||
# Build tasks for each postcode bucket
|
||||
tasks = [
|
||||
(left_entries, right_by_postcode[postcode])
|
||||
(left_entries, right_by_postcode[postcode], min_score)
|
||||
for postcode, left_entries in left_by_postcode.items()
|
||||
if postcode in right_by_postcode
|
||||
]
|
||||
|
|
@ -182,15 +199,16 @@ def _numbers_compatible(a: str, b: str) -> bool:
|
|||
|
||||
|
||||
def _score_bucket(
|
||||
args: tuple[list[tuple[int, str]], list[tuple[int, str]]],
|
||||
args: tuple[list[tuple[int, str]], list[tuple[int, str]], int],
|
||||
) -> list[tuple[int, int, int]]:
|
||||
"""Score all address pairs within a single postcode bucket."""
|
||||
left_entries, right_entries = args
|
||||
left_entries, right_entries, min_score = args
|
||||
pairs = []
|
||||
for left_row, left_address in left_entries:
|
||||
for right_row, right_address in right_entries:
|
||||
if not _numbers_compatible(left_address, right_address):
|
||||
continue
|
||||
score = fuzz.token_sort_ratio(left_address, right_address)
|
||||
pairs.append((score, left_row, right_row))
|
||||
if score >= min_score:
|
||||
pairs.append((score, left_row, right_row))
|
||||
return pairs
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.utils import fuzzy_join_on_postcode
|
||||
from pipeline.utils import fuzzy_join_on_postcode, normalize_postcode_key
|
||||
|
||||
|
||||
def test_fuzzy_join_on_postcode_matches_addresses_within_postcode():
|
||||
|
|
@ -132,3 +132,22 @@ def test_fuzzy_join_on_postcode_rejects_blank_and_invalid_match_keys():
|
|||
{"left_id": "number_only", "right_address": None},
|
||||
{"left_id": "valid", "right_address": "10 High Street"},
|
||||
]
|
||||
|
||||
|
||||
def test_normalize_postcode_key_requires_full_postcode():
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
"postcode": [
|
||||
" SW1A 1AA ",
|
||||
"sw1a-1aa",
|
||||
"",
|
||||
"SW1A",
|
||||
"12345",
|
||||
"not a postcode",
|
||||
]
|
||||
}
|
||||
)
|
||||
|
||||
result = df.select(normalize_postcode_key(pl.col("postcode")).alias("key"))
|
||||
|
||||
assert result["key"].to_list() == ["SW1A1AA", "SW1A1AA", None, None, None, None]
|
||||
|
|
|
|||
152
pipeline/validate_outputs.py
Normal file
152
pipeline/validate_outputs.py
Normal file
|
|
@ -0,0 +1,152 @@
|
|||
"""Validate pipeline outputs before Make stamps are touched."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
|
||||
|
||||
def _failures_for_file(path: Path) -> list[str]:
|
||||
if not path.exists():
|
||||
return [f"{path}: missing"]
|
||||
if not path.is_file():
|
||||
return [f"{path}: not a file"]
|
||||
try:
|
||||
size = path.stat().st_size
|
||||
except OSError as exc:
|
||||
return [f"{path}: unreadable metadata: {exc}"]
|
||||
if size <= 0:
|
||||
return [f"{path}: empty file"]
|
||||
return []
|
||||
|
||||
|
||||
def _failures_for_dir(path: Path) -> list[str]:
|
||||
if not path.exists():
|
||||
return [f"{path}: missing"]
|
||||
if not path.is_dir():
|
||||
return [f"{path}: not a directory"]
|
||||
try:
|
||||
if not any(not child.name.startswith(".") for child in path.iterdir()):
|
||||
return [f"{path}: empty directory"]
|
||||
except OSError as exc:
|
||||
return [f"{path}: unreadable directory: {exc}"]
|
||||
return []
|
||||
|
||||
|
||||
def _failures_for_parquet(path: Path) -> list[str]:
|
||||
failures = _failures_for_file(path)
|
||||
if failures:
|
||||
return failures
|
||||
try:
|
||||
row_count = pl.scan_parquet(path).select(pl.len()).collect().item()
|
||||
except Exception as exc:
|
||||
return [f"{path}: unreadable parquet: {exc}"]
|
||||
if row_count <= 0:
|
||||
return [f"{path}: parquet has no rows"]
|
||||
return []
|
||||
|
||||
|
||||
def _failures_for_zip(path: Path) -> list[str]:
|
||||
failures = _failures_for_file(path)
|
||||
if failures:
|
||||
return failures
|
||||
if not zipfile.is_zipfile(path):
|
||||
return [f"{path}: unreadable zip"]
|
||||
try:
|
||||
with zipfile.ZipFile(path) as archive:
|
||||
if not archive.namelist():
|
||||
return [f"{path}: zip has no members"]
|
||||
except Exception as exc:
|
||||
return [f"{path}: unreadable zip: {exc}"]
|
||||
return []
|
||||
|
||||
|
||||
def _split_glob(spec: str) -> tuple[Path, str]:
|
||||
if "::" not in spec:
|
||||
raise argparse.ArgumentTypeError(
|
||||
f"{spec!r} must use BASE::PATTERN, for example data::**/*.csv"
|
||||
)
|
||||
base, pattern = spec.split("::", 1)
|
||||
if not base or not pattern:
|
||||
raise argparse.ArgumentTypeError(f"{spec!r} must include BASE and PATTERN")
|
||||
return Path(base), pattern
|
||||
|
||||
|
||||
def _matched_files(spec: str) -> tuple[Path, str, list[Path]]:
|
||||
base, pattern = _split_glob(spec)
|
||||
if not base.exists():
|
||||
return base, pattern, []
|
||||
return base, pattern, sorted(path for path in base.glob(pattern) if path.is_file())
|
||||
|
||||
|
||||
def _failures_for_glob(spec: str) -> list[str]:
|
||||
base, pattern, paths = _matched_files(spec)
|
||||
if not paths:
|
||||
return [f"{base}: no files matched {pattern!r}"]
|
||||
|
||||
failures: list[str] = []
|
||||
for path in paths:
|
||||
failures.extend(_failures_for_file(path))
|
||||
return failures
|
||||
|
||||
|
||||
def _failures_for_zip_glob(spec: str) -> list[str]:
|
||||
base, pattern, paths = _matched_files(spec)
|
||||
if not paths:
|
||||
return [f"{base}: no zip files matched {pattern!r}"]
|
||||
|
||||
failures: list[str] = []
|
||||
for path in paths:
|
||||
failures.extend(_failures_for_zip(path))
|
||||
return failures
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--file", action="append", default=[], type=Path)
|
||||
parser.add_argument("--dir", action="append", default=[], type=Path)
|
||||
parser.add_argument("--parquet", action="append", default=[], type=Path)
|
||||
parser.add_argument("--zip", action="append", default=[], type=Path)
|
||||
parser.add_argument(
|
||||
"--glob",
|
||||
action="append",
|
||||
default=[],
|
||||
help="Require at least one non-empty file matching BASE::PATTERN",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--zip-glob",
|
||||
action="append",
|
||||
default=[],
|
||||
help="Require at least one readable zip matching BASE::PATTERN",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
failures: list[str] = []
|
||||
for path in args.file:
|
||||
failures.extend(_failures_for_file(path))
|
||||
for path in args.dir:
|
||||
failures.extend(_failures_for_dir(path))
|
||||
for path in args.parquet:
|
||||
failures.extend(_failures_for_parquet(path))
|
||||
for path in args.zip:
|
||||
failures.extend(_failures_for_zip(path))
|
||||
for spec in args.glob:
|
||||
failures.extend(_failures_for_glob(spec))
|
||||
for spec in args.zip_glob:
|
||||
failures.extend(_failures_for_zip_glob(spec))
|
||||
|
||||
if failures:
|
||||
print("Output validation failed:", file=sys.stderr)
|
||||
for failure in failures:
|
||||
print(f" - {failure}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue