This commit is contained in:
Andras Schmelczer 2026-05-14 08:09:19 +01:00
parent a8165249a4
commit a4103b0896
64 changed files with 5376 additions and 3832 deletions

View file

@ -5,6 +5,7 @@ from pathlib import Path
import polars as pl
STREET_CRIME_CSV_RE = re.compile(r"^\d{4}-\d{2}-.+-street\.csv$")
MONTH_RE = r"^\d{4}-\d{2}$"
def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
@ -37,16 +38,45 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
},
).select("LSOA code", "Crime type", "Month")
# Extract year, count crimes per LSOA / year / crime type
valid_month_expr = pl.col("Month").str.contains(MONTH_RE)
valid_months = (
df.filter(valid_month_expr)
.select("Month")
.unique()
.collect(engine="streaming")["Month"]
.sort()
.to_list()
)
if not valid_months:
raise ValueError(f"No valid crime months found in {crime_dir}")
valid_month_count = len(valid_months)
print(
f"Using {valid_month_count} valid data months "
f"({valid_months[0]} to {valid_months[-1]})"
)
# Count monthly incidents, then annualise over every valid month in the dataset.
yearly_counts = (
df.filter(pl.col("LSOA code").is_not_null() & (pl.col("LSOA code") != ""))
.with_columns(pl.col("Month").str.slice(0, 4).alias("year"))
.group_by("LSOA code", "year", "Crime type")
df.filter(
valid_month_expr
& pl.col("LSOA code").is_not_null()
& (pl.col("LSOA code") != "")
& pl.col("Crime type").is_not_null()
& (pl.col("Crime type") != "")
)
.group_by("LSOA code", "Month", "Crime type")
.agg(pl.len().alias("count"))
.group_by("LSOA code", "Crime type")
.agg(pl.col("count").mean().round(1).alias("yearly_avg"))
.agg(
(pl.col("count").sum() / pl.lit(valid_month_count) * 12)
.round(1)
.alias("yearly_avg")
)
.collect(engine="streaming")
)
if yearly_counts.is_empty():
raise ValueError(f"No valid crime rows found in {crime_dir}")
print(f"Crime types: {sorted(yearly_counts['Crime type'].unique().to_list())}")

View file

@ -0,0 +1,94 @@
from datetime import date
import numpy as np
import polars as pl
from pipeline.transform.price_estimation.estimate import guarded_blend_estimates
from pipeline.transform.price_estimation.knn import build_knn_pool, knn_median_psm
from pipeline.transform.price_estimation.utils import TYPE_GROUPS, type_group_expr
def _flat_index() -> pl.DataFrame:
    """Return a two-row index fixture for sector "AA1 1" in 2026.

    One row is for the "Detached" type group and one for "All"; both carry
    a ``log_index`` of 0.0.
    """
    type_groups = ["Detached", "All"]
    row_count = len(type_groups)
    columns = {
        "sector": ["AA1 1"] * row_count,
        "type_group": type_groups,
        "year": [2026] * row_count,
        "log_index": [0.0] * row_count,
    }
    return pl.DataFrame(columns)
def test_knn_excludes_same_sale_and_uses_stable_comparables():
    """Check the KNN median price per sqm for a sale surrounded by two clusters.

    The pool contains the queried sale itself (80 sqm, 900k), five 20 sqm
    sales in the same postcode at 900k, and five 80 sqm sales in other
    postcodes at 200k. The asserted 2,500 per sqm matches 200,000 / 80.
    """
    sale_date = date(2026, 1, 1)
    # Days since 1970-01-01, the form passed to knn_median_psm below.
    sale_epoch_days = (sale_date - date(1970, 1, 1)).days

    def _sale(postcode, lat, lon, floor_area, price):
        # Every fixture row is a Detached sale on the same date.
        return {
            "Postcode": postcode,
            "Property type": "Detached",
            "lat": lat,
            "lon": lon,
            "Total floor area (sqm)": floor_area,
            "Last known price": price,
            "Date of last transaction": sale_date,
        }

    queried_sale = _sale("AA1 1AA", 51.5000, -0.1000, 80.0, 900_000.0)
    same_postcode_small = [
        _sale("AA1 1AA", 51.5001 + i * 0.00001, -0.1001, 20.0, 900_000.0)
        for i in range(5)
    ]
    other_postcode_sales = [
        _sale(f"AA1 1B{i}", 51.5010 + i * 0.00001, -0.1010, 80.0, 200_000.0)
        for i in range(5)
    ]

    df = pl.DataFrame([queried_sale, *same_postcode_small, *other_postcode_sales])
    trees = build_knn_pool(df.lazy(), _flat_index(), 2026.0)

    psm = knn_median_psm(
        trees,
        lat=np.array([51.5000]),
        lon=np.array([-0.1000]),
        type_groups=np.array(["Detached"]),
        postcodes=np.array(["AA1 1AA"]),
        last_prices=np.array([900_000.0]),
        last_sale_dates=np.array([sale_epoch_days]),
    )

    assert psm[0] == 2_500.0
def test_guarded_blend_routes_unstable_knn_to_index_and_caps_uplift():
    """Check guarded_blend_estimates on two properties last sold at 100k.

    First property: index estimate 120k, KNN estimate 5M -> expected 120k.
    Second property: both estimates 1M -> expected 600k.
    """
    index_estimates = np.array([120_000.0, 1_000_000.0])
    knn_estimates = np.array([5_000_000.0, 1_000_000.0])
    previous_prices = np.array([100_000.0, 100_000.0])

    blended = guarded_blend_estimates(
        index_est=index_estimates,
        knn_est=knn_estimates,
        last_prices=previous_prices,
    )

    assert blended[0] == 120_000.0
    assert blended[1] == 600_000.0
def test_bungalow_is_not_a_dead_price_index_type_group():
    """Check "Bungalow" is absent from TYPE_GROUPS and maps to a null type group.

    Both "Bungalow" and "Other" property types are expected to yield None in
    the "type_group" column produced by type_group_expr().
    """
    property_types = pl.DataFrame({"Property type": ["Bungalow", "Other"]})
    df = property_types.with_columns(type_group_expr())

    assert "Bungalow" not in TYPE_GROUPS
    mapped_groups = df["type_group"].to_list()
    assert mapped_groups == [None, None]

View file

@ -44,4 +44,76 @@ def test_transform_crime_reads_only_street_crime_csvs(tmp_path):
result = pl.read_parquet(output).to_dicts()
assert result == [{"LSOA code": "E01000001", "Burglary (avg/yr)": 2.0}]
assert result == [{"LSOA code": "E01000001", "Burglary (avg/yr)": 24.0}]
def test_transform_crime_annualises_over_all_valid_months(tmp_path):
    """Check yearly averages are scaled over both months present in the input.

    January holds two burglaries in E01000001 and one robbery in E01000002;
    February holds one robbery in E01000002. The expected output is 12.0 per
    year for each LSOA's own crime type and 0.0 for the other type.
    """
    crime_dir = tmp_path / "crime"
    header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"

    # (month directory, CSV file name, data rows) for each month of fixture data.
    monthly_fixtures = [
        (
            "2024-01",
            "2024-01-test-force-street.csv",
            [
                "1,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
                "2,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
                "3,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000002,Other LSOA,Robbery,Under investigation,",
            ],
        ),
        (
            "2024-02",
            "2024-02-test-force-street.csv",
            [
                "4,2024-02,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000002,Other LSOA,Robbery,Under investigation,",
            ],
        ),
    ]
    for month_name, file_name, data_rows in monthly_fixtures:
        month_dir = crime_dir / month_name
        month_dir.mkdir(parents=True)
        csv_text = "\n".join([header, *data_rows]) + "\n"
        (month_dir / file_name).write_text(csv_text)

    output = tmp_path / "crime.parquet"
    transform_crime(crime_dir, output)

    expected = [
        {
            "LSOA code": "E01000001",
            "Burglary (avg/yr)": 12.0,
            "Robbery (avg/yr)": 0.0,
        },
        {
            "LSOA code": "E01000002",
            "Burglary (avg/yr)": 0.0,
            "Robbery (avg/yr)": 12.0,
        },
    ]
    result = pl.read_parquet(output).sort("LSOA code").to_dicts()
    assert result == expected
def test_transform_crime_fails_without_valid_months(tmp_path):
    """Check transform_crime raises ValueError when no row has a Month value.

    The single fixture row leaves the Month field empty; the raised message
    must contain "No valid crime months".
    """
    crime_dir = tmp_path / "crime"
    month_dir = crime_dir / "2024-01"
    month_dir.mkdir(parents=True)

    csv_lines = [
        "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context",
        "1,,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
    ]
    csv_path = month_dir / "2024-01-test-force-street.csv"
    csv_path.write_text("\n".join(csv_lines) + "\n")

    output = tmp_path / "crime.parquet"

    caught = None
    try:
        transform_crime(crime_dir, output)
    except ValueError as exc:
        caught = exc

    # Reaching here without a captured exception means the call succeeded.
    if caught is None:
        raise AssertionError("Expected ValueError")
    assert "No valid crime months" in str(caught)

View file

@ -136,17 +136,17 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
price_paid_path = tmp_path / "price-paid.parquet"
pl.DataFrame(
{
"price": [250_000],
"date_of_transfer": [date(2024, 2, 3)],
"property_type": ["T"],
"postcode": ["AA1 1AA"],
"paon": ["1"],
"saon": [None],
"street": ["Example Street"],
"locality": [None],
"town_city": ["Exampletown"],
"duration": ["F"],
"old_new": ["N"],
"price": [200_000, 250_000],
"date_of_transfer": [date(2020, 2, 3), date(2024, 2, 3)],
"property_type": ["T", "T"],
"postcode": ["AA1 1AA", "AA1 1AA"],
"paon": ["1", "1"],
"saon": [None, None],
"street": ["Example-Street", "Example Street"],
"locality": [None, None],
"town_city": ["Exampletown", "Exampletown"],
"duration": ["F", "F"],
"old_new": ["N", "N"],
}
).write_parquet(price_paid_path)
@ -172,3 +172,85 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
}
]
assert df.get_column("renovation_history").list.len().to_list() == [1]
assert df.get_column("historical_prices").list.len().to_list() == [2]
def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
    """Check that a price-paid row with an empty postcode is dropped by _run.

    Two price-paid rows are written, one with postcode "AA1 1AA" and one with
    an empty string; only the "AA1 1AA" row is expected in the output.
    """
    # Render the single EPC certificate row to CSV text before zipping it.
    csv_buffer = io.StringIO()
    writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
    writer.writeheader()
    writer.writerow(_row())

    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())

    price_paid_columns = {
        "price": [250_000, 300_000],
        "date_of_transfer": [date(2024, 2, 3), date(2024, 2, 4)],
        "property_type": ["T", "T"],
        "postcode": ["AA1 1AA", ""],
        "paon": ["1", "2"],
        "saon": [None, None],
        "street": ["Example Street", "Example Street"],
        "locality": [None, None],
        "town_city": ["Exampletown", "Exampletown"],
        "duration": ["F", "F"],
        "old_new": ["N", "N"],
    }
    price_paid_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(price_paid_columns).write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    df = pl.read_parquet(output_path)
    remaining_postcodes = df["postcode"].to_list()
    assert remaining_postcodes == ["AA1 1AA"]
def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path):
    """Check EPC fields stay null when the EPC address differs from the sale.

    The EPC row uses the address "1 Totally Different Road" while the
    price-paid row is "1 Example Street"; the output is expected to keep the
    single price-paid row with null EPC address, floor area and rating.
    """
    # Render the mismatching EPC certificate row to CSV text before zipping it.
    csv_buffer = io.StringIO()
    writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
    writer.writeheader()
    writer.writerow(_row(address="1 Totally Different Road"))

    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())

    price_paid_columns = {
        "price": [250_000],
        "date_of_transfer": [date(2024, 2, 3)],
        "property_type": ["T"],
        "postcode": ["AA1 1AA"],
        "paon": ["1"],
        "saon": [None],
        "street": ["Example Street"],
        "locality": [None],
        "town_city": ["Exampletown"],
        "duration": ["F"],
        "old_new": ["N"],
    }
    price_paid_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(price_paid_columns).write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    df = pl.read_parquet(output_path)
    assert df.height == 1

    checked_fields = df.select(
        "pp_address",
        "epc_address",
        "total_floor_area",
        "current_energy_rating",
    ).to_dicts()
    expected_fields = [
        {
            "pp_address": "1 Example Street",
            "epc_address": None,
            "total_floor_area": None,
            "current_energy_rating": None,
        }
    ]
    assert checked_fields == expected_fields

View file

@ -1,8 +1,14 @@
import polars as pl
import pytest
from pipeline.transform.merge import (
_AREA_COLUMNS,
TREE_DENSITY_FEATURE,
_is_dynamic_poi_metric_column,
_less_deprived_percentile_expr,
_tree_density_by_postcode,
_validate_lad_source_coverage,
_validate_property_postcodes,
)
@ -36,3 +42,103 @@ def test_dynamic_poi_metric_columns_are_area_level() -> None:
assert _is_dynamic_poi_metric_column("Number of amenities (Cafe) within 2km")
assert _is_dynamic_poi_metric_column("Number of amenities (Cafe) within 5km")
assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")
def test_country_code_is_kept_in_postcode_area_columns() -> None:
    """Check that the "ctry25cd" column is listed in _AREA_COLUMNS."""
    country_code_column = "ctry25cd"
    assert country_code_column in _AREA_COLUMNS
def test_validate_property_postcodes_rejects_blank_rows() -> None:
    """Check _validate_property_postcodes raises when a row has an empty postcode."""
    property_columns = {
        "Postcode": ["AA1 1AA", ""],
        "Address per Property Register": ["1 Example Street", "2 Example Street"],
        "Last known price": [100_000, 200_000],
    }
    properties = pl.DataFrame(property_columns)

    with pytest.raises(ValueError, match="Property rows missing a postcode"):
        _validate_property_postcodes(properties)
def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
    tmp_path,
) -> None:
    """Check validation passes when only two specific LADs lack rental rows.

    The IoD and ethnicity fixtures cover Barnsley, Isles of Scilly and City of
    London, while the rental fixture covers Barnsley only. The call is
    expected to complete without raising.
    """
    lad_codes = ["E08000016", "E06000053", "E09000001"]
    lad_names = ["Barnsley", "Isles of Scilly", "City of London"]

    iod_path = tmp_path / "iod.parquet"
    iod_frame = pl.DataFrame(
        {
            "Local Authority District code (2024)": lad_codes,
            "Local Authority District name (2024)": lad_names,
        }
    )
    iod_frame.write_parquet(iod_path)

    ethnicity_path = tmp_path / "ethnicity.parquet"
    pl.DataFrame({"Geography_code": lad_codes}).write_parquet(ethnicity_path)

    # Only the first LAD (Barnsley) has rental data.
    rental_path = tmp_path / "rental.parquet"
    rental_frame = pl.DataFrame({"area_code": lad_codes[:1], "bedrooms": [1]})
    rental_frame.write_parquet(rental_path)

    _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
    """Check validation raises when Barnsley has no rental rows.

    The IoD and ethnicity fixtures cover E08000016 (Barnsley) while the rental
    fixture only covers E08000019; a ValueError matching
    "Rental data is missing" is expected.
    """
    barnsley_code = "E08000016"

    iod_path = tmp_path / "iod.parquet"
    iod_frame = pl.DataFrame(
        {
            "Local Authority District code (2024)": [barnsley_code],
            "Local Authority District name (2024)": ["Barnsley"],
        }
    )
    iod_frame.write_parquet(iod_path)

    ethnicity_path = tmp_path / "ethnicity.parquet"
    pl.DataFrame({"Geography_code": [barnsley_code]}).write_parquet(ethnicity_path)

    rental_path = tmp_path / "rental.parquet"
    rental_frame = pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]})
    rental_frame.write_parquet(rental_path)

    with pytest.raises(ValueError, match="Rental data is missing"):
        _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
    """Check the 50m percentile column is exposed as TREE_DENSITY_FEATURE.

    The result must contain exactly "postcode" and TREE_DENSITY_FEATURE,
    keep the input values, and store the feature as Float32.
    """
    path = tmp_path / "tree_density_by_postcode.parquet"
    source_frame = pl.DataFrame(
        {
            "postcode": ["AB1 2CD", "EF3 4GH"],
            "Tree canopy density percentile within 50m": [12.5, 99.0],
        }
    )
    source_frame.write_parquet(path)

    lazy_result = _tree_density_by_postcode(path)
    result = lazy_result.collect().sort("postcode")

    expected_columns = ["postcode", TREE_DENSITY_FEATURE]
    assert result.columns == expected_columns
    assert result[TREE_DENSITY_FEATURE].to_list() == [12.5, 99.0]
    assert result.schema[TREE_DENSITY_FEATURE] == pl.Float32
def test_tree_density_by_postcode_requires_postcode_and_density_columns(
    tmp_path,
) -> None:
    """Check _tree_density_by_postcode rejects files missing a required column.

    A file with "postcode" but no density column must raise ValueError
    matching "must contain column"; a file with the density column but no
    "postcode" must raise ValueError matching
    "missing required column: postcode".
    """
    # Case 1: postcode present, density percentile column absent.
    path = tmp_path / "tree_density_by_postcode.parquet"
    no_density_frame = pl.DataFrame({"postcode": ["AB1 2CD"], "unrelated": [1.0]})
    no_density_frame.write_parquet(path)
    with pytest.raises(ValueError, match="must contain column"):
        _tree_density_by_postcode(path)

    # Case 2: density percentile column present, postcode absent.
    missing_postcode_path = tmp_path / "missing_postcode.parquet"
    no_postcode_frame = pl.DataFrame(
        {"Tree canopy density percentile within 50m": [12.5]}
    )
    no_postcode_frame.write_parquet(missing_postcode_path)
    with pytest.raises(ValueError, match="missing required column: postcode"):
        _tree_density_by_postcode(missing_postcode_path)