Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@@ -8,6 +8,7 @@ import polars as pl
from pipeline.transform.join_epc_pp import (
EPC_SOURCE_COLUMNS,
_join_address_parts,
_run,
_scan_epc_certificates,
)
@@ -111,6 +112,89 @@ def test_scan_epc_certificates_supports_domestic_zip(tmp_path: Path):
assert df.schema["number_habitable_rooms"] == pl.Int16
def test_join_address_parts_empty_string_components():
    # Price-paid SAON/PAON/STREET are empty strings (not null) when absent;
    # concat_str(ignore_nulls=True) alone leaked the separator into the
    # display address (' 10 PALACE GREEN') and doubled it for empty middle
    # components. Empty/whitespace-only parts must contribute nothing.
    df = pl.DataFrame(
        {
            "saon": ["", "FLAT 1", "FLAT 1", "FLAT 21", "", None, " ", " FLAT 2"],
            "paon": ["10", "10", "", "82", "", None, "10", "11 "],
            "street": [
                "PALACE GREEN",
                "HIGH STREET",
                "HIGH STREET",
                "",
                "",
                None,
                "PALACE GREEN",
                "STATION ROAD",
            ],
        }
    )

    out = df.select(
        _join_address_parts("saon", "paon", "street").alias("address")
    ).get_column("address")

    assert out.to_list() == [
        "10 PALACE GREEN",  # empty saon -> no leading space
        "FLAT 1 10 HIGH STREET",  # normal three-part address is unchanged
        "FLAT 1 HIGH STREET",  # empty middle component -> no double space
        "FLAT 21 82",  # empty street -> no trailing space
        None,  # all-empty -> null, not whitespace junk
        None,  # all-null -> null
        "10 PALACE GREEN",  # whitespace-only component treated as empty
        "FLAT 2 11 STATION ROAD",  # per-component padding is stripped
    ]

    # Invariant: every produced address is trimmed and single-spaced.
    produced = out.drop_nulls()
    assert produced.str.starts_with(" ").sum() == 0
    assert produced.str.ends_with(" ").sum() == 0
    # The failure signature is two *consecutive* spaces: a single space is the
    # legitimate separator and appears in every multi-part address. Spell the
    # needle as `" " * 2` so whitespace normalisation in editors or diff
    # viewers cannot silently collapse it into a one-space check.
    double_space = " " * 2
    assert produced.str.contains(double_space, literal=True).sum() == 0
def test_run_builds_clean_pp_address_from_empty_string_saon(tmp_path: Path):
    # In real price-paid data saon is "" (not null) on ~88% of rows; the
    # pp_address written by the pipeline must not pick up a leading separator
    # from that empty component.

    # Serialise one EPC certificate row to CSV text, then pack it into the
    # zip archive that the pipeline reads.
    epc_csv = io.StringIO()
    epc_writer = csv.DictWriter(epc_csv, fieldnames=EPC_SOURCE_COLUMNS)
    epc_writer.writeheader()
    epc_writer.writerow(_row())

    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", epc_csv.getvalue())

    # A single price-paid transaction whose saon is an empty string.
    price_paid_path = tmp_path / "price-paid.parquet"
    price_paid = pl.DataFrame(
        {
            "price": [250_000],
            "date_of_transfer": [date(2024, 2, 3)],
            "property_type": ["T"],
            "postcode": ["AA1 1AA"],
            "paon": ["1"],
            "saon": [""],
            "street": ["Example Street"],
            "locality": [""],
            "town_city": ["Exampletown"],
            "duration": ["F"],
            "old_new": ["N"],
            "ppd_category": ["A"],
        }
    )
    price_paid.write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    joined = pl.read_parquet(output_path)
    assert joined.height == 1

    # No leading space, and the clean address still matches its EPC record.
    expected_row = {
        "pp_address": "1 Example Street",
        "epc_address": "1 Example Street",
    }
    assert joined.select("pp_address", "epc_address").to_dicts() == [expected_row]
def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
zip_path = tmp_path / "domestic-csv.zip"
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive: