Improve data pipeline

This commit is contained in:
Andras Schmelczer 2026-06-01 20:10:03 +01:00
parent e8345cbdc1
commit f99bd4e5c9
36 changed files with 966 additions and 129 deletions

View file

@ -222,3 +222,108 @@ def test_rejects_active_english_arcgis_boundary_mismatch(tmp_path, monkeypatch,
stderr = capsys.readouterr().err
assert "active English postcodes" in stderr
assert "not active English postcodes" in stderr
def _write_postcode_features(path, rows):
    """Write *rows* (a column-name -> values mapping) to *path* as parquet."""
    frame = pl.DataFrame(rows)
    frame.write_parquet(path)
def test_validates_postcode_features_valid(tmp_path, monkeypatch):
    """A clean postcode-features file makes ``main()`` return 0."""
    path = tmp_path / "postcode.parquet"
    # Two distinct postcodes, both with coordinates, the same E92000001
    # country code, and percentages inside the 0-100 range.
    clean_rows = {
        "Postcode": ["AA1 1AA", "BB1 1BB"],
        "lat": [51.5, 53.4],
        "lon": [-0.1, -2.2],
        "ctry25cd": ["E92000001", "E92000001"],
        "% White": [80.0, 55.0],
    }
    _write_postcode_features(path, clean_rows)

    argv = ["validate", "--postcode-features", str(path)]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 0
def test_rejects_contaminated_postcode_features(tmp_path, monkeypatch, capsys):
    """A postcode-features file with several defects makes ``main()`` return 1.

    The fixture combines a duplicated postcode, a row with a non-English
    country code and null coordinates, and a percentage above 100; the test
    then checks that stderr mentions each problem.
    """
    path = tmp_path / "postcode.parquet"
    dirty_rows = {
        # "AA1 1AA" appears twice -> uniqueness violation.
        "Postcode": ["AA1 1AA", "AA1 1AA", "CF10 1AA"],
        # The Welsh row carries null coordinates.
        "lat": [51.5, 51.5, None],
        "lon": [-0.1, -0.1, None],
        "ctry25cd": ["E92000001", "E92000001", "W92000004"],
        # 150 falls outside [0, 100].
        "% White": [80.0, 150.0, 90.0],
    }
    _write_postcode_features(path, dirty_rows)

    argv = ["validate", "--postcode-features", str(path)]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 1

    stderr = capsys.readouterr().err
    assert "not unique" in stderr
    # Country contamination is reported by naming the expected code.
    assert "E92000001" in stderr
    assert any(marker in stderr for marker in ("out-of-England", "lat/lon"))
    assert "[0, 100]" in stderr
def test_validates_properties_subset(tmp_path, monkeypatch):
    """Properties whose postcodes all exist in the postcode file pass (exit 0)."""
    postcode = tmp_path / "postcode.parquet"
    properties = tmp_path / "properties.parquet"

    postcode_frame = pl.DataFrame({"Postcode": ["AA1 1AA", "BB1 1BB"]})
    postcode_frame.write_parquet(postcode)

    properties_frame = pl.DataFrame(
        {"Postcode": ["AA1 1AA"], "Last known price": [250_000]}
    )
    properties_frame.write_parquet(properties)

    # The option takes both paths joined by "::" (properties first).
    subset_spec = f"{properties}::{postcode}"
    monkeypatch.setattr("sys.argv", ["validate", "--properties-subset", subset_spec])

    exit_code = main()
    assert exit_code == 0
def test_rejects_orphan_properties(tmp_path, monkeypatch, capsys):
    """An orphan postcode with a negative price makes ``main()`` return 1.

    The single property row uses a postcode missing from the postcode file
    and a price of -5; stderr must mention both problems.
    """
    postcode = tmp_path / "postcode.parquet"
    properties = tmp_path / "properties.parquet"

    postcode_frame = pl.DataFrame({"Postcode": ["AA1 1AA"]})
    postcode_frame.write_parquet(postcode)

    # "CC1 1CC" is not in the postcode file, and -5 is a non-positive price.
    orphan_frame = pl.DataFrame(
        {"Postcode": ["CC1 1CC"], "Last known price": [-5]}
    )
    orphan_frame.write_parquet(properties)

    subset_spec = f"{properties}::{postcode}"
    monkeypatch.setattr("sys.argv", ["validate", "--properties-subset", subset_spec])

    exit_code = main()
    assert exit_code == 1

    stderr = capsys.readouterr().err
    for expected_fragment in ("absent from", "non-positive"):
        assert expected_fragment in stderr
def test_validates_price_index_allows_zero_n_pairs(tmp_path, monkeypatch):
    """A price index containing rows with ``n_pairs == 0`` still passes (exit 0)."""
    path = tmp_path / "price_index.parquet"
    index_rows = {
        "sector": ["A1 1", "A1 1", "B2 2"],
        "type_group": ["All", "Detached", "All"],
        "year": [2024, 2024, 2024],
        "log_index": [0.5, 0.4, 0.0],
        # Zero n_pairs is a legitimate fallback, so it must not be rejected.
        "n_pairs": [100, 0, 0],
    }
    price_index = pl.DataFrame(index_rows)
    price_index.write_parquet(path)

    argv = ["validate", "--price-index", str(path)]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 0
def test_rejects_price_index_nonfinite_and_duplicate(tmp_path, monkeypatch, capsys):
    """A price index with an infinite value and a repeated key fails (exit 1).

    Both rows share the same (sector, type_group, year) triple and one
    ``log_index`` is infinite; stderr must mention both problems.
    """
    path = tmp_path / "price_index.parquet"
    index_rows = {
        "sector": ["A1 1", "A1 1"],
        # Same (sector, type_group, year) on both rows -> duplicate key.
        "type_group": ["All", "All"],
        "year": [2024, 2024],
        # The first value is non-finite.
        "log_index": [float("inf"), 0.3],
        "n_pairs": [10, 10],
    }
    price_index = pl.DataFrame(index_rows)
    price_index.write_parquet(path)

    argv = ["validate", "--price-index", str(path)]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 1

    stderr = capsys.readouterr().err
    for expected_fragment in ("non-finite", "not unique"):
        assert expected_fragment in stderr