Improve data pipeline
This commit is contained in:
parent
e8345cbdc1
commit
f99bd4e5c9
36 changed files with 966 additions and 129 deletions
|
|
@ -222,3 +222,108 @@ def test_rejects_active_english_arcgis_boundary_mismatch(tmp_path, monkeypatch,
|
|||
stderr = capsys.readouterr().err
|
||||
assert "active English postcodes" in stderr
|
||||
assert "not active English postcodes" in stderr
|
||||
|
||||
|
||||
def _write_postcode_features(path, rows):
    """Build a ``pl.DataFrame`` from ``rows`` and write it to ``path`` as parquet."""
    frame = pl.DataFrame(rows)
    frame.write_parquet(path)
|
||||
|
||||
|
||||
def test_validates_postcode_features_valid(tmp_path, monkeypatch):
    """``main()`` returns 0 for a clean ``--postcode-features`` fixture."""
    clean_rows = {
        "Postcode": ["AA1 1AA", "BB1 1BB"],
        "lat": [51.5, 53.4],
        "lon": [-0.1, -2.2],
        "ctry25cd": ["E92000001", "E92000001"],
        "% White": [80.0, 55.0],
    }
    path = tmp_path / "postcode.parquet"
    _write_postcode_features(path, clean_rows)

    argv = ["validate", "--postcode-features", str(path)]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 0
|
||||
|
||||
|
||||
def test_rejects_contaminated_postcode_features(tmp_path, monkeypatch, capsys):
    """``main()`` returns 1 and reports each planted fault on stderr.

    The fixture has a repeated postcode, a row with a non-English country
    code and null coordinates, and a percentage above 100.
    """
    bad_rows = {
        # "AA1 1AA" appears twice.
        "Postcode": ["AA1 1AA", "AA1 1AA", "CF10 1AA"],
        # The Welsh row carries no coordinates.
        "lat": [51.5, 51.5, None],
        "lon": [-0.1, -0.1, None],
        "ctry25cd": ["E92000001", "E92000001", "W92000004"],
        # 150 lies outside [0, 100].
        "% White": [80.0, 150.0, 90.0],
    }
    path = tmp_path / "postcode.parquet"
    _write_postcode_features(path, bad_rows)

    argv = ["validate", "--postcode-features", str(path)]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 1

    err = capsys.readouterr().err
    assert "not unique" in err
    # Country contamination is reported against the expected English code.
    assert "E92000001" in err
    # Either wording is accepted for the coordinate failure.
    assert any(marker in err for marker in ("out-of-England", "lat/lon"))
    assert "[0, 100]" in err
|
||||
|
||||
|
||||
def test_validates_properties_subset(tmp_path, monkeypatch):
    """``main()`` returns 0 when every property postcode is in the postcode file."""
    postcode = tmp_path / "postcode.parquet"
    postcode_frame = pl.DataFrame({"Postcode": ["AA1 1AA", "BB1 1BB"]})
    postcode_frame.write_parquet(postcode)

    properties = tmp_path / "properties.parquet"
    properties_frame = pl.DataFrame(
        {"Postcode": ["AA1 1AA"], "Last known price": [250_000]}
    )
    properties_frame.write_parquet(properties)

    # The CLI takes both paths as one "<properties>::<postcode>" argument.
    subset_spec = f"{properties}::{postcode}"
    monkeypatch.setattr("sys.argv", ["validate", "--properties-subset", subset_spec])

    exit_code = main()
    assert exit_code == 0
|
||||
|
||||
|
||||
def test_rejects_orphan_properties(tmp_path, monkeypatch, capsys):
    """``main()`` returns 1 for an unknown postcode with a negative price."""
    postcode = tmp_path / "postcode.parquet"
    postcode_frame = pl.DataFrame({"Postcode": ["AA1 1AA"]})
    postcode_frame.write_parquet(postcode)

    properties = tmp_path / "properties.parquet"
    # "CC1 1CC" is missing from the postcode file, and the price is negative.
    orphan_frame = pl.DataFrame(
        {"Postcode": ["CC1 1CC"], "Last known price": [-5]}
    )
    orphan_frame.write_parquet(properties)

    # The CLI takes both paths as one "<properties>::<postcode>" argument.
    subset_spec = f"{properties}::{postcode}"
    monkeypatch.setattr("sys.argv", ["validate", "--properties-subset", subset_spec])

    exit_code = main()
    assert exit_code == 1

    err = capsys.readouterr().err
    for expected in ("absent from", "non-positive"):
        assert expected in err
|
||||
|
||||
|
||||
def test_validates_price_index_allows_zero_n_pairs(tmp_path, monkeypatch):
    """``main()`` returns 0 for a price index containing ``n_pairs == 0`` rows."""
    index_rows = {
        "sector": ["A1 1", "A1 1", "B2 2"],
        "type_group": ["All", "Detached", "All"],
        "year": [2024, 2024, 2024],
        "log_index": [0.5, 0.4, 0.0],
        # Zero n_pairs is a legitimate fallback and must not be rejected.
        "n_pairs": [100, 0, 0],
    }
    path = tmp_path / "price_index.parquet"
    index_frame = pl.DataFrame(index_rows)
    index_frame.write_parquet(path)

    argv = ["validate", "--price-index", str(path)]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 0
|
||||
|
||||
|
||||
def test_rejects_price_index_nonfinite_and_duplicate(tmp_path, monkeypatch, capsys):
    """``main()`` returns 1 for an infinite ``log_index`` and a repeated key."""
    index_rows = {
        # Both rows share the same (sector, type_group, year) key.
        "sector": ["A1 1", "A1 1"],
        "type_group": ["All", "All"],
        "year": [2024, 2024],
        # The first value is non-finite.
        "log_index": [float("inf"), 0.3],
        "n_pairs": [10, 10],
    }
    path = tmp_path / "price_index.parquet"
    index_frame = pl.DataFrame(index_rows)
    index_frame.write_parquet(path)

    argv = ["validate", "--price-index", str(path)]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 1

    err = capsys.readouterr().err
    for expected in ("non-finite", "not unique"):
        assert expected in err
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue