"""Tests for the ``pipeline.validate_outputs`` command-line entry point.

Each test writes small fixture files under ``tmp_path``, replaces
``sys.argv`` with the arguments under test, and checks the status returned
by ``main``. Failure cases additionally check fragments of stderr.
"""

from __future__ import annotations

import json
import zipfile

import polars as pl

from pipeline.validate_outputs import main


def polygon(offset=0):
    """Return a GeoJSON Polygon dict: a closed 0.001-wide square starting at x=``offset``."""
    left = float(offset)
    right = left + 0.001
    ring = [(left, 0.0), (right, 0.0), (right, 0.001), (left, 0.001), (left, 0.0)]
    return {"type": "Polygon", "coordinates": [ring]}


def write_boundary(path, postcodes, geometries=None):
    """Write ``path/units/AA1.geojson`` holding one feature per postcode.

    Feature *i* uses ``geometries[i]`` when ``geometries`` is truthy, and
    ``polygon(i)`` otherwise. The ``units`` directory is created (with
    parents) and must not already exist.
    """
    units_dir = path / "units"
    units_dir.mkdir(parents=True)

    features = []
    for position, code in enumerate(postcodes):
        shape = geometries[position] if geometries else polygon(position)
        features.append(
            {
                "type": "Feature",
                "properties": {"postcodes": code},
                "geometry": shape,
            }
        )

    collection = {"type": "FeatureCollection", "features": features}
    (units_dir / "AA1.geojson").write_text(json.dumps(collection))


def _run_main(monkeypatch, argv):
    """Patch ``sys.argv`` with ``argv`` and return whatever ``main`` returns."""
    monkeypatch.setattr("sys.argv", argv)
    return main()


def test_validates_parquet_file_and_zip(tmp_path, monkeypatch):
    """A non-empty parquet, text file and zip (plus matching globs) yield status 0."""
    parquet_path = tmp_path / "data.parquet"
    file_path = tmp_path / "plain.txt"
    zip_path = tmp_path / "archive.zip"

    pl.DataFrame({"value": [1]}).write_parquet(parquet_path)
    file_path.write_text("ok\n")
    with zipfile.ZipFile(zip_path, "w") as archive:
        archive.writestr("data.txt", "ok\n")

    argv = [
        "validate_outputs",
        "--parquet",
        str(parquet_path),
        "--file",
        str(file_path),
        "--zip",
        str(zip_path),
        "--glob",
        f"{tmp_path}::*.txt",
        "--zip-glob",
        f"{tmp_path}::*.zip",
    ]
    assert _run_main(monkeypatch, argv) == 0


def test_rejects_missing_and_empty_outputs(tmp_path, monkeypatch, capsys):
    """An empty file, an absent parquet and a glob with no matches yield status 1."""
    empty_path = tmp_path / "empty.txt"
    empty_path.touch()

    argv = [
        "validate_outputs",
        "--file",
        str(empty_path),
        "--parquet",
        str(tmp_path / "missing.parquet"),
        "--glob",
        f"{tmp_path}::*.csv",
    ]
    assert _run_main(monkeypatch, argv) == 1

    stderr = capsys.readouterr().err
    assert "empty file" in stderr
    assert "missing" in stderr
    assert "no files matched" in stderr


def test_validates_postcode_boundary_matches(tmp_path, monkeypatch):
    """Identical postcode sets in the parquet and the boundary file yield status 0."""
    postcodes_path = tmp_path / "postcodes.parquet"
    boundaries_path = tmp_path / "postcode_boundaries"

    pl.DataFrame({"postcode": ["AA1 1AA", "AA1 1AB"]}).write_parquet(postcodes_path)
    write_boundary(boundaries_path, ["AA1 1AA", "AA1 1AB"])

    argv = [
        "validate_outputs",
        "--postcode-boundary-match",
        f"{postcodes_path}::{boundaries_path}",
    ]
    assert _run_main(monkeypatch, argv) == 0


def test_rejects_postcode_boundary_mismatch(tmp_path, monkeypatch, capsys):
    """Postcode sets that differ in both directions yield status 1 and both messages."""
    postcodes_path = tmp_path / "postcodes.parquet"
    boundaries_path = tmp_path / "postcode_boundaries"

    pl.DataFrame({"postcode": ["AA1 1AA", "AA1 1AB"]}).write_parquet(postcodes_path)
    write_boundary(boundaries_path, ["AA1 1AA", "AA1 1AC"])

    argv = [
        "validate_outputs",
        "--postcode-boundary-match",
        f"{postcodes_path}::{boundaries_path}",
    ]
    assert _run_main(monkeypatch, argv) == 1

    stderr = capsys.readouterr().err
    assert "missing boundaries" in stderr
    assert "boundary postcodes are absent" in stderr


def test_rejects_invalid_postcode_boundary_features(tmp_path, monkeypatch, capsys):
    """A boundary file with malformed features yields status 1 and four messages.

    The fixture contains a repeated postcode, a feature without
    ``properties.postcodes``, a feature with a null geometry, and a
    self-crossing ("bowtie") polygon ring.
    """
    postcodes_path = tmp_path / "postcodes.parquet"
    boundaries_path = tmp_path / "postcode_boundaries"
    units = boundaries_path / "units"
    units.mkdir(parents=True)

    pl.DataFrame({"postcode": ["AA1 1AA"]}).write_parquet(postcodes_path)

    bowtie = {
        "type": "Polygon",
        "coordinates": [[(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)]],
    }
    features = [
        {
            "type": "Feature",
            "properties": {"postcodes": "AA1 1AA"},
            "geometry": polygon(),
        },
        {
            "type": "Feature",
            "properties": {"postcodes": "AA1 1AA"},
            "geometry": polygon(1),
        },
        {"type": "Feature", "properties": {}, "geometry": polygon(2)},
        {"type": "Feature", "properties": {"postcodes": "AA1 1AB"}, "geometry": None},
        {"type": "Feature", "properties": {"postcodes": "AA1 1AC"}, "geometry": bowtie},
    ]
    collection = {"type": "FeatureCollection", "features": features}
    (units / "AA1.geojson").write_text(json.dumps(collection))

    argv = [
        "validate_outputs",
        "--postcode-boundary-match",
        f"{postcodes_path}::{boundaries_path}",
    ]
    assert _run_main(monkeypatch, argv) == 1

    stderr = capsys.readouterr().err
    assert "duplicate boundary postcode features" in stderr
    assert "missing properties.postcodes" in stderr
    assert "missing or empty geometry" in stderr
    assert "invalid boundary geometries" in stderr


def test_validates_active_english_arcgis_boundary_matches(tmp_path, monkeypatch):
    """Status 0 when the only boundary is the E92000001 row with a null ``doterm``."""
    arcgis_path = tmp_path / "arcgis.parquet"
    boundaries_path = tmp_path / "postcode_boundaries"

    arcgis_rows = {
        "pcds": ["AA1 1AA", "AA1 1AB", "CF1 1AA"],
        "ctry25cd": ["E92000001", "E92000001", "W92000004"],
        "doterm": [None, "2020-01-01", None],
    }
    pl.DataFrame(arcgis_rows).write_parquet(arcgis_path)
    write_boundary(boundaries_path, ["AA1 1AA"])

    argv = [
        "validate_outputs",
        "--active-postcode-boundary-match",
        f"{arcgis_path}::{boundaries_path}",
    ]
    assert _run_main(monkeypatch, argv) == 0


def test_rejects_active_english_arcgis_boundary_mismatch(tmp_path, monkeypatch, capsys):
    """Status 1 when boundaries omit an E92000001 row and include a W92000004 row."""
    arcgis_path = tmp_path / "arcgis.parquet"
    boundaries_path = tmp_path / "postcode_boundaries"

    arcgis_rows = {
        "pcds": ["AA1 1AA", "AA1 1AB", "CF1 1AA"],
        "ctry25cd": ["E92000001", "E92000001", "W92000004"],
        "doterm": [None, None, None],
    }
    pl.DataFrame(arcgis_rows).write_parquet(arcgis_path)
    write_boundary(boundaries_path, ["AA1 1AA", "CF1 1AA"])

    argv = [
        "validate_outputs",
        "--active-postcode-boundary-match",
        f"{arcgis_path}::{boundaries_path}",
    ]
    assert _run_main(monkeypatch, argv) == 1

    stderr = capsys.readouterr().err
    assert "active English postcodes" in stderr
    assert "not active English postcodes" in stderr


def _write_postcode_features(path, rows):
    """Write the column mapping ``rows`` to ``path`` as a parquet file."""
    frame = pl.DataFrame(rows)
    frame.write_parquet(path)


def test_validates_postcode_features_valid(tmp_path, monkeypatch):
    """Two distinct E92000001 rows with coordinates and in-range percentages yield 0."""
    path = tmp_path / "postcode.parquet"
    rows = {
        "Postcode": ["AA1 1AA", "BB1 1BB"],
        "lat": [51.5, 53.4],
        "lon": [-0.1, -2.2],
        "ctry25cd": ["E92000001", "E92000001"],
        "% White": [80.0, 55.0],
    }
    _write_postcode_features(path, rows)

    argv = ["validate", "--postcode-features", str(path)]
    assert _run_main(monkeypatch, argv) == 0


def test_rejects_contaminated_postcode_features(tmp_path, monkeypatch, capsys):
    """A duplicate postcode, a Welsh row, null coordinates and a bad percentage yield 1."""
    path = tmp_path / "postcode.parquet"
    rows = {
        "Postcode": ["AA1 1AA", "AA1 1AA", "CF10 1AA"],  # duplicate AA1 1AA
        "lat": [51.5, 51.5, None],  # Welsh row has null coord
        "lon": [-0.1, -0.1, None],
        "ctry25cd": ["E92000001", "E92000001", "W92000004"],
        "% White": [80.0, 150.0, 90.0],  # 150 out of [0,100]
    }
    _write_postcode_features(path, rows)

    argv = ["validate", "--postcode-features", str(path)]
    assert _run_main(monkeypatch, argv) == 1

    err = capsys.readouterr().err
    assert "not unique" in err
    assert "E92000001" in err  # country contamination
    assert "out-of-England" in err or "lat/lon" in err
    assert "[0, 100]" in err


def test_postcode_universe_rejects_missing(tmp_path, monkeypatch, capsys):
    """Status 1 when the features file covers only one of three listed postcodes."""
    arcgis_path = tmp_path / "arcgis.parquet"
    postcodes_path = tmp_path / "postcode.parquet"

    arcgis_rows = {
        "pcds": ["AA1 1AA", "AA1 1AB", "AA1 1AC"],
        "ctry25cd": ["E92000001", "E92000001", "E92000001"],
        "doterm": [None, None, None],
    }
    pl.DataFrame(arcgis_rows).write_parquet(arcgis_path)

    # Only 1 of the 3 active English postcodes is present, all otherwise valid.
    feature_rows = {
        "Postcode": ["AA1 1AA"],
        "lat": [51.5],
        "lon": [-0.1],
        "ctry25cd": ["E92000001"],
        "% White": [80.0],
    }
    _write_postcode_features(postcodes_path, feature_rows)

    argv = [
        "validate",
        "--postcode-universe",
        f"{arcgis_path}::{postcodes_path}",
    ]
    assert _run_main(monkeypatch, argv) == 1

    err = capsys.readouterr().err
    assert "missing" in err
    assert "2" in err  # 2 of the 3 active postcodes are absent


def test_postcode_universe_accepts_exact_match(tmp_path, monkeypatch):
    """Status 0 when the features file lists exactly the postcodes in the ArcGIS file."""
    arcgis_path = tmp_path / "arcgis.parquet"
    postcodes_path = tmp_path / "postcode.parquet"

    arcgis_rows = {
        "pcds": ["AA1 1AA", "AA1 1AB"],
        "ctry25cd": ["E92000001", "E92000001"],
        "doterm": [None, None],
    }
    pl.DataFrame(arcgis_rows).write_parquet(arcgis_path)

    feature_rows = {
        "Postcode": ["AA1 1AA", "AA1 1AB"],
        "lat": [51.5, 53.4],
        "lon": [-0.1, -2.2],
        "ctry25cd": ["E92000001", "E92000001"],
        "% White": [80.0, 55.0],
    }
    _write_postcode_features(postcodes_path, feature_rows)

    argv = [
        "validate",
        "--postcode-universe",
        f"{arcgis_path}::{postcodes_path}",
    ]
    assert _run_main(monkeypatch, argv) == 0


def test_validates_properties_subset(tmp_path, monkeypatch):
    """Status 0 when every property postcode also appears in the postcode file."""
    postcode = tmp_path / "postcode.parquet"
    properties = tmp_path / "properties.parquet"

    pl.DataFrame({"Postcode": ["AA1 1AA", "BB1 1BB"]}).write_parquet(postcode)
    property_rows = {"Postcode": ["AA1 1AA"], "Last known price": [250_000]}
    pl.DataFrame(property_rows).write_parquet(properties)

    argv = ["validate", "--properties-subset", f"{properties}::{postcode}"]
    assert _run_main(monkeypatch, argv) == 0


def test_rejects_orphan_properties(tmp_path, monkeypatch, capsys):
    """Status 1 for a property whose postcode is unlisted and whose price is negative."""
    postcode = tmp_path / "postcode.parquet"
    properties = tmp_path / "properties.parquet"

    pl.DataFrame({"Postcode": ["AA1 1AA"]}).write_parquet(postcode)
    # orphan + negative price
    property_rows = {"Postcode": ["CC1 1CC"], "Last known price": [-5]}
    pl.DataFrame(property_rows).write_parquet(properties)

    argv = ["validate", "--properties-subset", f"{properties}::{postcode}"]
    assert _run_main(monkeypatch, argv) == 1

    err = capsys.readouterr().err
    assert "absent from" in err
    assert "non-positive" in err


def test_validates_price_index_allows_zero_n_pairs(tmp_path, monkeypatch):
    """Status 0 for a price index whose rows include ``n_pairs`` values of zero."""
    path = tmp_path / "price_index.parquet"
    index_rows = {
        "sector": ["A1 1", "A1 1", "B2 2"],
        "type_group": ["All", "Detached", "All"],
        "year": [2024, 2024, 2024],
        "log_index": [0.5, 0.4, 0.0],
        "n_pairs": [100, 0, 0],  # zero n_pairs is a legitimate fallback
    }
    pl.DataFrame(index_rows).write_parquet(path)

    argv = ["validate", "--price-index", str(path)]
    assert _run_main(monkeypatch, argv) == 0


def test_rejects_price_index_nonfinite_and_duplicate(tmp_path, monkeypatch, capsys):
    """Status 1 for a price index with an infinite ``log_index`` and a repeated key."""
    path = tmp_path / "price_index.parquet"
    index_rows = {
        "sector": ["A1 1", "A1 1"],
        "type_group": ["All", "All"],  # duplicate (sector, type_group, year)
        "year": [2024, 2024],
        "log_index": [float("inf"), 0.3],  # non-finite
        "n_pairs": [10, 10],
    }
    pl.DataFrame(index_rows).write_parquet(path)

    argv = ["validate", "--price-index", str(path)]
    assert _run_main(monkeypatch, argv) == 1

    err = capsys.readouterr().err
    assert "non-finite" in err
    assert "not unique" in err