395 lines
12 KiB
Python
395 lines
12 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import zipfile
|
|
|
|
import polars as pl
|
|
|
|
from pipeline.validate_outputs import main
|
|
|
|
|
|
def polygon(offset=0):
    """Return a small square GeoJSON ``Polygon`` geometry dict.

    The square is 0.001 units on a side; its lower-left corner sits at
    ``(float(offset), 0.0)``. The single ring is closed (first and last
    vertices are equal), so different ``offset`` values give squares shifted
    along the x axis.
    """
    west = float(offset)
    east = west + 0.001
    south, north = 0.0, 0.001
    ring = [
        (west, south),
        (east, south),
        (east, north),
        (west, north),
        (west, south),  # repeat the first vertex to close the ring
    ]
    return {"type": "Polygon", "coordinates": [ring]}
|
|
|
|
|
|
def write_boundary(path, postcodes, geometries=None):
    """Write a single-file GeoJSON boundary fixture below ``path``.

    Creates ``path / "units"`` (including parents) and writes
    ``AA1.geojson`` there: a ``FeatureCollection`` with one feature per entry
    in ``postcodes``. Each feature stores its postcode under
    ``properties.postcodes``. The geometry is ``geometries[index]`` when
    ``geometries`` is truthy, otherwise ``polygon(index)``.
    """
    units_dir = path / "units"
    units_dir.mkdir(parents=True)

    features = []
    for index, postcode in enumerate(postcodes):
        # Only fall back to the generated square when no geometries were given.
        geometry = geometries[index] if geometries else polygon(index)
        features.append(
            {
                "type": "Feature",
                "properties": {"postcodes": postcode},
                "geometry": geometry,
            }
        )

    collection = {"type": "FeatureCollection", "features": features}
    (units_dir / "AA1.geojson").write_text(json.dumps(collection))
|
|
|
|
|
|
def test_validates_parquet_file_and_zip(tmp_path, monkeypatch):
    """``main`` returns 0 when every referenced output exists and is non-empty.

    Writes a one-row parquet file, a text file and a zip archive into
    ``tmp_path``, then passes them through ``--parquet``, ``--file``,
    ``--zip`` and the ``<dir>::<pattern>`` forms of ``--glob`` and
    ``--zip-glob``.
    """
    parquet_file = tmp_path / "data.parquet"
    text_file = tmp_path / "plain.txt"
    zip_file = tmp_path / "archive.zip"

    frame = pl.DataFrame({"value": [1]})
    frame.write_parquet(parquet_file)
    text_file.write_text("ok\n")
    with zipfile.ZipFile(zip_file, "w") as bundle:
        bundle.writestr("data.txt", "ok\n")

    argv = ["validate_outputs"]
    argv += ["--parquet", str(parquet_file)]
    argv += ["--file", str(text_file)]
    argv += ["--zip", str(zip_file)]
    argv += ["--glob", f"{tmp_path}::*.txt"]
    argv += ["--zip-glob", f"{tmp_path}::*.zip"]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 0
|
|
|
|
|
|
def test_rejects_missing_and_empty_outputs(tmp_path, monkeypatch, capsys):
    """``main`` returns 1 and reports each problem on stderr.

    The inputs are a zero-byte file for ``--file``, a path that does not
    exist for ``--parquet``, and a ``--glob`` pattern (``*.csv``) that
    matches nothing in ``tmp_path``.
    """
    zero_byte_file = tmp_path / "empty.txt"
    zero_byte_file.touch()
    absent_parquet = tmp_path / "missing.parquet"

    argv = [
        "validate_outputs",
        "--file",
        str(zero_byte_file),
        "--parquet",
        str(absent_parquet),
        "--glob",
        f"{tmp_path}::*.csv",
    ]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 1

    stderr = capsys.readouterr().err
    for fragment in ("empty file", "missing", "no files matched"):
        assert fragment in stderr
|
|
|
|
|
|
def test_validates_postcode_boundary_matches(tmp_path, monkeypatch):
    """``main`` returns 0 when parquet postcodes and boundary postcodes agree.

    The parquet ``postcode`` column and the boundary fixture both hold
    exactly "AA1 1AA" and "AA1 1AB".
    """
    shared_postcodes = ["AA1 1AA", "AA1 1AB"]
    parquet_file = tmp_path / "postcodes.parquet"
    boundary_dir = tmp_path / "postcode_boundaries"

    pl.DataFrame({"postcode": list(shared_postcodes)}).write_parquet(parquet_file)
    write_boundary(boundary_dir, list(shared_postcodes))

    argv = [
        "validate_outputs",
        "--postcode-boundary-match",
        f"{parquet_file}::{boundary_dir}",
    ]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 0
|
|
|
|
|
|
def test_rejects_postcode_boundary_mismatch(tmp_path, monkeypatch, capsys):
    """``main`` returns 1 when the two postcode sets differ in both directions.

    The parquet file has "AA1 1AB", which the boundaries lack; the boundaries
    have "AA1 1AC", which the parquet file lacks. Both messages are expected
    on stderr.
    """
    parquet_file = tmp_path / "postcodes.parquet"
    boundary_dir = tmp_path / "postcode_boundaries"

    table = pl.DataFrame({"postcode": ["AA1 1AA", "AA1 1AB"]})
    table.write_parquet(parquet_file)
    write_boundary(boundary_dir, ["AA1 1AA", "AA1 1AC"])

    argv = [
        "validate_outputs",
        "--postcode-boundary-match",
        f"{parquet_file}::{boundary_dir}",
    ]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 1

    stderr = capsys.readouterr().err
    for fragment in ("missing boundaries", "boundary postcodes are absent"):
        assert fragment in stderr
|
|
|
|
|
|
def test_rejects_invalid_postcode_boundary_features(tmp_path, monkeypatch, capsys):
    """``main`` returns 1 and reports four kinds of malformed boundary feature.

    The hand-built GeoJSON file contains two features for "AA1 1AA", one
    feature with empty ``properties``, one with a ``null`` geometry, and one
    whose polygon ring crosses itself (a bowtie).
    """
    parquet_file = tmp_path / "postcodes.parquet"
    boundary_dir = tmp_path / "postcode_boundaries"
    units_dir = boundary_dir / "units"
    units_dir.mkdir(parents=True)
    pl.DataFrame({"postcode": ["AA1 1AA"]}).write_parquet(parquet_file)

    def feature(properties, geometry):
        # Key order matches the other fixtures so the JSON text is the same.
        return {"type": "Feature", "properties": properties, "geometry": geometry}

    # Self-intersecting ring: edges (0,0)-(1,1) and (1,0)-(0,1) cross.
    bowtie = {
        "type": "Polygon",
        "coordinates": [[(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)]],
    }
    features = [
        feature({"postcodes": "AA1 1AA"}, polygon()),
        feature({"postcodes": "AA1 1AA"}, polygon(1)),  # same postcode again
        feature({}, polygon(2)),  # no "postcodes" property
        feature({"postcodes": "AA1 1AB"}, None),  # no geometry
        feature({"postcodes": "AA1 1AC"}, bowtie),
    ]
    collection = {"type": "FeatureCollection", "features": features}
    (units_dir / "AA1.geojson").write_text(json.dumps(collection))

    argv = [
        "validate_outputs",
        "--postcode-boundary-match",
        f"{parquet_file}::{boundary_dir}",
    ]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 1

    stderr = capsys.readouterr().err
    expected_fragments = (
        "duplicate boundary postcode features",
        "missing properties.postcodes",
        "missing or empty geometry",
        "invalid boundary geometries",
    )
    for fragment in expected_fragments:
        assert fragment in stderr
|
|
|
|
|
|
def test_validates_active_english_arcgis_boundary_matches(tmp_path, monkeypatch):
    """``main`` returns 0 when boundaries cover only "AA1 1AA".

    Of the three ArcGIS rows, "AA1 1AA" is the only one with country code
    E92000001 and a null ``doterm``. "AA1 1AB" has a ``doterm`` date and
    "CF1 1AA" has a W92000004 code.
    NOTE(review): presumably the validator ignores terminated and non-E92
    rows; confirm against ``validate_outputs``.
    """
    arcgis_file = tmp_path / "arcgis.parquet"
    boundary_dir = tmp_path / "postcode_boundaries"

    arcgis_rows = {
        "pcds": ["AA1 1AA", "AA1 1AB", "CF1 1AA"],
        "ctry25cd": ["E92000001", "E92000001", "W92000004"],
        "doterm": [None, "2020-01-01", None],
    }
    pl.DataFrame(arcgis_rows).write_parquet(arcgis_file)
    write_boundary(boundary_dir, ["AA1 1AA"])

    argv = [
        "validate_outputs",
        "--active-postcode-boundary-match",
        f"{arcgis_file}::{boundary_dir}",
    ]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 0
|
|
|
|
|
|
def test_rejects_active_english_arcgis_boundary_mismatch(tmp_path, monkeypatch, capsys):
    """``main`` returns 1 when boundaries and ArcGIS rows disagree.

    Every ArcGIS row has a null ``doterm``. The boundaries omit "AA1 1AB"
    (country code E92000001) and include "CF1 1AA" (country code W92000004).
    """
    arcgis_file = tmp_path / "arcgis.parquet"
    boundary_dir = tmp_path / "postcode_boundaries"

    arcgis_rows = {
        "pcds": ["AA1 1AA", "AA1 1AB", "CF1 1AA"],
        "ctry25cd": ["E92000001", "E92000001", "W92000004"],
        "doterm": [None, None, None],
    }
    pl.DataFrame(arcgis_rows).write_parquet(arcgis_file)
    write_boundary(boundary_dir, ["AA1 1AA", "CF1 1AA"])

    argv = [
        "validate_outputs",
        "--active-postcode-boundary-match",
        f"{arcgis_file}::{boundary_dir}",
    ]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 1

    stderr = capsys.readouterr().err
    # NOTE(review): the first fragment is a substring of the second, so the
    # second check already implies the first. If two distinct messages are
    # intended, a more specific fragment is needed for the first one.
    for fragment in ("active English postcodes", "not active English postcodes"):
        assert fragment in stderr
|
|
|
|
|
|
def _write_postcode_features(path, rows):
    """Build a polars ``DataFrame`` from ``rows`` and write it to ``path`` as parquet."""
    frame = pl.DataFrame(rows)
    frame.write_parquet(path)
|
|
|
|
|
|
def test_validates_postcode_features_valid(tmp_path, monkeypatch):
    """``main`` returns 0 for a clean two-row postcode features table.

    The rows have distinct postcodes, non-null coordinates, the E92000001
    country code, and "% White" values between 0 and 100.
    """
    features_file = tmp_path / "postcode.parquet"
    clean_rows = {
        "Postcode": ["AA1 1AA", "BB1 1BB"],
        "lat": [51.5, 53.4],
        "lon": [-0.1, -2.2],
        "ctry25cd": ["E92000001", "E92000001"],
        "% White": [80.0, 55.0],
    }
    _write_postcode_features(features_file, clean_rows)

    argv = ["validate", "--postcode-features", str(features_file)]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 0
|
|
|
|
|
|
def test_rejects_contaminated_postcode_features(tmp_path, monkeypatch, capsys):
    """``main`` returns 1 for a table with several distinct defects.

    The fixture has a duplicated postcode, a W92000004 row with null
    coordinates, and a "% White" value of 150. Each defect is expected to
    leave a recognisable fragment on stderr.
    """
    features_file = tmp_path / "postcode.parquet"
    dirty_rows = {
        "Postcode": ["AA1 1AA", "AA1 1AA", "CF10 1AA"],  # duplicate AA1 1AA
        "lat": [51.5, 51.5, None],  # Welsh row has null coord
        "lon": [-0.1, -0.1, None],
        "ctry25cd": ["E92000001", "E92000001", "W92000004"],
        "% White": [80.0, 150.0, 90.0],  # 150 out of [0,100]
    }
    _write_postcode_features(features_file, dirty_rows)

    argv = ["validate", "--postcode-features", str(features_file)]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 1

    err = capsys.readouterr().err
    assert "not unique" in err
    assert "E92000001" in err  # country contamination
    # Either wording is accepted for the coordinate problem.
    assert any(marker in err for marker in ("out-of-England", "lat/lon"))
    assert "[0, 100]" in err
|
|
|
|
|
|
def test_postcode_universe_rejects_missing(tmp_path, monkeypatch, capsys):
    """``main`` returns 1 when the features table omits ArcGIS postcodes.

    The ArcGIS file lists three E92000001 postcodes with null ``doterm``;
    the features table contains only one of them.
    """
    arcgis_file = tmp_path / "arcgis.parquet"
    features_file = tmp_path / "postcode.parquet"

    arcgis_rows = {
        "pcds": ["AA1 1AA", "AA1 1AB", "AA1 1AC"],
        "ctry25cd": ["E92000001", "E92000001", "E92000001"],
        "doterm": [None, None, None],
    }
    pl.DataFrame(arcgis_rows).write_parquet(arcgis_file)

    # Only 1 of the 3 active English postcodes is present, all otherwise valid.
    present_rows = {
        "Postcode": ["AA1 1AA"],
        "lat": [51.5],
        "lon": [-0.1],
        "ctry25cd": ["E92000001"],
        "% White": [80.0],
    }
    _write_postcode_features(features_file, present_rows)

    argv = [
        "validate",
        "--postcode-universe",
        f"{arcgis_file}::{features_file}",
    ]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 1

    err = capsys.readouterr().err
    assert "missing" in err
    # NOTE(review): a bare "2" would also match any other digit 2 in the
    # output (for example in an echoed path); consider asserting on the full
    # count phrase once the message format is confirmed.
    assert "2" in err  # 2 of the 3 active postcodes are absent
|
|
|
|
|
|
def test_postcode_universe_accepts_exact_match(tmp_path, monkeypatch):
    """``main`` returns 0 when the features table lists exactly the ArcGIS postcodes.

    Both files contain "AA1 1AA" and "AA1 1AB"; the ArcGIS rows carry the
    E92000001 country code and a null ``doterm``.
    """
    arcgis_file = tmp_path / "arcgis.parquet"
    features_file = tmp_path / "postcode.parquet"

    arcgis_rows = {
        "pcds": ["AA1 1AA", "AA1 1AB"],
        "ctry25cd": ["E92000001", "E92000001"],
        "doterm": [None, None],
    }
    pl.DataFrame(arcgis_rows).write_parquet(arcgis_file)

    feature_rows = {
        "Postcode": ["AA1 1AA", "AA1 1AB"],
        "lat": [51.5, 53.4],
        "lon": [-0.1, -2.2],
        "ctry25cd": ["E92000001", "E92000001"],
        "% White": [80.0, 55.0],
    }
    _write_postcode_features(features_file, feature_rows)

    argv = [
        "validate",
        "--postcode-universe",
        f"{arcgis_file}::{features_file}",
    ]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 0
|
|
|
|
|
|
def test_validates_properties_subset(tmp_path, monkeypatch):
    """``main`` returns 0 when every property postcode is in the postcode table.

    The single property row uses "AA1 1AA", which is one of the two
    postcodes in the postcode table, and has a positive "Last known price".
    """
    postcode_file = tmp_path / "postcode.parquet"
    properties_file = tmp_path / "properties.parquet"

    postcode_table = pl.DataFrame({"Postcode": ["AA1 1AA", "BB1 1BB"]})
    postcode_table.write_parquet(postcode_file)
    properties_table = pl.DataFrame(
        {"Postcode": ["AA1 1AA"], "Last known price": [250_000]}
    )
    properties_table.write_parquet(properties_file)

    argv = [
        "validate",
        "--properties-subset",
        f"{properties_file}::{postcode_file}",
    ]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 0
|
|
|
|
|
|
def test_rejects_orphan_properties(tmp_path, monkeypatch, capsys):
    """``main`` returns 1 for an unknown postcode with a negative price.

    The property row's postcode "CC1 1CC" is not in the postcode table and
    its "Last known price" is -5; both problems are expected on stderr.
    """
    postcode_file = tmp_path / "postcode.parquet"
    properties_file = tmp_path / "properties.parquet"

    pl.DataFrame({"Postcode": ["AA1 1AA"]}).write_parquet(postcode_file)
    # orphan + negative price
    bad_properties = {"Postcode": ["CC1 1CC"], "Last known price": [-5]}
    pl.DataFrame(bad_properties).write_parquet(properties_file)

    argv = [
        "validate",
        "--properties-subset",
        f"{properties_file}::{postcode_file}",
    ]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 1

    err = capsys.readouterr().err
    for fragment in ("absent from", "non-positive"):
        assert fragment in err
|
|
|
|
|
|
def test_validates_price_index_allows_zero_n_pairs(tmp_path, monkeypatch):
    """``main`` returns 0 for a price index that contains ``n_pairs == 0`` rows.

    The three rows have distinct (sector, type_group, year) keys and finite
    ``log_index`` values; two of them have zero ``n_pairs``.
    """
    index_file = tmp_path / "price_index.parquet"
    index_rows = {
        "sector": ["A1 1", "A1 1", "B2 2"],
        "type_group": ["All", "Detached", "All"],
        "year": [2024, 2024, 2024],
        "log_index": [0.5, 0.4, 0.0],
        "n_pairs": [100, 0, 0],  # zero n_pairs is a legitimate fallback
    }
    pl.DataFrame(index_rows).write_parquet(index_file)

    argv = ["validate", "--price-index", str(index_file)]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 0
|
|
|
|
|
|
def test_rejects_price_index_nonfinite_and_duplicate(tmp_path, monkeypatch, capsys):
    """``main`` returns 1 for an infinite ``log_index`` and a repeated key.

    Both rows share the same (sector, type_group, year) and the first row's
    ``log_index`` is ``inf``; each problem is expected on stderr.
    """
    index_file = tmp_path / "price_index.parquet"
    index_rows = {
        "sector": ["A1 1", "A1 1"],
        "type_group": ["All", "All"],  # duplicate (sector, type_group, year)
        "year": [2024, 2024],
        "log_index": [float("inf"), 0.3],  # non-finite
        "n_pairs": [10, 10],
    }
    pl.DataFrame(index_rows).write_parquet(index_file)

    argv = ["validate", "--price-index", str(index_file)]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 1

    err = capsys.readouterr().err
    for fragment in ("non-finite", "not unique"):
        assert fragment in err
|