perfect-postcode/pipeline/test_validate_outputs.py
2026-05-31 20:20:41 +01:00

224 lines
6.4 KiB
Python

from __future__ import annotations
import json
import zipfile
import polars as pl
from pipeline.validate_outputs import main
def polygon(offset=0):
    """Return a GeoJSON ``Polygon`` dict for a small closed square.

    The square is 0.001 units on each side, its lower edge sits on ``y = 0.0``
    and its left edge on ``x = float(offset)``. The single ring repeats its
    first vertex as the last one.
    """
    west = float(offset)
    east = west + 0.001
    south = 0.0
    north = 0.001
    ring = [
        (west, south),
        (east, south),
        (east, north),
        (west, north),
        (west, south),
    ]
    return {"type": "Polygon", "coordinates": [ring]}
def write_boundary(path, postcodes, geometries=None):
    """Write one ``units/AA1.geojson`` FeatureCollection underneath *path*.

    A feature is emitted for every entry of *postcodes*, with the entry stored
    under ``properties.postcodes``. When *geometries* is truthy each feature
    takes the geometry at the same position; otherwise ``polygon(index)`` is
    generated for it.

    The ``units`` directory is created with ``mkdir(parents=True)`` and no
    ``exist_ok``, so *path*/units must not already exist.
    """
    unit_dir = path / "units"
    unit_dir.mkdir(parents=True)

    collection = {"type": "FeatureCollection", "features": []}
    for position, code in enumerate(postcodes):
        shape = geometries[position] if geometries else polygon(position)
        collection["features"].append(
            {
                "type": "Feature",
                "properties": {"postcodes": code},
                "geometry": shape,
            }
        )

    target = unit_dir / "AA1.geojson"
    target.write_text(json.dumps(collection))
def test_validates_parquet_file_and_zip(tmp_path, monkeypatch):
    """Check ``main()`` returns 0 when every output named on the CLI exists with content.

    A one-row parquet file, a non-empty text file and a zip holding one member
    are created in ``tmp_path`` and passed through ``--parquet``, ``--file``,
    ``--zip``, ``--glob`` and ``--zip-glob``.
    """
    table = tmp_path / "data.parquet"
    text_file = tmp_path / "plain.txt"
    bundle = tmp_path / "archive.zip"

    pl.DataFrame({"value": [1]}).write_parquet(table)
    text_file.write_text("ok\n")
    with zipfile.ZipFile(bundle, "w") as zf:
        zf.writestr("data.txt", "ok\n")

    # Glob arguments use the "<directory>::<pattern>" form.
    argv = ["validate_outputs"]
    argv += ["--parquet", str(table)]
    argv += ["--file", str(text_file)]
    argv += ["--zip", str(bundle)]
    argv += ["--glob", f"{tmp_path}::*.txt"]
    argv += ["--zip-glob", f"{tmp_path}::*.zip"]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 0
def test_rejects_missing_and_empty_outputs(tmp_path, monkeypatch, capsys):
    """Check ``main()`` returns 1 and reports each problem on stderr.

    The CLI is pointed at a zero-byte file, a parquet path that was never
    created, and a ``*.csv`` glob that nothing in ``tmp_path`` satisfies.
    """
    blank = tmp_path / "empty.txt"
    blank.touch()
    absent_parquet = tmp_path / "missing.parquet"

    cli = ["validate_outputs"]
    cli.extend(["--file", str(blank)])
    cli.extend(["--parquet", str(absent_parquet)])
    cli.extend(["--glob", f"{tmp_path}::*.csv"])
    monkeypatch.setattr("sys.argv", cli)

    assert main() == 1

    captured = capsys.readouterr()
    # NOTE(review): pytest names tmp_path after the test function, so any
    # printed path already contains "missing"; confirm that fragment really
    # comes from the validator's message rather than from an echoed path.
    for fragment in ("empty file", "missing", "no files matched"):
        assert fragment in captured.err
def test_validates_postcode_boundary_matches(tmp_path, monkeypatch):
    """Check ``main()`` returns 0 when parquet and boundary postcodes coincide.

    The ``postcode`` column and the generated GeoJSON both carry exactly
    ``AA1 1AA`` and ``AA1 1AB``.
    """
    codes = ["AA1 1AA", "AA1 1AB"]
    parquet_file = tmp_path / "postcodes.parquet"
    boundary_root = tmp_path / "postcode_boundaries"

    pl.DataFrame({"postcode": codes}).write_parquet(parquet_file)
    write_boundary(boundary_root, codes)

    # The option takes "<postcodes parquet>::<boundaries directory>".
    pair = f"{parquet_file}::{boundary_root}"
    monkeypatch.setattr(
        "sys.argv",
        ["validate_outputs", "--postcode-boundary-match", pair],
    )

    assert main() == 0
def test_rejects_postcode_boundary_mismatch(tmp_path, monkeypatch, capsys):
    """Check ``main()`` returns 1 when the two postcode sets differ in both directions.

    ``AA1 1AB`` appears only in the parquet file and ``AA1 1AC`` only in the
    boundary GeoJSON; stderr is expected to mention both kinds of mismatch.
    """
    parquet_file = tmp_path / "postcodes.parquet"
    boundary_root = tmp_path / "postcode_boundaries"

    listed = pl.DataFrame({"postcode": ["AA1 1AA", "AA1 1AB"]})
    listed.write_parquet(parquet_file)
    write_boundary(boundary_root, ["AA1 1AA", "AA1 1AC"])

    pair = f"{parquet_file}::{boundary_root}"
    monkeypatch.setattr(
        "sys.argv",
        ["validate_outputs", "--postcode-boundary-match", pair],
    )

    assert main() == 1

    err_text = capsys.readouterr().err
    for fragment in ("missing boundaries", "boundary postcodes are absent"):
        assert fragment in err_text
def test_rejects_invalid_postcode_boundary_features(tmp_path, monkeypatch, capsys):
    """Check ``main()`` returns 1 and reports each kind of malformed boundary feature.

    The hand-built GeoJSON contains two features for ``AA1 1AA``, a feature
    with empty ``properties``, a feature whose geometry is ``null``, and a
    feature whose ring crosses itself.
    """
    parquet_file = tmp_path / "postcodes.parquet"
    boundary_root = tmp_path / "postcode_boundaries"
    unit_dir = boundary_root / "units"
    unit_dir.mkdir(parents=True)
    pl.DataFrame({"postcode": ["AA1 1AA"]}).write_parquet(parquet_file)

    def feature(properties, geometry):
        # Same key order as the other fixtures: type, properties, geometry.
        return {"type": "Feature", "properties": properties, "geometry": geometry}

    # Ring whose edges cross each other.
    crossed_ring = {
        "type": "Polygon",
        "coordinates": [[(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)]],
    }
    broken = [
        feature({"postcodes": "AA1 1AA"}, polygon()),
        feature({"postcodes": "AA1 1AA"}, polygon(1)),
        feature({}, polygon(2)),
        feature({"postcodes": "AA1 1AB"}, None),
        feature({"postcodes": "AA1 1AC"}, crossed_ring),
    ]
    payload = json.dumps({"type": "FeatureCollection", "features": broken})
    (unit_dir / "AA1.geojson").write_text(payload)

    pair = f"{parquet_file}::{boundary_root}"
    monkeypatch.setattr(
        "sys.argv",
        ["validate_outputs", "--postcode-boundary-match", pair],
    )

    assert main() == 1

    err_text = capsys.readouterr().err
    expected_fragments = (
        "duplicate boundary postcode features",
        "missing properties.postcodes",
        "missing or empty geometry",
        "invalid boundary geometries",
    )
    for fragment in expected_fragments:
        assert fragment in err_text
def test_validates_active_english_arcgis_boundary_matches(tmp_path, monkeypatch):
    """Check ``main()`` returns 0 when only ``AA1 1AA`` has a boundary.

    Of the three ArcGIS rows, ``AA1 1AB`` carries a ``doterm`` value and
    ``CF1 1AA`` carries a ``W``-prefixed ``ctry25cd``; presumably that makes
    them terminated and non-English respectively, leaving ``AA1 1AA`` as the
    only postcode that needs a boundary.
    """
    arcgis_file = tmp_path / "arcgis.parquet"
    boundary_root = tmp_path / "postcode_boundaries"

    rows = {
        "pcds": ["AA1 1AA", "AA1 1AB", "CF1 1AA"],
        "ctry25cd": ["E92000001", "E92000001", "W92000004"],
        "doterm": [None, "2020-01-01", None],
    }
    pl.DataFrame(rows).write_parquet(arcgis_file)
    write_boundary(boundary_root, ["AA1 1AA"])

    pair = f"{arcgis_file}::{boundary_root}"
    monkeypatch.setattr(
        "sys.argv",
        ["validate_outputs", "--active-postcode-boundary-match", pair],
    )

    assert main() == 0
def test_rejects_active_english_arcgis_boundary_mismatch(tmp_path, monkeypatch, capsys):
    """Check ``main()`` returns 1 when boundaries and ArcGIS rows disagree both ways.

    No ArcGIS row has a ``doterm`` value. ``AA1 1AB`` has an ``E``-prefixed
    ``ctry25cd`` but no boundary, while ``CF1 1AA`` has a boundary but a
    ``W``-prefixed ``ctry25cd``. Stderr is expected to carry one message about
    active English postcodes and a separate one about boundary postcodes that
    are not active English postcodes.
    """
    arcgis_path = tmp_path / "arcgis.parquet"
    boundaries_path = tmp_path / "postcode_boundaries"
    pl.DataFrame(
        {
            "pcds": ["AA1 1AA", "AA1 1AB", "CF1 1AA"],
            "ctry25cd": ["E92000001", "E92000001", "W92000004"],
            "doterm": [None, None, None],
        }
    ).write_parquet(arcgis_path)
    write_boundary(boundaries_path, ["AA1 1AA", "CF1 1AA"])
    monkeypatch.setattr(
        "sys.argv",
        [
            "validate_outputs",
            "--active-postcode-boundary-match",
            f"{arcgis_path}::{boundaries_path}",
        ],
    )

    assert main() == 1

    stderr = capsys.readouterr().err
    assert "not active English postcodes" in stderr
    # "active English postcodes" is a substring of the fragment checked above,
    # so a plain ``in`` test for it could never fail on its own. Comparing
    # occurrence counts requires at least one mention that is not part of
    # "not active English postcodes".
    assert stderr.count("active English postcodes") > stderr.count(
        "not active English postcodes"
    )