lgtm
This commit is contained in:
parent
a08b5d2ae0
commit
b98f0e3904
38 changed files with 3732 additions and 483 deletions
61
pipeline/test_validate_outputs.py
Normal file
61
pipeline/test_validate_outputs.py
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import zipfile
|
||||
|
||||
import polars as pl
|
||||
|
||||
from pipeline.validate_outputs import main
|
||||
|
||||
|
||||
def test_validates_parquet_file_and_zip(tmp_path, monkeypatch):
    """Expect exit status 0 when every requested output exists and has content.

    A one-row parquet file, a small text file and a zip archive holding a
    single member are created under ``tmp_path``. They are handed to
    ``main()`` through ``sys.argv`` both as explicit paths and through
    ``<dir>::<pattern>`` glob arguments.
    """
    parquet_target = tmp_path / "data.parquet"
    text_target = tmp_path / "plain.txt"
    zip_target = tmp_path / "archive.zip"

    # Materialise one artefact of each kind that is passed on the command line.
    frame = pl.DataFrame({"value": [1]})
    frame.write_parquet(parquet_target)
    text_target.write_text("ok\n")
    with zipfile.ZipFile(zip_target, "w") as bundle:
        bundle.writestr("data.txt", "ok\n")

    # main() is called without arguments, so the CLI input goes via sys.argv.
    cli_args = ["validate_outputs"]
    cli_args += ["--parquet", str(parquet_target)]
    cli_args += ["--file", str(text_target)]
    cli_args += ["--zip", str(zip_target)]
    cli_args += ["--glob", f"{tmp_path}::*.txt"]
    cli_args += ["--zip-glob", f"{tmp_path}::*.zip"]
    monkeypatch.setattr("sys.argv", cli_args)

    assert main() == 0
|
||||
|
||||
|
||||
def test_rejects_missing_and_empty_outputs(tmp_path, monkeypatch, capsys):
    """Expect exit status 1 and one stderr report per kind of bad output.

    The command line names a zero-byte file, a parquet path that is never
    created, and a glob pattern that matches nothing under ``tmp_path``.
    """
    empty_path = tmp_path / "empty.txt"
    empty_path.touch()  # exists, but has no content

    monkeypatch.setattr(
        "sys.argv",
        [
            "validate_outputs",
            "--file",
            str(empty_path),
            "--parquet",
            str(tmp_path / "missing.parquet"),
            "--glob",
            f"{tmp_path}::*.csv",
        ],
    )

    assert main() == 1

    stderr = capsys.readouterr().err
    # pytest derives the tmp_path directory name from the test name, so the
    # directory itself contains the word "missing". Strip that prefix before
    # the "missing" check; otherwise any message that merely prints a path
    # under tmp_path (e.g. the empty-file report) would satisfy it.
    reported = stderr.replace(str(tmp_path), "")

    assert "empty file" in stderr
    assert "missing" in reported
    assert "no files matched" in stderr
|
||||
|
|
@ -71,3 +71,64 @@ def test_fuzzy_join_on_postcode_requires_matching_numbers():
|
|||
).collect()
|
||||
|
||||
assert result["right_address"].to_list() == [None]
|
||||
|
||||
|
||||
def test_fuzzy_join_on_postcode_rejects_low_score_same_number_matches():
    """Expect no match when only the postcode and leading number agree.

    Both addresses start with "1" and share a postcode, but the rest of the
    address text differs, so the joined ``right_address`` must be null.
    """
    left_frame = pl.LazyFrame(
        {
            "left_address": ["1 Example Street"],
            "left_postcode": ["AB1 2CD"],
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": ["1 Totally Different Road"],
            "right_postcode": ["AB1 2CD"],
        }
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    )
    result = joined.collect()

    matched_addresses = result["right_address"].to_list()
    assert matched_addresses == [None]
|
||||
|
||||
|
||||
def test_fuzzy_join_on_postcode_rejects_blank_and_invalid_match_keys():
    """Expect only the full address row to match; blank and number-only rows stay null.

    All rows share one postcode. The left side has a whitespace-only address,
    a bare number and a full address; the right side has an empty string, the
    same bare number and the same full address.
    """
    shared_postcodes = ["AB1 2CD", "AB1 2CD", "AB1 2CD"]
    left_frame = pl.LazyFrame(
        {
            "left_id": ["blank", "number_only", "valid"],
            "left_address": [" ", "10", "10 High Street"],
            "left_postcode": list(shared_postcodes),
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": ["", "10", "10 High Street"],
            "right_postcode": list(shared_postcodes),
        }
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    )
    # Sort by id so the comparison below does not depend on join output order.
    result = joined.sort("left_id").collect()

    expected_rows = [
        {"left_id": "blank", "right_address": None},
        {"left_id": "number_only", "right_address": None},
        {"left_id": "valid", "right_address": "10 High Street"},
    ]
    assert result.select("left_id", "right_address").to_dicts() == expected_rows
|
||||
|
|
|
|||
|
|
@ -101,6 +101,33 @@ def test_custom_radius(pois):
|
|||
assert total <= 2 # at most the co-located POIs
|
||||
|
||||
|
||||
def test_counts_pois_across_multiple_grid_cells_within_5km():
    """A POI around 4.8km away must not be dropped by grid candidate lookup.

    Two "Park" POIs sit on the same latitude as the postcode, at increasing
    longitude. With a 5 km radius exactly one of them is expected to count.
    """
    postcode_frame = pl.DataFrame(
        {
            "postcode": ["GRID 5KM"],
            "lat": [51.5],
            "lon": [0.049],
        }
    )
    poi_frame = pl.DataFrame(
        {
            "lat": [51.5, 51.5],
            "lng": [0.1183, 0.1240],
            "category": ["Park", "Park"],
        }
    )

    counts = count_pois_per_postcode(
        postcode_frame,
        poi_frame,
        groups={"parks": ["Park"]},
        radius_km=5.0,
    )

    parks_within_radius = counts["parks_5km"][0]
    assert parks_within_radius == 1
|
||||
|
||||
|
||||
def test_min_distance_finds_nearest(postcodes, pois):
|
||||
"""min_distance_per_postcode returns distance to closest POI per group."""
|
||||
result = min_distance_per_postcode(postcodes, pois, groups=POI_GROUPS)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue