This commit is contained in:
Andras Schmelczer 2026-05-13 12:11:54 +01:00
parent a08b5d2ae0
commit b98f0e3904
38 changed files with 3732 additions and 483 deletions

View file

@ -71,3 +71,64 @@ def test_fuzzy_join_on_postcode_requires_matching_numbers():
).collect()
assert result["right_address"].to_list() == [None]
def test_fuzzy_join_on_postcode_rejects_low_score_same_number_matches():
    """Sharing a postcode and house number alone must not produce a match.

    Both rows use postcode "AB1 2CD" and start with the number 1, but the
    street names differ, so the joined ``right_address`` is expected to be
    null.
    """
    left_rows = {
        "left_address": ["1 Example Street"],
        "left_postcode": ["AB1 2CD"],
    }
    right_rows = {
        "right_address": ["1 Totally Different Road"],
        "right_postcode": ["AB1 2CD"],
    }
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    joined = fuzzy_join_on_postcode(
        left=pl.LazyFrame(left_rows),
        right=pl.LazyFrame(right_rows),
        **column_names,
    )
    matched_addresses = joined.collect()["right_address"].to_list()

    # The single left row must come back without a right-hand address.
    assert matched_addresses == [None]
def test_fuzzy_join_on_postcode_rejects_blank_and_invalid_match_keys():
    """Only the full address should match; blank and number-only ones must not.

    Three left rows share one postcode with three right rows. The
    whitespace-only address and the bare "10" are expected to stay unmatched,
    while "10 High Street" is expected to pair with its identical right row.
    """
    left_frame = pl.LazyFrame(
        {
            "left_id": ["blank", "number_only", "valid"],
            "left_address": [" ", "10", "10 High Street"],
            "left_postcode": ["AB1 2CD", "AB1 2CD", "AB1 2CD"],
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": ["", "10", "10 High Street"],
            "right_postcode": ["AB1 2CD", "AB1 2CD", "AB1 2CD"],
        }
    )

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    )
    # Sort by id so the comparison below does not depend on join output order.
    ordered = joined.sort("left_id").collect()
    actual_rows = ordered.select("left_id", "right_address").to_dicts()

    expected_rows = [
        {"left_id": "blank", "right_address": None},
        {"left_id": "number_only", "right_address": None},
        {"left_id": "valid", "right_address": "10 High Street"},
    ]
    assert actual_rows == expected_rows

View file

@ -101,6 +101,33 @@ def test_custom_radius(pois):
assert total <= 2 # at most the co-located POIs
def test_counts_pois_across_multiple_grid_cells_within_5km():
    """A POI roughly 4.8km from the postcode must still be counted at 5km.

    Guards against the grid candidate lookup dropping POIs that sit near the
    edge of the search radius. Two parks share the postcode's latitude; the
    asserted count of 1 means only one of them is expected inside the radius.
    """
    postcode_frame = pl.DataFrame(
        {
            "postcode": ["GRID 5KM"],
            "lat": [51.5],
            "lon": [0.049],
        }
    )
    # Same latitude as the postcode; only the longitude offset differs.
    poi_frame = pl.DataFrame(
        {
            "lat": [51.5, 51.5],
            "lng": [0.1183, 0.1240],
            "category": ["Park", "Park"],
        }
    )
    park_groups = {"parks": ["Park"]}

    counts = count_pois_per_postcode(
        postcode_frame,
        poi_frame,
        groups=park_groups,
        radius_km=5.0,
    )
    parks_within_radius = counts["parks_5km"]

    assert parks_within_radius[0] == 1
def test_min_distance_finds_nearest(postcodes, pois):
"""min_distance_per_postcode returns distance to closest POI per group."""
result = min_distance_per_postcode(postcodes, pois, groups=POI_GROUPS)