perfect-postcode/pipeline/utils/test_haversine.py
2026-01-31 13:07:09 +00:00

147 lines
5.4 KiB
Python

import numpy as np
import polars as pl
import pytest
from pipeline.utils.haversine import haversine_km, haversine_km_expr
class TestHaversineKm:
    """Checks for the NumPy-based ``haversine_km`` distance function.

    Every test passes latitude/longitude arrays for the origin(s) and scalar
    latitude/longitude for the destination, then compares the result with a
    reference distance in kilometres.
    """

    def test_same_point(self):
        """A location measured against itself is zero kilometres away."""
        here_lat, here_lon = 51.5074, -0.1278
        result = haversine_km(
            np.array([here_lat]), np.array([here_lon]), here_lat, here_lon
        )
        assert np.allclose(result, 0.0, atol=1e-10)

    def test_known_distance_london_to_paris(self):
        """London to Paris should come out at roughly 344 km."""
        london = (51.5074, -0.1278)
        paris = (48.8566, 2.3522)
        result = haversine_km(
            np.array([london[0]]), np.array([london[1]]), *paris
        )
        # Accept anything within 1% of the reference figure.
        assert np.allclose(result[0], 344, rtol=0.01)

    def test_known_distance_new_york_to_london(self):
        """New York to London should come out at roughly 5570 km."""
        new_york = (40.7128, -74.0060)
        london = (51.5074, -0.1278)
        result = haversine_km(
            np.array([new_york[0]]), np.array([new_york[1]]), *london
        )
        # Accept anything within 1% of the reference figure.
        assert np.allclose(result[0], 5570, rtol=0.01)

    def test_multiple_points(self):
        """Several origins can be measured against one destination at once."""
        # Origins in order: London, Paris, NYC.
        origins = [(51.5074, -0.1278), (48.8566, 2.3522), (40.7128, -74.0060)]
        origin_lats = np.array([lat for lat, _ in origins])
        origin_lons = np.array([lon for _, lon in origins])
        edinburgh = (55.9533, -3.1883)

        km = haversine_km(origin_lats, origin_lons, *edinburgh)

        # None of the origins coincides with Edinburgh.
        assert np.all(km > 0)
        # London is nearest (~530 km), then Paris, then NYC.
        assert km[0] < km[1] < km[2]
        assert np.allclose(km[0], 530, rtol=0.02)

    def test_equator_points(self):
        """One degree of longitude along the equator is about 111 km."""
        origin_lat = np.array([0.0])
        origin_lon = np.array([0.0])
        one_degree_east = 1.0
        km = haversine_km(origin_lat, origin_lon, 0.0, one_degree_east)
        # 1 degree at the equator ≈ 111.2 km; allow 1% slack.
        assert np.allclose(km[0], 111.2, rtol=0.01)
class TestHaversineKmExpr:
    """Checks for the Polars expression ``haversine_km_expr``.

    The expression is evaluated over a frame with ``lat`` and ``lon`` columns
    and the resulting ``dist`` column is compared with reference distances in
    kilometres.
    """

    @staticmethod
    def _distances(lats, lons, dest_lat, dest_lon):
        """Return the ``dist`` column for the given origins and destination."""
        frame = pl.DataFrame({"lat": lats, "lon": lons})
        dist_expr = haversine_km_expr("lat", "lon", dest_lat, dest_lon)
        return frame.select(dist_expr.alias("dist"))["dist"]

    def test_same_point(self):
        """A location measured against itself is zero kilometres away."""
        dist = self._distances([51.5074], [-0.1278], 51.5074, -0.1278)
        assert dist[0] == pytest.approx(0.0, abs=1e-10)

    def test_known_distance_london_to_paris(self):
        """London to Paris should come out at roughly 344 km."""
        dist = self._distances([51.5074], [-0.1278], 48.8566, 2.3522)
        assert dist[0] == pytest.approx(344, rel=0.01)

    def test_known_distance_new_york_to_london(self):
        """New York to London should come out at roughly 5570 km."""
        dist = self._distances([40.7128], [-74.0060], 51.5074, -0.1278)
        assert dist[0] == pytest.approx(5570, rel=0.01)

    def test_multiple_points(self):
        """Several origin rows can be measured against one destination."""
        # Rows in order: London, Paris, NYC; destination is Edinburgh.
        origin_lats = [51.5074, 48.8566, 40.7128]
        origin_lons = [-0.1278, 2.3522, -74.0060]
        km = self._distances(
            origin_lats, origin_lons, 55.9533, -3.1883
        ).to_numpy()

        # None of the origins coincides with Edinburgh.
        assert np.all(km > 0)
        # London is nearest (~530 km), then Paris, then NYC.
        assert km[0] < km[1] < km[2]
        assert km[0] == pytest.approx(530, rel=0.02)

    def test_equator_points(self):
        """One degree of longitude along the equator is about 111 km."""
        dist = self._distances([0.0], [0.0], 0.0, 1.0)
        # 1 degree at the equator ≈ 111.2 km; allow 1% slack.
        assert dist[0] == pytest.approx(111.2, rel=0.01)
class TestHaversineConsistency:
    """Cross-check the NumPy and Polars haversine implementations."""

    def test_numpy_and_polars_match(self):
        """Both code paths must agree on the same set of coordinates."""
        # Five origin points as (latitude, longitude) pairs.
        origins = [
            (51.5074, -0.1278),
            (48.8566, 2.3522),
            (40.7128, -74.0060),
            (55.9533, -3.1883),
            (52.5200, 13.4050),
        ]
        origin_lats = np.array([lat for lat, _ in origins])
        origin_lons = np.array([lon for _, lon in origins])
        rome_lat, rome_lon = 41.9028, 12.4964

        # Reference values from the array-based function.
        expected = haversine_km(origin_lats, origin_lons, rome_lat, rome_lon)

        # Same computation expressed as a Polars column expression.
        frame = pl.DataFrame({"lat": origin_lats, "lon": origin_lons})
        dist_expr = haversine_km_expr("lat", "lon", rome_lat, rome_lon)
        actual = frame.select(dist_expr.alias("dist"))["dist"].to_numpy()

        # Allow only floating-point noise between the two implementations.
        assert np.allclose(expected, actual, rtol=1e-10)