Format Python
This commit is contained in:
parent
85f5770e09
commit
4c258018c3
17 changed files with 348 additions and 248 deletions
|
|
@ -2,4 +2,10 @@ from .fuzzy_join import fuzzy_join_on_postcode
|
|||
from .haversine import haversine_km, haversine_km_expr
|
||||
from .poi_counts import POI_GROUPS, count_pois_within_radius
|
||||
|
||||
__all__ = ["fuzzy_join_on_postcode", "haversine_km", "haversine_km_expr", "POI_GROUPS", "count_pois_within_radius"]
|
||||
__all__ = [
|
||||
"fuzzy_join_on_postcode",
|
||||
"haversine_km",
|
||||
"haversine_km_expr",
|
||||
"POI_GROUPS",
|
||||
"count_pois_within_radius",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -9,14 +9,14 @@ import polars as pl
|
|||
from thefuzz import fuzz
|
||||
from tqdm import tqdm
|
||||
|
||||
_NUMBER_RE = re.compile(r'\d+')
|
||||
_NUMBER_RE = re.compile(r"\d+")
|
||||
|
||||
|
||||
def _normalize(s: pl.Expr) -> pl.Expr:
|
||||
return (
|
||||
s.str.to_uppercase()
|
||||
.str.replace_all(r'[,.\-]', ' ')
|
||||
.str.replace_all(r'\s+', ' ')
|
||||
.str.replace_all(r"[,.\-]", " ")
|
||||
.str.replace_all(r"\s+", " ")
|
||||
.str.strip_chars()
|
||||
)
|
||||
|
||||
|
|
@ -40,22 +40,25 @@ def fuzzy_join_on_postcode(
|
|||
have null right columns.
|
||||
"""
|
||||
|
||||
tmpdir = tempfile.mkdtemp(prefix='fuzzy_join_')
|
||||
left_path = Path(tmpdir) / 'left.parquet'
|
||||
right_path = Path(tmpdir) / 'right.parquet'
|
||||
tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_")
|
||||
left_path = Path(tmpdir) / "left.parquet"
|
||||
right_path = Path(tmpdir) / "right.parquet"
|
||||
|
||||
try:
|
||||
# Materialise each side exactly once, with a row index, to temp parquet.
|
||||
left.with_row_index('_left_idx').sink_parquet(left_path)
|
||||
right.with_row_index('_right_idx').sink_parquet(right_path)
|
||||
left.with_row_index("_left_idx").sink_parquet(left_path)
|
||||
right.with_row_index("_right_idx").sink_parquet(right_path)
|
||||
|
||||
# Collect only the narrow columns needed for matching (projection pushdown).
|
||||
left_match = (
|
||||
pl.scan_parquet(left_path)
|
||||
.select(
|
||||
'_left_idx',
|
||||
_normalize(pl.col(left_address_col)).alias('_left_address'),
|
||||
pl.col(left_postcode_col).str.strip_chars().str.to_uppercase().alias('_left_postcode'),
|
||||
"_left_idx",
|
||||
_normalize(pl.col(left_address_col)).alias("_left_address"),
|
||||
pl.col(left_postcode_col)
|
||||
.str.strip_chars()
|
||||
.str.to_uppercase()
|
||||
.alias("_left_postcode"),
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
|
|
@ -63,18 +66,23 @@ def fuzzy_join_on_postcode(
|
|||
right_match = (
|
||||
pl.scan_parquet(right_path)
|
||||
.select(
|
||||
'_right_idx',
|
||||
_normalize(pl.col(right_address_col)).alias('_right_address'),
|
||||
pl.col(right_postcode_col).str.strip_chars().str.to_uppercase().alias('_right_postcode'),
|
||||
"_right_idx",
|
||||
_normalize(pl.col(right_address_col)).alias("_right_address"),
|
||||
pl.col(right_postcode_col)
|
||||
.str.strip_chars()
|
||||
.str.to_uppercase()
|
||||
.alias("_right_postcode"),
|
||||
)
|
||||
.unique(subset=['_right_address', '_right_postcode'], keep='first')
|
||||
.unique(subset=["_right_address", "_right_postcode"], keep="first")
|
||||
.collect()
|
||||
)
|
||||
|
||||
# Group right side by postcode for fast lookup
|
||||
right_by_postcode: dict[str, list[tuple[int, str]]] = {}
|
||||
for idx, postcode, address in zip(
|
||||
right_match['_right_idx'], right_match['_right_postcode'], right_match['_right_address']
|
||||
right_match["_right_idx"],
|
||||
right_match["_right_postcode"],
|
||||
right_match["_right_address"],
|
||||
):
|
||||
if postcode is not None:
|
||||
right_by_postcode.setdefault(postcode, []).append((idx, address))
|
||||
|
|
@ -82,7 +90,9 @@ def fuzzy_join_on_postcode(
|
|||
# Group left side by postcode
|
||||
left_by_postcode: dict[str, list[tuple[int, str]]] = {}
|
||||
for idx, postcode, address in zip(
|
||||
left_match['_left_idx'], left_match['_left_postcode'], left_match['_left_address']
|
||||
left_match["_left_idx"],
|
||||
left_match["_left_postcode"],
|
||||
left_match["_left_address"],
|
||||
):
|
||||
if address is not None and postcode is not None:
|
||||
left_by_postcode.setdefault(postcode, []).append((idx, address))
|
||||
|
|
@ -103,7 +113,7 @@ def fuzzy_join_on_postcode(
|
|||
for pairs in tqdm(
|
||||
executor.map(_score_bucket, tasks, chunksize=64),
|
||||
total=len(tasks),
|
||||
desc='Fuzzy matching',
|
||||
desc="Fuzzy matching",
|
||||
):
|
||||
all_pairs.extend(pairs)
|
||||
|
||||
|
|
@ -127,24 +137,27 @@ def fuzzy_join_on_postcode(
|
|||
|
||||
# Build a small mapping LazyFrame and join back to the cached parquets.
|
||||
if matches:
|
||||
mapping = pl.LazyFrame({
|
||||
'_left_idx': pl.Series([m[0] for m in matches], dtype=pl.UInt32),
|
||||
'_right_idx': pl.Series([m[1] for m in matches], dtype=pl.UInt32),
|
||||
})
|
||||
mapping = pl.LazyFrame(
|
||||
{
|
||||
"_left_idx": pl.Series([m[0] for m in matches], dtype=pl.UInt32),
|
||||
"_right_idx": pl.Series([m[1] for m in matches], dtype=pl.UInt32),
|
||||
}
|
||||
)
|
||||
else:
|
||||
mapping = pl.LazyFrame({
|
||||
'_left_idx': pl.Series([], dtype=pl.UInt32),
|
||||
'_right_idx': pl.Series([], dtype=pl.UInt32),
|
||||
})
|
||||
mapping = pl.LazyFrame(
|
||||
{
|
||||
"_left_idx": pl.Series([], dtype=pl.UInt32),
|
||||
"_right_idx": pl.Series([], dtype=pl.UInt32),
|
||||
}
|
||||
)
|
||||
|
||||
left_cached = pl.scan_parquet(left_path)
|
||||
right_cached = pl.scan_parquet(right_path)
|
||||
|
||||
return (
|
||||
left_cached
|
||||
.join(mapping, on='_left_idx', how='left')
|
||||
.join(right_cached, on='_right_idx', how='left')
|
||||
.drop('_left_idx', '_right_idx')
|
||||
left_cached.join(mapping, on="_left_idx", how="left")
|
||||
.join(right_cached, on="_right_idx", how="left")
|
||||
.drop("_left_idx", "_right_idx")
|
||||
)
|
||||
except BaseException:
|
||||
shutil.rmtree(tmpdir, ignore_errors=True)
|
||||
|
|
@ -158,7 +171,9 @@ def _numbers_compatible(a: str, b: str) -> bool:
|
|||
"""
|
||||
nums_a = set(_NUMBER_RE.findall(a))
|
||||
nums_b = set(_NUMBER_RE.findall(b))
|
||||
smaller, larger = (nums_a, nums_b) if len(nums_a) <= len(nums_b) else (nums_b, nums_a)
|
||||
smaller, larger = (
|
||||
(nums_a, nums_b) if len(nums_a) <= len(nums_b) else (nums_b, nums_a)
|
||||
)
|
||||
if not smaller and larger:
|
||||
return False
|
||||
return smaller.issubset(larger)
|
||||
|
|
|
|||
|
|
@ -6,7 +6,9 @@ import polars as pl
|
|||
_EARTH_RADIUS_KM = 6371.0
|
||||
|
||||
|
||||
def haversine_km(lat1: np.ndarray, lon1: np.ndarray, lat2: float, lon2: float) -> np.ndarray:
|
||||
def haversine_km(
|
||||
lat1: np.ndarray, lon1: np.ndarray, lat2: float, lon2: float
|
||||
) -> np.ndarray:
|
||||
"""Compute haversine distance in km between arrays (lat1, lon1) and a single point (lat2, lon2)."""
|
||||
lat1_rad = np.radians(lat1)
|
||||
lon1_rad = np.radians(lon1)
|
||||
|
|
@ -14,7 +16,10 @@ def haversine_km(lat1: np.ndarray, lon1: np.ndarray, lat2: float, lon2: float) -
|
|||
lon2_rad = np.radians(lon2)
|
||||
dlat = lat2_rad - lat1_rad
|
||||
dlon = lon2_rad - lon1_rad
|
||||
a = np.sin(dlat / 2) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2
|
||||
a = (
|
||||
np.sin(dlat / 2) ** 2
|
||||
+ np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2
|
||||
)
|
||||
c = 2 * np.arcsin(np.sqrt(a))
|
||||
return _EARTH_RADIUS_KM * c
|
||||
|
||||
|
|
@ -32,5 +37,7 @@ def haversine_km_expr(
|
|||
dlat = pl.lit(dest_lat_rad) - lat_rad
|
||||
dlon = pl.lit(dest_lon_rad) - lon_rad
|
||||
|
||||
a = (dlat / 2).sin() ** 2 + pl.lit(dest_lat_rad).cos() * lat_rad.cos() * (dlon / 2).sin() ** 2
|
||||
a = (dlat / 2).sin() ** 2 + pl.lit(dest_lat_rad).cos() * lat_rad.cos() * (
|
||||
dlon / 2
|
||||
).sin() ** 2
|
||||
return 2 * _EARTH_RADIUS_KM * a.sqrt().arcsin()
|
||||
|
|
|
|||
|
|
@ -70,7 +70,9 @@ def _count_pois_per_postcode(
|
|||
pc_codes = postcodes_df["postcode"].to_list()
|
||||
|
||||
# Initialize result arrays
|
||||
result_counts = {group: np.zeros(n_postcodes, dtype=np.int32) for group in POI_GROUPS}
|
||||
result_counts = {
|
||||
group: np.zeros(n_postcodes, dtype=np.int32) for group in POI_GROUPS
|
||||
}
|
||||
|
||||
# Process in batches with progress
|
||||
batch_size = 50000
|
||||
|
|
@ -83,7 +85,9 @@ def _count_pois_per_postcode(
|
|||
end_idx = min(start_idx + batch_size, n_postcodes)
|
||||
|
||||
if batch_idx % 5 == 0:
|
||||
print(f" Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}")
|
||||
print(
|
||||
f" Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}"
|
||||
)
|
||||
|
||||
# Process batch
|
||||
for i in range(start_idx, end_idx):
|
||||
|
|
@ -109,12 +113,7 @@ def _count_pois_per_postcode(
|
|||
nearby = np.concatenate(nearby_indices)
|
||||
|
||||
# Vectorized distance calculation for all nearby POIs
|
||||
distances = haversine_km(
|
||||
poi_lats[nearby],
|
||||
poi_lngs[nearby],
|
||||
pc_lat,
|
||||
pc_lon
|
||||
)
|
||||
distances = haversine_km(poi_lats[nearby], poi_lngs[nearby], pc_lat, pc_lon)
|
||||
|
||||
# Filter by radius
|
||||
within_mask = distances <= radius_km
|
||||
|
|
@ -147,13 +146,13 @@ def count_pois_within_radius(
|
|||
"""
|
||||
# Get unique postcodes with coordinates
|
||||
print("Deduplicating postcodes...")
|
||||
unique_postcodes = (
|
||||
properties
|
||||
.select(["postcode", "lat", "lon"])
|
||||
.unique(subset=["postcode"])
|
||||
unique_postcodes = properties.select(["postcode", "lat", "lon"]).unique(
|
||||
subset=["postcode"]
|
||||
)
|
||||
|
||||
print(f" {len(properties):,} properties → {len(unique_postcodes):,} unique postcodes")
|
||||
print(
|
||||
f" {len(properties):,} properties → {len(unique_postcodes):,} unique postcodes"
|
||||
)
|
||||
|
||||
# Count POIs per postcode
|
||||
postcode_counts = _count_pois_per_postcode(unique_postcodes, pois, radius_km)
|
||||
|
|
@ -174,11 +173,7 @@ def count_pois_within_radius(
|
|||
result_lazy = (
|
||||
properties.lazy()
|
||||
.select("postcode")
|
||||
.join(
|
||||
pl.scan_parquet(tmp_path),
|
||||
on="postcode",
|
||||
how="left"
|
||||
)
|
||||
.join(pl.scan_parquet(tmp_path), on="postcode", how="left")
|
||||
.select(count_cols)
|
||||
.fill_null(0)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -41,6 +41,6 @@ result = fuzzy_join_on_postcode(
|
|||
|
||||
snapshot = result.select("pp_address", "ADDRESS").sort("pp_address")
|
||||
|
||||
print('Testing the matching between EPC and PP addresses')
|
||||
print("Testing the matching between EPC and PP addresses")
|
||||
with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=80):
|
||||
print(snapshot)
|
||||
|
|
|
|||
|
|
@ -73,29 +73,39 @@ class TestHaversineKmExpr:
|
|||
def test_same_point(self):
|
||||
"""Distance from a point to itself should be zero."""
|
||||
df = pl.DataFrame({"lat": [51.5074], "lon": [-0.1278]})
|
||||
result = df.select(haversine_km_expr("lat", "lon", 51.5074, -0.1278).alias("dist"))
|
||||
result = df.select(
|
||||
haversine_km_expr("lat", "lon", 51.5074, -0.1278).alias("dist")
|
||||
)
|
||||
assert result["dist"][0] == pytest.approx(0.0, abs=1e-10)
|
||||
|
||||
def test_known_distance_london_to_paris(self):
|
||||
"""Test distance from London to Paris (~344 km)."""
|
||||
df = pl.DataFrame({"lat": [51.5074], "lon": [-0.1278]})
|
||||
result = df.select(haversine_km_expr("lat", "lon", 48.8566, 2.3522).alias("dist"))
|
||||
result = df.select(
|
||||
haversine_km_expr("lat", "lon", 48.8566, 2.3522).alias("dist")
|
||||
)
|
||||
assert result["dist"][0] == pytest.approx(344, rel=0.01)
|
||||
|
||||
def test_known_distance_new_york_to_london(self):
|
||||
"""Test distance from New York to London (~5570 km)."""
|
||||
df = pl.DataFrame({"lat": [40.7128], "lon": [-74.0060]})
|
||||
result = df.select(haversine_km_expr("lat", "lon", 51.5074, -0.1278).alias("dist"))
|
||||
result = df.select(
|
||||
haversine_km_expr("lat", "lon", 51.5074, -0.1278).alias("dist")
|
||||
)
|
||||
assert result["dist"][0] == pytest.approx(5570, rel=0.01)
|
||||
|
||||
def test_multiple_points(self):
|
||||
"""Test calculating distances from multiple points to a single destination."""
|
||||
df = pl.DataFrame({
|
||||
"lat": [51.5074, 48.8566, 40.7128], # London, Paris, NYC
|
||||
"lon": [-0.1278, 2.3522, -74.0060],
|
||||
})
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
"lat": [51.5074, 48.8566, 40.7128], # London, Paris, NYC
|
||||
"lon": [-0.1278, 2.3522, -74.0060],
|
||||
}
|
||||
)
|
||||
# Distance to Edinburgh
|
||||
result = df.select(haversine_km_expr("lat", "lon", 55.9533, -3.1883).alias("dist"))
|
||||
result = df.select(
|
||||
haversine_km_expr("lat", "lon", 55.9533, -3.1883).alias("dist")
|
||||
)
|
||||
|
||||
dists = result["dist"].to_numpy()
|
||||
# All distances should be positive
|
||||
|
|
@ -128,7 +138,9 @@ class TestHaversineConsistency:
|
|||
|
||||
# Polars version
|
||||
df = pl.DataFrame({"lat": lats, "lon": lons})
|
||||
polars_result = df.select(haversine_km_expr("lat", "lon", dest_lat, dest_lon).alias("dist"))
|
||||
polars_result = df.select(
|
||||
haversine_km_expr("lat", "lon", dest_lat, dest_lon).alias("dist")
|
||||
)
|
||||
polars_dists = polars_result["dist"].to_numpy()
|
||||
|
||||
# Should be identical (or at least very close due to floating point)
|
||||
|
|
|
|||
|
|
@ -7,28 +7,32 @@ from pipeline.utils.poi_counts import POI_GROUPS, count_pois_within_radius
|
|||
@pytest.fixture
|
||||
def pois():
|
||||
"""POIs clustered around two locations: central London and 10km away."""
|
||||
return pl.DataFrame({
|
||||
"lat": [51.5074, 51.5075, 51.5080, 51.5076, 51.5073, 51.60],
|
||||
"lng": [-0.1278, -0.1280, -0.1275, -0.1279, -0.1277, -0.20],
|
||||
"category": [
|
||||
"Restaurant",
|
||||
"Fast Food",
|
||||
"Supermarket",
|
||||
"Park",
|
||||
"Station",
|
||||
"Restaurant", # too far from any property
|
||||
],
|
||||
})
|
||||
return pl.DataFrame(
|
||||
{
|
||||
"lat": [51.5074, 51.5075, 51.5080, 51.5076, 51.5073, 51.60],
|
||||
"lng": [-0.1278, -0.1280, -0.1275, -0.1279, -0.1277, -0.20],
|
||||
"category": [
|
||||
"Restaurant",
|
||||
"Fast Food",
|
||||
"Supermarket",
|
||||
"Park",
|
||||
"Station",
|
||||
"Restaurant", # too far from any property
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def properties():
|
||||
"""Two properties at the same postcode near central London, one at a distant postcode."""
|
||||
return pl.DataFrame({
|
||||
"postcode": ["EC1A 1BB", "EC1A 1BB", "ZZ99 9ZZ"],
|
||||
"lat": [51.5074, 51.5074, 55.0],
|
||||
"lon": [-0.1278, -0.1278, -3.0],
|
||||
})
|
||||
return pl.DataFrame(
|
||||
{
|
||||
"postcode": ["EC1A 1BB", "EC1A 1BB", "ZZ99 9ZZ"],
|
||||
"lat": [51.5074, 51.5074, 55.0],
|
||||
"lon": [-0.1278, -0.1278, -3.0],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def test_counts_pois_within_radius(properties, pois):
|
||||
|
|
@ -41,9 +45,9 @@ def test_counts_pois_within_radius(properties, pois):
|
|||
assert len(series) == 3, f"{col} has {len(series)} rows, expected 3"
|
||||
|
||||
# First two rows share a postcode near the central London cluster
|
||||
assert result["restaurants_2km"][0] == 2 # Restaurant + Fast Food
|
||||
assert result["groceries_2km"][0] == 1 # Supermarket
|
||||
assert result["parks_2km"][0] == 1 # Park
|
||||
assert result["restaurants_2km"][0] == 2 # Restaurant + Fast Food
|
||||
assert result["groceries_2km"][0] == 1 # Supermarket
|
||||
assert result["parks_2km"][0] == 1 # Park
|
||||
assert result["public_transport_2km"][0] == 1 # Station
|
||||
|
||||
# Second row is the same postcode, so same counts
|
||||
|
|
@ -55,11 +59,13 @@ def test_counts_pois_within_radius(properties, pois):
|
|||
|
||||
|
||||
def test_no_pois_returns_zeros(properties):
|
||||
empty_pois = pl.DataFrame({
|
||||
"lat": pl.Series([], dtype=pl.Float64),
|
||||
"lng": pl.Series([], dtype=pl.Float64),
|
||||
"category": pl.Series([], dtype=pl.String),
|
||||
})
|
||||
empty_pois = pl.DataFrame(
|
||||
{
|
||||
"lat": pl.Series([], dtype=pl.Float64),
|
||||
"lng": pl.Series([], dtype=pl.Float64),
|
||||
"category": pl.Series([], dtype=pl.String),
|
||||
}
|
||||
)
|
||||
result = count_pois_within_radius(properties, empty_pois, radius_km=2.0)
|
||||
|
||||
for group in POI_GROUPS:
|
||||
|
|
@ -70,11 +76,13 @@ def test_no_pois_returns_zeros(properties):
|
|||
|
||||
def test_custom_radius(pois):
|
||||
"""A tiny radius should exclude POIs that are even slightly away."""
|
||||
properties = pl.DataFrame({
|
||||
"postcode": ["EC1A 1BB"],
|
||||
"lat": [51.5074],
|
||||
"lon": [-0.1278],
|
||||
})
|
||||
properties = pl.DataFrame(
|
||||
{
|
||||
"postcode": ["EC1A 1BB"],
|
||||
"lat": [51.5074],
|
||||
"lon": [-0.1278],
|
||||
}
|
||||
)
|
||||
|
||||
# 0.01 km = 10m — only the POI at the exact same location should match
|
||||
result = count_pois_within_radius(properties, pois, radius_km=0.01)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue