Format python

2026-01-31 13:07:09 +00:00 · 2026-01-31 13:07:09 +00:00 · 4c258018c3
commit 4c258018c3
parent 85f5770e09
17 changed files with 348 additions and 248 deletions
--- a/pipeline/utils/__init__.py
+++ b/pipeline/utils/__init__.py
@@ -2,4 +2,10 @@ from .fuzzy_join import fuzzy_join_on_postcode
 from .haversine import haversine_km, haversine_km_expr
 from .poi_counts import POI_GROUPS, count_pois_within_radius

-__all__ = ["fuzzy_join_on_postcode", "haversine_km", "haversine_km_expr", "POI_GROUPS", "count_pois_within_radius"]
+__all__ = [
+    "fuzzy_join_on_postcode",
+    "haversine_km",
+    "haversine_km_expr",
+    "POI_GROUPS",
+    "count_pois_within_radius",
+]
--- a/pipeline/utils/fuzzy_join.py
+++ b/pipeline/utils/fuzzy_join.py
@@ -9,14 +9,14 @@ import polars as pl
 from thefuzz import fuzz
 from tqdm import tqdm

-_NUMBER_RE = re.compile(r'\d+')
+_NUMBER_RE = re.compile(r"\d+")


 def _normalize(s: pl.Expr) -> pl.Expr:
    return (
        s.str.to_uppercase()
-        .str.replace_all(r'[,.\-]', ' ')
-        .str.replace_all(r'\s+', ' ')
+        .str.replace_all(r"[,.\-]", " ")
+        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
    )

@@ -40,22 +40,25 @@ def fuzzy_join_on_postcode(
    have null right columns.
    """

-    tmpdir = tempfile.mkdtemp(prefix='fuzzy_join_')
-    left_path = Path(tmpdir) / 'left.parquet'
-    right_path = Path(tmpdir) / 'right.parquet'
+    tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_")
+    left_path = Path(tmpdir) / "left.parquet"
+    right_path = Path(tmpdir) / "right.parquet"

    try:
        # Materialise each side exactly once, with a row index, to temp parquet.
-        left.with_row_index('_left_idx').sink_parquet(left_path)
-        right.with_row_index('_right_idx').sink_parquet(right_path)
+        left.with_row_index("_left_idx").sink_parquet(left_path)
+        right.with_row_index("_right_idx").sink_parquet(right_path)

        # Collect only the narrow columns needed for matching (projection pushdown).
        left_match = (
            pl.scan_parquet(left_path)
            .select(
-                '_left_idx',
-                _normalize(pl.col(left_address_col)).alias('_left_address'),
-                pl.col(left_postcode_col).str.strip_chars().str.to_uppercase().alias('_left_postcode'),
+                "_left_idx",
+                _normalize(pl.col(left_address_col)).alias("_left_address"),
+                pl.col(left_postcode_col)
+                .str.strip_chars()
+                .str.to_uppercase()
+                .alias("_left_postcode"),
            )
            .collect()
        )
@@ -63,18 +66,23 @@ def fuzzy_join_on_postcode(
        right_match = (
            pl.scan_parquet(right_path)
            .select(
-                '_right_idx',
-                _normalize(pl.col(right_address_col)).alias('_right_address'),
-                pl.col(right_postcode_col).str.strip_chars().str.to_uppercase().alias('_right_postcode'),
+                "_right_idx",
+                _normalize(pl.col(right_address_col)).alias("_right_address"),
+                pl.col(right_postcode_col)
+                .str.strip_chars()
+                .str.to_uppercase()
+                .alias("_right_postcode"),
            )
-            .unique(subset=['_right_address', '_right_postcode'], keep='first')
+            .unique(subset=["_right_address", "_right_postcode"], keep="first")
            .collect()
        )

        # Group right side by postcode for fast lookup
        right_by_postcode: dict[str, list[tuple[int, str]]] = {}
        for idx, postcode, address in zip(
-            right_match['_right_idx'], right_match['_right_postcode'], right_match['_right_address']
+            right_match["_right_idx"],
+            right_match["_right_postcode"],
+            right_match["_right_address"],
        ):
            if postcode is not None:
                right_by_postcode.setdefault(postcode, []).append((idx, address))
@@ -82,7 +90,9 @@ def fuzzy_join_on_postcode(
        # Group left side by postcode
        left_by_postcode: dict[str, list[tuple[int, str]]] = {}
        for idx, postcode, address in zip(
-            left_match['_left_idx'], left_match['_left_postcode'], left_match['_left_address']
+            left_match["_left_idx"],
+            left_match["_left_postcode"],
+            left_match["_left_address"],
        ):
            if address is not None and postcode is not None:
                left_by_postcode.setdefault(postcode, []).append((idx, address))
@@ -103,7 +113,7 @@ def fuzzy_join_on_postcode(
            for pairs in tqdm(
                executor.map(_score_bucket, tasks, chunksize=64),
                total=len(tasks),
-                desc='Fuzzy matching',
+                desc="Fuzzy matching",
            ):
                all_pairs.extend(pairs)

@@ -127,24 +137,27 @@ def fuzzy_join_on_postcode(

        # Build a small mapping LazyFrame and join back to the cached parquets.
        if matches:
-            mapping = pl.LazyFrame({
-                '_left_idx': pl.Series([m[0] for m in matches], dtype=pl.UInt32),
-                '_right_idx': pl.Series([m[1] for m in matches], dtype=pl.UInt32),
-            })
+            mapping = pl.LazyFrame(
+                {
+                    "_left_idx": pl.Series([m[0] for m in matches], dtype=pl.UInt32),
+                    "_right_idx": pl.Series([m[1] for m in matches], dtype=pl.UInt32),
+                }
+            )
        else:
-            mapping = pl.LazyFrame({
-                '_left_idx': pl.Series([], dtype=pl.UInt32),
-                '_right_idx': pl.Series([], dtype=pl.UInt32),
-            })
+            mapping = pl.LazyFrame(
+                {
+                    "_left_idx": pl.Series([], dtype=pl.UInt32),
+                    "_right_idx": pl.Series([], dtype=pl.UInt32),
+                }
+            )

        left_cached = pl.scan_parquet(left_path)
        right_cached = pl.scan_parquet(right_path)

        return (
-            left_cached
-            .join(mapping, on='_left_idx', how='left')
-            .join(right_cached, on='_right_idx', how='left')
-            .drop('_left_idx', '_right_idx')
+            left_cached.join(mapping, on="_left_idx", how="left")
+            .join(right_cached, on="_right_idx", how="left")
+            .drop("_left_idx", "_right_idx")
        )
    except BaseException:
        shutil.rmtree(tmpdir, ignore_errors=True)
@@ -158,7 +171,9 @@ def _numbers_compatible(a: str, b: str) -> bool:
    """
    nums_a = set(_NUMBER_RE.findall(a))
    nums_b = set(_NUMBER_RE.findall(b))
-    smaller, larger = (nums_a, nums_b) if len(nums_a) <= len(nums_b) else (nums_b, nums_a)
+    smaller, larger = (
+        (nums_a, nums_b) if len(nums_a) <= len(nums_b) else (nums_b, nums_a)
+    )
    if not smaller and larger:
        return False
    return smaller.issubset(larger)
--- a/pipeline/utils/haversine.py
+++ b/pipeline/utils/haversine.py
@@ -6,7 +6,9 @@ import polars as pl
 _EARTH_RADIUS_KM = 6371.0


-def haversine_km(lat1: np.ndarray, lon1: np.ndarray, lat2: float, lon2: float) -> np.ndarray:
+def haversine_km(
+    lat1: np.ndarray, lon1: np.ndarray, lat2: float, lon2: float
+) -> np.ndarray:
    """Compute haversine distance in km between arrays (lat1, lon1) and a single point (lat2, lon2)."""
    lat1_rad = np.radians(lat1)
    lon1_rad = np.radians(lon1)
@@ -14,7 +16,10 @@ def haversine_km(lat1: np.ndarray, lon1: np.ndarray, lat2: float, lon2: float) -
    lon2_rad = np.radians(lon2)
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
-    a = np.sin(dlat / 2) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2
+    a = (
+        np.sin(dlat / 2) ** 2
+        + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2
+    )
    c = 2 * np.arcsin(np.sqrt(a))
    return _EARTH_RADIUS_KM * c

@@ -32,5 +37,7 @@ def haversine_km_expr(
    dlat = pl.lit(dest_lat_rad) - lat_rad
    dlon = pl.lit(dest_lon_rad) - lon_rad

-    a = (dlat / 2).sin() ** 2 + pl.lit(dest_lat_rad).cos() * lat_rad.cos() * (dlon / 2).sin() ** 2
+    a = (dlat / 2).sin() ** 2 + pl.lit(dest_lat_rad).cos() * lat_rad.cos() * (
+        dlon / 2
+    ).sin() ** 2
    return 2 * _EARTH_RADIUS_KM * a.sqrt().arcsin()
--- a/pipeline/utils/poi_counts.py
+++ b/pipeline/utils/poi_counts.py
@@ -70,7 +70,9 @@ def _count_pois_per_postcode(
    pc_codes = postcodes_df["postcode"].to_list()

    # Initialize result arrays
-    result_counts = {group: np.zeros(n_postcodes, dtype=np.int32) for group in POI_GROUPS}
+    result_counts = {
+        group: np.zeros(n_postcodes, dtype=np.int32) for group in POI_GROUPS
+    }

    # Process in batches with progress
    batch_size = 50000
@@ -83,7 +85,9 @@ def _count_pois_per_postcode(
        end_idx = min(start_idx + batch_size, n_postcodes)

        if batch_idx % 5 == 0:
-            print(f"  Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}")
+            print(
+                f"  Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}"
+            )

        # Process batch
        for i in range(start_idx, end_idx):
@@ -109,12 +113,7 @@ def _count_pois_per_postcode(
            nearby = np.concatenate(nearby_indices)

            # Vectorized distance calculation for all nearby POIs
-            distances = haversine_km(
-                poi_lats[nearby],
-                poi_lngs[nearby],
-                pc_lat,
-                pc_lon
-            )
+            distances = haversine_km(poi_lats[nearby], poi_lngs[nearby], pc_lat, pc_lon)

            # Filter by radius
            within_mask = distances <= radius_km
@@ -147,13 +146,13 @@ def count_pois_within_radius(
    """
    # Get unique postcodes with coordinates
    print("Deduplicating postcodes...")
-    unique_postcodes = (
-        properties
-        .select(["postcode", "lat", "lon"])
-        .unique(subset=["postcode"])
+    unique_postcodes = properties.select(["postcode", "lat", "lon"]).unique(
+        subset=["postcode"]
    )

-    print(f"  {len(properties):,} properties → {len(unique_postcodes):,} unique postcodes")
+    print(
+        f"  {len(properties):,} properties → {len(unique_postcodes):,} unique postcodes"
+    )

    # Count POIs per postcode
    postcode_counts = _count_pois_per_postcode(unique_postcodes, pois, radius_km)
@@ -174,11 +173,7 @@ def count_pois_within_radius(
    result_lazy = (
        properties.lazy()
        .select("postcode")
-        .join(
-            pl.scan_parquet(tmp_path),
-            on="postcode",
-            how="left"
-        )
+        .join(pl.scan_parquet(tmp_path), on="postcode", how="left")
        .select(count_cols)
        .fill_null(0)
    )
--- a/pipeline/utils/test_fuzzy_join.py
+++ b/pipeline/utils/test_fuzzy_join.py
@@ -41,6 +41,6 @@ result = fuzzy_join_on_postcode(

 snapshot = result.select("pp_address", "ADDRESS").sort("pp_address")

-print('Testing the matching between EPC and PP addresses')
+print("Testing the matching between EPC and PP addresses")
 with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=80):
    print(snapshot)
--- a/pipeline/utils/test_haversine.py
+++ b/pipeline/utils/test_haversine.py
@@ -73,29 +73,39 @@ class TestHaversineKmExpr:
    def test_same_point(self):
        """Distance from a point to itself should be zero."""
        df = pl.DataFrame({"lat": [51.5074], "lon": [-0.1278]})
-        result = df.select(haversine_km_expr("lat", "lon", 51.5074, -0.1278).alias("dist"))
+        result = df.select(
+            haversine_km_expr("lat", "lon", 51.5074, -0.1278).alias("dist")
+        )
        assert result["dist"][0] == pytest.approx(0.0, abs=1e-10)

    def test_known_distance_london_to_paris(self):
        """Test distance from London to Paris (~344 km)."""
        df = pl.DataFrame({"lat": [51.5074], "lon": [-0.1278]})
-        result = df.select(haversine_km_expr("lat", "lon", 48.8566, 2.3522).alias("dist"))
+        result = df.select(
+            haversine_km_expr("lat", "lon", 48.8566, 2.3522).alias("dist")
+        )
        assert result["dist"][0] == pytest.approx(344, rel=0.01)

    def test_known_distance_new_york_to_london(self):
        """Test distance from New York to London (~5570 km)."""
        df = pl.DataFrame({"lat": [40.7128], "lon": [-74.0060]})
-        result = df.select(haversine_km_expr("lat", "lon", 51.5074, -0.1278).alias("dist"))
+        result = df.select(
+            haversine_km_expr("lat", "lon", 51.5074, -0.1278).alias("dist")
+        )
        assert result["dist"][0] == pytest.approx(5570, rel=0.01)

    def test_multiple_points(self):
        """Test calculating distances from multiple points to a single destination."""
-        df = pl.DataFrame({
-            "lat": [51.5074, 48.8566, 40.7128],  # London, Paris, NYC
-            "lon": [-0.1278, 2.3522, -74.0060],
-        })
+        df = pl.DataFrame(
+            {
+                "lat": [51.5074, 48.8566, 40.7128],  # London, Paris, NYC
+                "lon": [-0.1278, 2.3522, -74.0060],
+            }
+        )
        # Distance to Edinburgh
-        result = df.select(haversine_km_expr("lat", "lon", 55.9533, -3.1883).alias("dist"))
+        result = df.select(
+            haversine_km_expr("lat", "lon", 55.9533, -3.1883).alias("dist")
+        )

        dists = result["dist"].to_numpy()
        # All distances should be positive
@@ -128,7 +138,9 @@ class TestHaversineConsistency:

        # Polars version
        df = pl.DataFrame({"lat": lats, "lon": lons})
-        polars_result = df.select(haversine_km_expr("lat", "lon", dest_lat, dest_lon).alias("dist"))
+        polars_result = df.select(
+            haversine_km_expr("lat", "lon", dest_lat, dest_lon).alias("dist")
+        )
        polars_dists = polars_result["dist"].to_numpy()

        # Should be identical (or at least very close due to floating point)
--- a/pipeline/utils/test_poi_counts.py
+++ b/pipeline/utils/test_poi_counts.py
@@ -7,28 +7,32 @@ from pipeline.utils.poi_counts import POI_GROUPS, count_pois_within_radius
 @pytest.fixture
 def pois():
    """POIs clustered around two locations: central London and 10km away."""
-    return pl.DataFrame({
-        "lat": [51.5074, 51.5075, 51.5080, 51.5076, 51.5073, 51.60],
-        "lng": [-0.1278, -0.1280, -0.1275, -0.1279, -0.1277, -0.20],
-        "category": [
-            "Restaurant",
-            "Fast Food",
-            "Supermarket",
-            "Park",
-            "Station",
-            "Restaurant",   # too far from any property
-        ],
-    })
+    return pl.DataFrame(
+        {
+            "lat": [51.5074, 51.5075, 51.5080, 51.5076, 51.5073, 51.60],
+            "lng": [-0.1278, -0.1280, -0.1275, -0.1279, -0.1277, -0.20],
+            "category": [
+                "Restaurant",
+                "Fast Food",
+                "Supermarket",
+                "Park",
+                "Station",
+                "Restaurant",  # too far from any property
+            ],
+        }
+    )


 @pytest.fixture
 def properties():
    """Two properties at the same postcode near central London, one at a distant postcode."""
-    return pl.DataFrame({
-        "postcode": ["EC1A 1BB", "EC1A 1BB", "ZZ99 9ZZ"],
-        "lat": [51.5074, 51.5074, 55.0],
-        "lon": [-0.1278, -0.1278, -3.0],
-    })
+    return pl.DataFrame(
+        {
+            "postcode": ["EC1A 1BB", "EC1A 1BB", "ZZ99 9ZZ"],
+            "lat": [51.5074, 51.5074, 55.0],
+            "lon": [-0.1278, -0.1278, -3.0],
+        }
+    )


 def test_counts_pois_within_radius(properties, pois):
@@ -41,9 +45,9 @@ def test_counts_pois_within_radius(properties, pois):
        assert len(series) == 3, f"{col} has {len(series)} rows, expected 3"

    # First two rows share a postcode near the central London cluster
-    assert result["restaurants_2km"][0] == 2   # Restaurant + Fast Food
-    assert result["groceries_2km"][0] == 1     # Supermarket
-    assert result["parks_2km"][0] == 1         # Park
+    assert result["restaurants_2km"][0] == 2  # Restaurant + Fast Food
+    assert result["groceries_2km"][0] == 1  # Supermarket
+    assert result["parks_2km"][0] == 1  # Park
    assert result["public_transport_2km"][0] == 1  # Station

    # Second row is the same postcode, so same counts
@@ -55,11 +59,13 @@ def test_counts_pois_within_radius(properties, pois):


 def test_no_pois_returns_zeros(properties):
-    empty_pois = pl.DataFrame({
-        "lat": pl.Series([], dtype=pl.Float64),
-        "lng": pl.Series([], dtype=pl.Float64),
-        "category": pl.Series([], dtype=pl.String),
-    })
+    empty_pois = pl.DataFrame(
+        {
+            "lat": pl.Series([], dtype=pl.Float64),
+            "lng": pl.Series([], dtype=pl.Float64),
+            "category": pl.Series([], dtype=pl.String),
+        }
+    )
    result = count_pois_within_radius(properties, empty_pois, radius_km=2.0)

    for group in POI_GROUPS:
@@ -70,11 +76,13 @@ def test_no_pois_returns_zeros(properties):

 def test_custom_radius(pois):
    """A tiny radius should exclude POIs that are even slightly away."""
-    properties = pl.DataFrame({
-        "postcode": ["EC1A 1BB"],
-        "lat": [51.5074],
-        "lon": [-0.1278],
-    })
+    properties = pl.DataFrame(
+        {
+            "postcode": ["EC1A 1BB"],
+            "lat": [51.5074],
+            "lon": [-0.1278],
+        }
+    )

    # 0.01 km = 10m — only the POI at the exact same location should match
    result = count_pois_within_radius(properties, pois, radius_km=0.01)