Move transform logic around

2026-01-31 12:48:29 +00:00 · 2026-01-31 12:48:29 +00:00 · 38b0cf1ea1
commit 38b0cf1ea1
parent e1b38a1b95
14 changed files with 1073 additions and 336 deletions
--- a/Taskfile.yml
+++ b/Taskfile.yml
@ -1,38 +1,136 @@
 version: '3'

+vars:
+  DATA_DIR: data
+  ARCGIS_OUTPUT: "{{.DATA_DIR}}/arcgis_data.parquet"
+  PRICE_PAID_OUTPUT: "{{.DATA_DIR}}/price-paid-complete.parquet"
+  IOD_OUTPUT: "{{.DATA_DIR}}/IoD2025_Scores.parquet"
+  POIS_RAW_OUTPUT: "{{.DATA_DIR}}/uk_pois.parquet"
+  POIS_FILTERED_OUTPUT: "{{.DATA_DIR}}/filtered_uk_pois.parquet"
+  POI_PROXIMITY_OUTPUT: "{{.DATA_DIR}}/poi_proximity.parquet"
+  EPC_PP_OUTPUT: "{{.DATA_DIR}}/epc_pp.parquet"
+  WIDE_OUTPUT: "{{.DATA_DIR}}/wide.parquet"
+  EPC_CSV: "{{.DATA_DIR}}/epc/certificates.csv"
+  JOURNEY_TIMES: "{{.DATA_DIR}}/journey_times_bank_checkpoint.parquet"
+
 tasks:
  install:
-    desc: Install dependencies, generate client, and download data
+    desc: Install dependencies
    cmds:
      - uv sync
      - cd frontend && npm install

-  download:
-    desc: Download data
-    deps:
-      - install
+  download:arcgis:
+    desc: Download and convert ArcGIS postcode data
+    sources:
+      - pipeline/download/arcgis.py
+    generates:
+      - "{{.ARCGIS_OUTPUT}}"
    cmds:
-      - uv run -m pipeline.download.arcgis
-      - uv run -m pipeline.download.pois
-      - uv run -m pipeline.download.deprivation_data
-      - uv run -m pipeline.download.price_paid
+      - uv run python -m pipeline.download.arcgis --output {{.ARCGIS_OUTPUT}}

-  pipeline:
-    desc: Run data processing pipeline
-    deps:
-      - download
+  download:price-paid:
+    desc: Download and convert Land Registry price-paid data
+    sources:
+      - pipeline/download/price_paid.py
+    generates:
+      - "{{.PRICE_PAID_OUTPUT}}"
    cmds:
-      - uv run python -m pipeline.run
+      - uv run python -m pipeline.download.price_paid --output {{.PRICE_PAID_OUTPUT}}
+
+  download:deprivation:
+    desc: Download and convert Index of Deprivation data
+    sources:
+      - pipeline/download/deprivation_data.py
+    generates:
+      - "{{.IOD_OUTPUT}}"
+    cmds:
+      - uv run python -m pipeline.download.deprivation_data --output {{.IOD_OUTPUT}}
+
+  download:pois:
+    desc: Download and extract POIs from OpenStreetMap
+    sources:
+      - pipeline/download/pois.py
+    generates:
+      - "{{.POIS_RAW_OUTPUT}}"
+    cmds:
+      - uv run python -m pipeline.download.pois --output {{.POIS_RAW_OUTPUT}}
+
+  transform:pois:
+    desc: Transform raw POIs to filtered version with friendly names
+    deps:
+      - download:pois
+    sources:
+      - pipeline/transform/transform_poi.py
+      - "{{.POIS_RAW_OUTPUT}}"
+    generates:
+      - "{{.POIS_FILTERED_OUTPUT}}"
+    cmds:
+      - uv run python -m pipeline.transform.transform_poi --input {{.POIS_RAW_OUTPUT}} --output {{.POIS_FILTERED_OUTPUT}}
+
+  transform:epc-pp:
+    desc: Fuzzy join EPC and Price Paid data
+    deps:
+      - download:price-paid
+    sources:
+      - pipeline/transform/join_epc_pp.py
+      - pipeline/utils/fuzzy_join.py
+      - "{{.PRICE_PAID_OUTPUT}}"
+      - "{{.EPC_CSV}}"
+    generates:
+      - "{{.EPC_PP_OUTPUT}}"
+    cmds:
+      - uv run python -m pipeline.transform.join_epc_pp --epc {{.EPC_CSV}} --price-paid {{.PRICE_PAID_OUTPUT}} --output {{.EPC_PP_OUTPUT}}
+
+  transform:poi-proximity:
+    desc: Compute POI proximity counts per postcode
+    deps:
+      - download:arcgis
+      - transform:pois
+    sources:
+      - pipeline/transform/poi_proximity.py
+      - pipeline/utils/poi_counts.py
+      - "{{.ARCGIS_OUTPUT}}"
+      - "{{.POIS_FILTERED_OUTPUT}}"
+    generates:
+      - "{{.POI_PROXIMITY_OUTPUT}}"
+    cmds:
+      - uv run python -m pipeline.transform.poi_proximity --arcgis {{.ARCGIS_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}} --output {{.POI_PROXIMITY_OUTPUT}}
+
+  transform:wide:
+    desc: Build wide property dataframe with all joins
+    # Each dep must name a task defined in this Taskfile; the EPC/price-paid
+    # join is registered as transform:epc-pp.
+    deps:
+      - transform:epc-pp
+      - download:arcgis
+      - download:deprivation
+      - transform:poi-proximity
+    sources:
+      - pipeline/transform/merge.py
+      - "{{.EPC_PP_OUTPUT}}"
+      - "{{.ARCGIS_OUTPUT}}"
+      - "{{.IOD_OUTPUT}}"
+      - "{{.POI_PROXIMITY_OUTPUT}}"
+    generates:
+      - "{{.WIDE_OUTPUT}}"
+    cmds:
+      - uv run python -m pipeline.transform.merge --epc-pp {{.EPC_PP_OUTPUT}} --arcgis {{.ARCGIS_OUTPUT}} --iod {{.IOD_OUTPUT}} --poi-proximity {{.POI_PROXIMITY_OUTPUT}} --journey-times {{.JOURNEY_TIMES}} --output {{.WIDE_OUTPUT}}

  prepare:
    desc: Prepare the application (install, download data, run pipeline)
    deps:
-      - pipeline
+      - transform:wide
+
+  test:
+    # A desc keeps this task consistent with the others and visible in `task --list`.
+    desc: Run Python unit tests
+    cmds:
+      - uv run -m pipeline.utils.test_fuzzy_join
+      - uv run pytest pipeline/utils/test_haversine.py
+      - uv run pytest pipeline/utils/test_poi_counts.py

  server:
-    desc: Run FastAPI backend on port 8001
+    desc: Run Rust backend on port 8001
+    dir: server-rs
    cmds:
-      - uv run fastapi dev server/main.py --port 8001
+      # WIDE_OUTPUT is relative to the Taskfile root, but this command runs
+      # inside server-rs/, so step back up one level to reach the data file.
+      - cargo run --release -- ../{{.WIDE_OUTPUT}}

  frontend:
    desc: Run frontend dev server on port 3030 (proxies /api to :8001)
@ -46,16 +144,13 @@ tasks:
    cmds:
      - npm run build

-  prod:
-    desc: Run production server (serves built frontend)
-    cmds:
-      - uv run fastapi run server/main.py --port 8001

  lint:
-    desc: Lint all code (Python and TypeScript)
+    desc: Lint all code (Python, TypeScript, and Rust)
    cmds:
      - task: lint:python
      - task: lint:frontend
+      - task: lint:rust

  lint:python:
    desc: Lint Python code with ruff
@ -69,11 +164,19 @@ tasks:
      - npm run lint
      - npm run format:check

+  lint:rust:
+    desc: Lint Rust code with clippy and check formatting
+    dir: server-rs
+    cmds:
+      - cargo clippy -- -D warnings
+      - cargo fmt --check
+
  format:
-    desc: Format all code (Python and TypeScript)
+    desc: Format all code (Python, TypeScript, and Rust)
    cmds:
      - task: format:python
      - task: format:frontend
+      - task: format:rust

  format:python:
    desc: Format Python code with ruff
@ -88,6 +191,12 @@ tasks:
      - npm run lint:fix
      - npm run format

+  format:rust:
+    desc: Format Rust code with cargo fmt
+    dir: server-rs
+    cmds:
+      - cargo fmt
+
  check:
    desc: Run all checks (lint, typecheck, build)
    cmds:
--- a/pipeline/base.py
+++ b/pipeline/base.py
@ -1,22 +0,0 @@
-from abc import ABC, abstractmethod
-import polars as pl
-
-
-class DataSource(ABC):
-    """Base class for all data sources."""
-
-    @property
-    @abstractmethod
-    def name(self) -> str:
-        """Unique identifier for this data source."""
-        pass
-
-    @abstractmethod
-    def load(self) -> pl.LazyFrame:
-        """Load raw data as LazyFrame."""
-        pass
-
-    @abstractmethod
-    def process(self, postcodes: pl.LazyFrame) -> pl.LazyFrame:
-        """Process and join with postcode coordinates."""
-        pass
--- a/pipeline/config.py
+++ b/pipeline/config.py
@ -1,12 +0,0 @@
-"""Shared configuration for the pipeline and server."""
-
-from pathlib import Path
-
-DATA_DIR = Path(__file__).parent.parent / "data_sources"
-PROCESSED_DIR = DATA_DIR / "processed"
-AGGREGATES_DIR = PROCESSED_DIR / "aggregates"
-
-# H3 resolutions to generate and serve
-# https://h3geo.org/docs/core-library/restable/#average-area-in-m2
-H3_RESOLUTIONS = [7, 8, 9, 10, 11]
-DEFAULT_H3_RESOLUTION = 8
--- a/pipeline/download/pois/main.py
+++ b/pipeline/download/pois/main.py
@ -8,17 +8,35 @@ import osmium
 import polars as pl
 from tqdm import tqdm

-from .config import (
-    BATCH_SIZE,
-    GEOFABRIK_GB_URL,
-    MIN_OCCURENCE_COUNT,
-    POI_TAG_KEYS,
-    UK_BBOX_EAST,
-    UK_BBOX_NORTH,
-    UK_BBOX_SOUTH,
-    UK_BBOX_WEST,
+from pathlib import Path
+
+
+BATCH_SIZE = 50_000
+
+MIN_OCCURENCE_COUNT = 20
+
+GEOFABRIK_GB_URL = (
+    "https://download.geofabrik.de/europe/great-britain-latest.osm.pbf"
 )

+UK_BBOX_WEST = -7.57
+UK_BBOX_SOUTH = 49.96
+UK_BBOX_EAST = 1.68
+UK_BBOX_NORTH = 58.64
+
+POI_TAG_KEYS: list[str] = [
+    "amenity",
+    "building",
+    "craft",
+    "emergency",
+    "healthcare",
+    "leisure",
+    "office",
+    "shop",
+    "tourism",
+    "public_transport",
+]
+


 def download_pbf(pbf_file: Path) -> None:
@ -144,10 +162,9 @@ def main() -> None:
        )
        df = df.join(valid_categories.select("category"), on="category", how="semi")

-        args.output.parent.mkdir(parents=True, exist_ok=True)
+        print(f"Total POIs: {handler.poi_count:,}")
        df.sink_parquet(args.output)
        print(f"Saved to {args.output}")
-        print(f"Total POIs: {handler.poi_count:,}")


 if __name__ == "__main__":
--- a/pipeline/download/pois/init.py
+++ b/pipeline/download/pois/init.py
--- a/pipeline/download/pois/config.py
+++ b/pipeline/download/pois/config.py
@ -1,32 +0,0 @@
-from pathlib import Path
-
-DATA_DIR = Path("./data_sources")
-GB_PBF_FILE = DATA_DIR / "great-britain-latest.osm.pbf"
-OUTPUT_FILE = DATA_DIR / "uk_pois.parquet"
-
-
-BATCH_SIZE = 50_000
-
-MIN_OCCURENCE_COUNT = 20
-
-GEOFABRIK_GB_URL = (
-    "https://download.geofabrik.de/europe/great-britain-latest.osm.pbf"
-)
-
-UK_BBOX_WEST = -7.57
-UK_BBOX_SOUTH = 49.96
-UK_BBOX_EAST = 1.68
-UK_BBOX_NORTH = 58.64
-
-POI_TAG_KEYS: list[str] = [
-    "amenity",
-    "building",
-    "craft",
-    "emergency",
-    "healthcare",
-    "leisure",
-    "office",
-    "shop",
-    "tourism",
-    "public_transport",
-]
--- a/pipeline/epc_pp.py
+++ b/pipeline/epc_pp.py
@ -1,85 +0,0 @@
-import polars as pl
-from .fuzzy_join import fuzzy_join_on_postcode
-
-
-pl.Config.set_tbl_cols(-1)
-
-
-
-epc = pl.scan_csv('data_sources/epc/certificates.csv').select(
-    pl.col('ADDRESS').alias('epc_address'),
-    'POSTCODE',
-    'CURRENT_ENERGY_RATING',
-    'POTENTIAL_ENERGY_RATING',
-    pl.col('PROPERTY_TYPE').alias('epc_property_type'),
-    'BUILT_FORM',
-    'INSPECTION_DATE',
-    'TOTAL_FLOOR_AREA',
-    'NUMBER_HABITABLE_ROOMS',
-    'FLOOR_HEIGHT',
-    'CONSTRUCTION_AGE_BAND'
-).filter(pl.col('epc_address').is_not_null()).sort('INSPECTION_DATE', descending=True).group_by('epc_address', 'POSTCODE').first()
-
-
-print("EPC dataset")
-print(epc.head().collect())
-
-# https://www.gov.uk/guidance/about-the-price-paid-data
-property_type_map = {"D": "Detached", "S": "Semi-Detached", "T": "Terraced", "F": "Flats/Maisonettes", "O": "Other"}
-duration_map = {"F": "Freehold", "L": "Leasehold"}
-
-price_paid = (pl.scan_parquet('data_sources/pp-complete.parquet').select(
-    "price",
-    "date_of_transfer",
-    pl.col('property_type').alias("pp_property_type").replace(property_type_map),
-    "postcode",
-    'paon',
-    'saon',
-    'street',
-    'locality',
-    'town_city',
-    pl.col('duration').replace(duration_map)
-)
-.filter(pl.col('pp_property_type') != 'Other').with_columns(
-        pl.concat_str(
-            [pl.col('saon'), pl.col('paon'), pl.col('street')],
-            separator=' ',
-            ignore_nulls=True,
-        ).alias('pp_address'),
-    )
-    .sort('date_of_transfer')
-    .group_by('pp_address', 'postcode', maintain_order=True)
-    .agg(
-        pl.struct(
-            pl.col('date_of_transfer').dt.year().alias('year'),
-            'price',
-        ).alias('historical_prices'),
-        pl.col('pp_property_type').last(),
-        pl.col('duration').last(),
-        pl.col('price').last().alias('latest_price'),
-        pl.col('date_of_transfer').last(),
-    )   
-).filter(pl.col('pp_address').is_not_null())
-
-print("Price paid dataset")
-print(price_paid.head().collect())
-
-joined = fuzzy_join_on_postcode(
-    left=price_paid,
-    right=epc,
-    left_address_col='pp_address',
-    right_address_col='epc_address',
-    left_postcode_col='postcode',
-    right_postcode_col='POSTCODE',
-).drop('POSTCODE').collect()
-
-matched = joined.filter(pl.col('epc_address').is_not_null() & pl.col('pp_address').is_not_null())
-total = joined.height
-print(f"Unique properties: {total}")
-print(f"Matched: {matched.height} ({100 * matched.height / total:.1f}%)")
-print(f"Unmatched: {total - matched.height}")
-
-matched = matched.rename({col: col.lower() for col in joined.columns})
-
-print(matched.head())
-matched.write_parquet('data_sources/processed/epc_pp.parquet')
--- a/pipeline/run.py
+++ b/pipeline/run.py
@ -1,6 +0,0 @@
-"""Pipeline CLI to process property data with H3 spatial indexing."""
-
-from pipeline.wide import run
-
-if __name__ == "__main__":
-    run()
--- a/pipeline/transform/join_epc_pp.py
+++ b/pipeline/transform/join_epc_pp.py
@ -0,0 +1,98 @@
+import argparse
+import polars as pl
+from pathlib import Path
+from ..utils import fuzzy_join_on_postcode
+
+
+pl.Config.set_tbl_cols(-1)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Fuzzy join EPC and Price Paid data")
+    parser.add_argument("--epc", type=Path, required=True, help="EPC certificates CSV file")
+    parser.add_argument("--price-paid", type=Path, required=True, help="Price paid parquet file")
+    parser.add_argument("--output", type=Path, required=True, help="Output parquet file path")
+    args = parser.parse_args()
+
+    epc = pl.scan_csv(args.epc).select(
+        pl.col('ADDRESS').alias('epc_address'),
+        'POSTCODE',
+        'CURRENT_ENERGY_RATING',
+        'POTENTIAL_ENERGY_RATING',
+        pl.col('PROPERTY_TYPE').alias('epc_property_type'),
+        'BUILT_FORM',
+        'INSPECTION_DATE',
+        'TOTAL_FLOOR_AREA',
+        'NUMBER_HABITABLE_ROOMS',
+        'FLOOR_HEIGHT',
+        'CONSTRUCTION_AGE_BAND'
+    ).filter(pl.col('epc_address').is_not_null()).sort('INSPECTION_DATE', descending=True).group_by('epc_address', 'POSTCODE').first()
+
+
+    print("EPC dataset")
+    print(epc.head().collect())
+
+    # https://www.gov.uk/guidance/about-the-price-paid-data
+    property_type_map = {"D": "Detached", "S": "Semi-Detached", "T": "Terraced", "F": "Flats/Maisonettes", "O": "Other"}
+    duration_map = {"F": "Freehold", "L": "Leasehold"}
+
+    price_paid = (pl.scan_parquet(args.price_paid).select(
+        "price",
+        "date_of_transfer",
+        pl.col('property_type').alias("pp_property_type").replace(property_type_map),
+        "postcode",
+        'paon',
+        'saon',
+        'street',
+        'locality',
+        'town_city',
+        pl.col('duration').replace(duration_map)
+    )
+    .filter(pl.col('pp_property_type') != 'Other').with_columns(
+            pl.concat_str(
+                [pl.col('saon'), pl.col('paon'), pl.col('street')],
+                separator=' ',
+                ignore_nulls=True,
+            ).alias('pp_address'),
+        )
+        .sort('date_of_transfer')
+        .group_by('pp_address', 'postcode', maintain_order=True)
+        .agg(
+            pl.struct(
+                pl.col('date_of_transfer').dt.year().alias('year'),
+                'price',
+            ).alias('historical_prices'),
+            pl.col('pp_property_type').last(),
+            pl.col('duration').last(),
+            pl.col('price').last().alias('latest_price'),
+            pl.col('date_of_transfer').last(),
+        )
+    ).filter(pl.col('pp_address').is_not_null())
+
+    print("Price paid dataset")
+    print(price_paid.head().collect())
+
+    joined = fuzzy_join_on_postcode(
+        left=price_paid,
+        right=epc,
+        left_address_col='pp_address',
+        right_address_col='epc_address',
+        left_postcode_col='postcode',
+        right_postcode_col='POSTCODE',
+    ).drop('POSTCODE').collect()
+
+    matched = joined.filter(pl.col('epc_address').is_not_null() & pl.col('pp_address').is_not_null())
+    total = joined.height
+    print(f"Unique properties: {total}")
+    print(f"Matched: {matched.height} ({100 * matched.height / total:.1f}%)")
+    print(f"Unmatched: {total - matched.height}")
+
+    matched = matched.rename({col: col.lower() for col in joined.columns})
+
+    print(matched.head())
+    matched.write_parquet(args.output)
+    print(f"Wrote {args.output}")
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -0,0 +1,127 @@
+import argparse
+import polars as pl
+from pathlib import Path
+
+
+def _build_wide(
+    epc_pp_path: Path,
+    arcgis_path: Path,
+    iod_path: Path | None,
+    poi_proximity_path: Path | None,
+    journey_times_path: Path | None,
+) -> pl.DataFrame:
+    """Build the wide dataframe by joining epc_pp with all auxiliary data."""
+    print("Loading epc_pp...")
+    wide = pl.read_parquet(epc_pp_path)
+    print(f"  {wide.shape[0]:,} rows, {wide.estimated_size('mb'):.1f} MB")
+
+    # GPS coordinates + LSOA from ArcGIS
+    print("Joining GPS coordinates...")
+    arcgis = pl.read_parquet(arcgis_path).select(
+        pl.col("pcds").alias("postcode"),
+        "lat",
+        pl.col("long").alias("lon"),
+        "lsoa21",
+    )
+    wide = wide.join(arcgis, on="postcode", how="inner")
+    print(f"  {wide.shape[0]:,} rows after GPS join, {wide.estimated_size('mb'):.1f} MB")
+
+    # Journey times (optional)
+    if journey_times_path and journey_times_path.exists():
+        print("Joining journey times...")
+        journey_times = pl.read_parquet(journey_times_path).select(
+            "postcode",
+            "public_transport_easy_minutes",
+            "public_transport_quick_minutes",
+            "cycling_minutes",
+        )
+        wide = wide.join(journey_times, on="postcode", how="left")
+        print(f"  {wide.estimated_size('mb'):.1f} MB after journey times")
+
+    # Index of Deprivation
+    if iod_path and iod_path.exists():
+        print("Joining IoD scores...")
+        iod = pl.read_parquet(iod_path)
+        wide = wide.join(
+            iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left"
+        )
+        print(f"  {wide.estimated_size('mb'):.1f} MB after IoD")
+
+    # POI proximity counts (pre-computed per postcode)
+    if poi_proximity_path and poi_proximity_path.exists():
+        print("Joining POI proximity counts...")
+        poi_counts = pl.read_parquet(poi_proximity_path)
+        wide = wide.join(poi_counts, on="postcode", how="left")
+        print(f"  {wide.estimated_size('mb'):.1f} MB after POI counts")
+
+    # Convert construction_age_band to numeric year
+    if "construction_age_band" in wide.columns:
+        wide = wide.with_columns(
+            pl.col("construction_age_band")
+            .str.replace("England and Wales: ", "")
+            .str.replace(" onwards", "")
+            .str.extract(r"(\d{4})", 1)
+            .cast(pl.UInt16, strict=False)
+            .alias("construction_age_band"),
+        )
+
+    # Derived columns
+    wide = wide.with_columns(
+        (pl.col("latest_price") / pl.col("total_floor_area")).alias("Price per sqm"),
+    ).drop(
+        'date_of_transfer',
+        'inspection_date',
+        'floor_height',
+        'lsoa21',
+        'LSOA code (2021)',
+        'Local Authority District code (2024)',
+        'Local Authority District name (2024)',
+        'imd_score',
+        'housing_barriers_score',
+        'idaci_score',
+        'idaopi_score',
+        'children_young_people_score',
+        'adult_skills_score',
+        'geographical_barriers_score',
+        'wider_barriers_score',
+    ).rename({
+        'construction_age_band': "Approximate construction age",
+        "income_score": "Income Score (rate)",
+        "employment_score": "Employment Score (rate)",
+        "education_score": "Education, Skills and Training Score",
+        "health_score": "Health Deprivation and Disability Score",
+        "crime_score": "Crime Score",
+    })
+
+    return wide
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Build wide property dataframe with all joins")
+    parser.add_argument("--epc-pp", type=Path, required=True, help="EPC-Price Paid joined parquet file")
+    parser.add_argument("--arcgis", type=Path, required=True, help="ArcGIS postcode data parquet file")
+    parser.add_argument("--iod", type=Path, help="Index of Deprivation parquet file (optional)")
+    parser.add_argument("--poi-proximity", type=Path, help="POI proximity counts parquet file (optional)")
+    parser.add_argument("--journey-times", type=Path, help="Journey times parquet file (optional)")
+    parser.add_argument("--output", type=Path, required=True, help="Output parquet file path")
+    args = parser.parse_args()
+
+    wide = _build_wide(
+        epc_pp_path=args.epc_pp,
+        arcgis_path=args.arcgis,
+        iod_path=args.iod,
+        poi_proximity_path=args.poi_proximity,
+        journey_times_path=args.journey_times,
+    )
+
+    print(f"Columns: {wide.columns}")
+    print(f"Rows: {wide.height}")
+
+    wide.write_parquet(args.output)
+    size_mb = args.output.stat().st_size / (1024 * 1024)
+    
+    print(f"Wrote {args.output} ({size_mb:.1f} MB)")
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/transform/poi_proximity.py
+++ b/pipeline/transform/poi_proximity.py
@ -0,0 +1,42 @@
+"""Compute POI proximity counts per postcode from ArcGIS + filtered POIs."""
+
+import argparse
+from pathlib import Path
+
+import polars as pl
+
+from pipeline.utils.poi_counts import _count_pois_per_postcode
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Count POIs within radius per postcode"
+    )
+    parser.add_argument(
+        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
+    )
+    parser.add_argument(
+        "--pois", type=Path, required=True, help="Filtered POIs parquet"
+    )
+    parser.add_argument(
+        "--output", type=Path, required=True, help="Output parquet path"
+    )
+    args = parser.parse_args()
+
+    postcodes = pl.read_parquet(args.arcgis).select(
+        pl.col("pcds").alias("postcode"),
+        "lat",
+        pl.col("long").alias("lon"),
+    )
+
+    pois = pl.read_parquet(args.pois)
+
+    result = _count_pois_per_postcode(postcodes, pois, radius_km=2)
+
+    result.write_parquet(args.output)
+    size_mb = args.output.stat().st_size / (1024 * 1024)
+    print(f"Wrote {args.output} ({size_mb:.1f} MB)")
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -0,0 +1,644 @@
+import argparse
+from pathlib import Path
+
+import polars as pl
+
+
+# Set of POI categories written as "<tag key>/<tag value>" strings.
+# NOTE(review): judging by the name, these are presumably the categories
+# removed from the filtered POI output; the code that consumes this set is
+# not visible in this part of the file, so confirm against its usage.
+DROP_CATEGORIES = {
+    "amenity/advice",
+    "amenity/atm",
+    "amenity/bbq",
+    "amenity/bench",
+    "amenity/bicycle_parking",
+    "amenity/clock",
+    "amenity/fixme",
+    "amenity/grit_bin",
+    "amenity/hunting_stand",
+    "amenity/motorcycle_parking",
+    "amenity/notice_board",
+    "amenity/parking",
+    "amenity/parking_entrance",
+    "amenity/parking_space",
+    "amenity/post_box",
+    "amenity/telephone",
+    "amenity/toilets",
+    "amenity/vacuum_cleaner",
+    "amenity/waste_basket",
+    "building/air_shaft",
+    "building/apartments",
+    "building/detached",
+    "building/entrance",
+    "building/entry",
+    "building/garage",
+    "building/garages",
+    "building/house",
+    "building/hut",
+    "building/no",
+    "building/office",
+    "building/public",
+    "building/residential",
+    "building/roof",
+    "building/shed",
+    "building/terrace",
+    "building/yes",
+    "emergency/access_point",
+    "emergency/ambulance_station",
+    "emergency/assembly_point",
+    "emergency/bleed_control_kit",
+    "emergency/defibrillator",
+    "emergency/designated",
+    "emergency/dry_riser_inlet",
+    "emergency/emergency_ward_entrance",
+    "emergency/fire_alarm_box",
+    "emergency/fire_extinguisher",
+    "emergency/fire_hydrant",
+    "emergency/fire_service_inlet",
+    "emergency/first_aid_kit",
+    "emergency/life_ring",
+    "emergency/lifeguard",
+    "emergency/no",
+    "emergency/phone",
+    "emergency/rescue_equipment",
+    "emergency/siren",
+    "emergency/throw_bag",
+    "emergency/water_rescue",
+    "emergency/yes",
+    "leisure/firepit",
+    "leisure/fishing",
+    "leisure/picnic_table",
+    "office/company",
+    "office/yes",
+    "tourism/apartment",
+    "tourism/apartments",
+    "tourism/camp_pitch",
+    "tourism/information",
+    "tourism/village_sign",
+    "tourism/yes",
+}
+
+# (friendly_name, emoji) for every category we keep
+CATEGORY_MAP: dict[str, tuple[str, str]] = {
+    # amenity
+    "amenity/animal_boarding": ("Animal Boarding", "🐾"),
+    "amenity/animal_breeding": ("Animal Breeding", "🐣"),
+    "amenity/animal_shelter": ("Animal Shelter", "🏠"),
+    "amenity/arts_centre": ("Arts Centre", "🎨"),
+    "amenity/bank": ("Bank", "🏦"),
+    "amenity/bar": ("Bar", "🍸"),
+    "amenity/bicycle_rental": ("Bike Rental", "🚲"),
+    "amenity/bicycle_repair_station": ("Bike Repair", "🔧"),
+    "amenity/binoculars": ("Public Binoculars", "🔭"),
+    "amenity/boat_rental": ("Boat Rental", "⛵"),
+    "amenity/boat_storage": ("Boat Storage", "🚢"),
+    "amenity/boot_scraper": ("Boot Scraper", "🥾"),
+    "amenity/bureau_de_change": ("Currency Exchange", "💱"),
+    "amenity/bus_station": ("Bus Station", "🚌"),
+    "amenity/cafe": ("Café", "☕"),
+    "amenity/car_rental": ("Car Rental", "🚗"),
+    "amenity/car_sharing": ("Car Sharing", "🚙"),
+    "amenity/car_wash": ("Car Wash", "🧽"),
+    "amenity/care_home": ("Care Home", "🏥"),
+    "amenity/casino": ("Casino", "🎰"),
+    "amenity/charging_station": ("EV Charging", "🔌"),
+    "amenity/check_in": ("Check-In Point", "✅"),
+    "amenity/childcare": ("Childcare", "👶"),
+    "amenity/cinema": ("Cinema", "🎬"),
+    "amenity/clinic": ("Clinic", "🩺"),
+    "amenity/club": ("Club", "🏛️"),
+    "amenity/college": ("College", "🎓"),
+    "amenity/community_centre": ("Community Centre", "🤝"),
+    "amenity/compressed_air": ("Compressed Air", "💨"),
+    "amenity/conference_centre": ("Conference Centre", "📋"),
+    "amenity/courthouse": ("Courthouse", "⚖️"),
+    "amenity/coworking_space": ("Co-working Space", "💻"),
+    "amenity/crematorium": ("Crematorium", "🕯️"),
+    "amenity/dancing_school": ("Dance School", "💃"),
+    "amenity/dentist": ("Dentist", "🦷"),
+    "amenity/doctors": ("Doctor", "👨‍⚕️"),
+    "amenity/dojo": ("Dojo", "🥋"),
+    "amenity/donation_box": ("Donation Box", "📦"),
+    "amenity/dressing_room": ("Dressing Room", "👗"),
+    "amenity/drinking_water": ("Drinking Water", "🚰"),
+    "amenity/driving_school": ("Driving School", "🚦"),
+    "amenity/escooter_rental": ("E-Scooter Rental", "🛴"),
+    "amenity/events_venue": ("Events Venue", "🎪"),
+    "amenity/fast_food": ("Fast Food", "🍔"),
+    "amenity/feeding_place": ("Feeding Place", "🍽️"),
+    "amenity/ferry_terminal": ("Ferry Terminal", "⛴️"),
+    "amenity/fire_station": ("Fire Station", "🚒"),
+    "amenity/food_court": ("Food Court", "🍴"),
+    "amenity/fountain": ("Fountain", "⛲"),
+    "amenity/fuel": ("Fuel Station", "⛽"),
+    "amenity/gambling": ("Gambling", "🎲"),
+    "amenity/grave_yard": ("Graveyard", "🪦"),
+    "amenity/hall": ("Hall", "🏛️"),
+    "amenity/hookah_lounge": ("Hookah Lounge", "💨"),
+    "amenity/hospital": ("Hospital", "🏥"),
+    "amenity/ice_cream": ("Ice Cream", "🍦"),
+    "amenity/internet_cafe": ("Internet Café", "🌐"),
+    "amenity/kick-scooter_rental": ("Kick Scooter Rental", "🛴"),
+    "amenity/kindergarten": ("Kindergarten", "💒"),
+    "amenity/language_school": ("Language School", "🗣️"),
+    "amenity/letter_box": ("Letter Box", "📮"),
+    "amenity/library": ("Library", "📚"),
+    "amenity/loading_dock": ("Loading Dock", "📥"),
+    "amenity/lounge": ("Lounge", "🛋️"),
+    "amenity/lounger": ("Public Lounger", "🪑"),
+    "amenity/marketplace": ("Market", "🛒"),
+    "amenity/money_transfer": ("Money Transfer", "💸"),
+    "amenity/mounting_block": ("Mounting Block", "🐴"),
+    "amenity/music_school": ("Music School", "🎵"),
+    "amenity/music_venue": ("Music Venue", "🎶"),
+    "amenity/nightclub": ("Nightclub", "🪩"),
+    "amenity/nursing_home": ("Nursing Home", "🏠"),
+    "amenity/parcel_locker": ("Parcel Locker", "📦"),
+    "amenity/payment_terminal": ("Payment Terminal", "💳"),
+    "amenity/pharmacy": ("Pharmacy", "💊"),
+    "amenity/photo_booth": ("Photo Booth", "📸"),
+    "amenity/piano": ("Public Piano", "🎹"),
+    "amenity/place_of_worship": ("Place of Worship", "⛪"),
+    "amenity/police": ("Police Station", "🚔"),
+    "amenity/post_depot": ("Post Depot", "📬"),
+    "amenity/post_office": ("Post Office", "🏤"),
+    "amenity/prep_school": ("Prep School", "📖"),
+    "amenity/pub": ("Pub", "🍺"),
+    "amenity/public_bookcase": ("Public Bookcase", "📕"),
+    "amenity/public_building": ("Public Building", "🏢"),
+    "amenity/reception_desk": ("Reception Desk", "🛎️"),
+    "amenity/recycling": ("Recycling", "♻️"),
+    "amenity/restaurant": ("Restaurant", "🍽️"),
+    "amenity/sanitary_dump_station": ("Sanitary Dump Station", "🚿"),
+    "amenity/school": ("School", "🏫"),
+    "amenity/scout_hut": ("Scout Hut", "⚜️"),
+    "amenity/shelter": ("Shelter", "🛖"),
+    "amenity/shower": ("Public Shower", "🚿"),
+    "amenity/smoking_area": ("Smoking Area", "🚬"),
+    "amenity/social_centre": ("Social Centre", "🏘️"),
+    "amenity/social_club": ("Social Club", "🤝"),
+    "amenity/social_facility": ("Social Facility", "🫂"),
+    "amenity/stripclub": ("Strip Club", "🔞"),
+    "amenity/studio": ("Studio", "🎙️"),
+    "amenity/table": ("Public Table", "🪑"),
+    "amenity/taxi": ("Taxi Stand", "🚕"),
+    "amenity/telescope": ("Public Telescope", "🔭"),
+    "amenity/theatre": ("Theatre", "🎭"),
+    "amenity/ticket_validator": ("Ticket Validator", "🎫"),
+    "amenity/townhall": ("Town Hall", "🏛️"),
+    "amenity/training": ("Training Centre", "📝"),
+    "amenity/trolley_bay": ("Trolley Bay", "🛒"),
+    "amenity/university": ("University", "🏫"),
+    "amenity/vehicle_inspection": ("Vehicle Inspection", "🔍"),
+    "amenity/vending_machine": ("Vending Machine", "🏧"),
+    "amenity/veterinary": ("Vet", "🐕"),
+    "amenity/washing_machine": ("Washing Machine", "🧺"),
+    "amenity/washingline": ("Washing Line", "👕"),
+    "amenity/waste_disposal": ("Waste Disposal", "🗑️"),
+    "amenity/waste_transfer_station": ("Waste Transfer Station", "🚛"),
+    "amenity/water_point": ("Water Point", "💧"),
+    "amenity/watering_place": ("Watering Place", "🚰"),
+    "amenity/weighbridge": ("Weighbridge", "⚖️"),
+    # building
+    "building/barn": ("Barn", "🏚️"),
+    "building/bunker": ("Bunker", "🏗️"),
+    "building/chapel": ("Chapel", "⛪"),
+    "building/church": ("Church", "⛪"),
+    "building/commercial": ("Commercial Building", "🏬"),
+    "building/construction": ("Construction Site", "🚧"),
+    "building/farm": ("Farmhouse", "🌾"),
+    "building/greenhouse": ("Greenhouse", "🌿"),
+    "building/industrial": ("Industrial Building", "🏭"),
+    "building/kiosk": ("Kiosk", "🏪"),
+    "building/retail": ("Retail Building", "🏬"),
+    "building/ruins": ("Ruins", "🏚️"),
+    "building/school": ("School Building", "🏫"),
+    "building/semidetached_house": ("Semi-Detached House", "🏠"),
+    "building/service": ("Service Building", "🔧"),
+    "building/university": ("University Building", "🎓"),
+    "building/warehouse": ("Warehouse", "🏭"),
+    # craft
+    "craft/agricultural_engines": ("Agricultural Engines", "🚜"),
+    "craft/atelier": ("Atelier", "🎨"),
+    "craft/blacksmith": ("Blacksmith", "🔨"),
+    "craft/bookbinder": ("Bookbinder", "📖"),
+    "craft/brewery": ("Brewery", "🍺"),
+    "craft/builder": ("Builder", "🧱"),
+    "craft/carpenter": ("Carpenter", "🪚"),
+    "craft/caterer": ("Caterer", "🍱"),
+    "craft/cleaning": ("Cleaning Service", "🧹"),
+    "craft/confectionery": ("Confectioner", "🍬"),
+    "craft/distillery": ("Distillery", "🥃"),
+    "craft/dressmaker": ("Dressmaker", "👗"),
+    "craft/electrician": ("Electrician", "⚡"),
+    "craft/electronics_repair": ("Electronics Repair", "🔌"),
+    "craft/floorer": ("Flooring Specialist", "🪵"),
+    "craft/gardener": ("Gardener", "🌱"),
+    "craft/glaziery": ("Glazier", "🪟"),
+    "craft/handicraft": ("Handicraft", "✂️"),
+    "craft/hvac": ("HVAC", "❄️"),
+    "craft/jeweller": ("Jeweller", "💎"),
+    "craft/joiner": ("Joiner", "🪚"),
+    "craft/key_cutter": ("Key Cutter", "🔑"),
+    "craft/locksmith": ("Locksmith", "🔐"),
+    "craft/metal_construction": ("Metal Fabrication", "🔩"),
+    "craft/painter": ("Painter & Decorator", "🖌️"),
+    "craft/photographer": ("Photographer", "📷"),
+    "craft/photographic_laboratory": ("Photo Lab", "🖼️"),
+    "craft/plumber": ("Plumber", "🔧"),
+    "craft/pottery": ("Pottery", "🏺"),
+    "craft/printer": ("Printer", "🖨️"),
+    "craft/roofer": ("Roofer", "🏠"),
+    "craft/sawmill": ("Sawmill", "🪵"),
+    "craft/scaffolder": ("Scaffolder", "🏗️"),
+    "craft/sculptor": ("Sculptor", "🗿"),
+    "craft/shoemaker": ("Shoemaker", "👞"),
+    "craft/signmaker": ("Sign Maker", "🪧"),
+    "craft/stonemason": ("Stonemason", "🪨"),
+    "craft/tailor": ("Tailor", "🧵"),
+    "craft/upholsterer": ("Upholsterer", "🛋️"),
+    "craft/watchmaker": ("Watchmaker", "⌚"),
+    "craft/window_construction": ("Window Fitter", "🪟"),
+    "craft/winery": ("Winery", "🍷"),
+    "craft/yes": ("Craft Workshop", "🛠️"),
+    # healthcare
+    "healthcare/alternative": ("Alternative Medicine", "🌿"),
+    "healthcare/audiologist": ("Audiologist", "👂"),
+    "healthcare/centre": ("Health Centre", "🏥"),
+    "healthcare/clinic": ("Health Clinic", "🩺"),
+    "healthcare/counselling": ("Counselling", "🧠"),
+    "healthcare/dentist": ("Dental Practice", "🦷"),
+    "healthcare/doctor": ("GP Surgery", "👨‍⚕️"),
+    "healthcare/hospital": ("Hospital", "🏥"),
+    "healthcare/laboratory": ("Medical Lab", "🔬"),
+    "healthcare/optometrist": ("Optometrist", "👁️"),
+    "healthcare/pharmacy": ("Pharmacy", "💊"),
+    "healthcare/physiotherapist": ("Physiotherapist", "🏃"),
+    "healthcare/podiatrist": ("Podiatrist", "🦶"),
+    "healthcare/psychotherapist": ("Psychotherapist", "🧠"),
+    "healthcare/rehabilitation": ("Rehabilitation Centre", "♿"),
+    "healthcare/vaccination_centre": ("Vaccination Centre", "💉"),
+    "healthcare/yes": ("Healthcare Facility", "🏥"),
+    # leisure
+    "leisure/adult_gaming_centre": ("Adult Gaming Centre", "🎮"),
+    "leisure/amusement_arcade": ("Amusement Arcade", "🕹️"),
+    "leisure/bandstand": ("Bandstand", "🎺"),
+    "leisure/bathing_place": ("Bathing Spot", "🏖️"),
+    "leisure/bird_hide": ("Bird Hide", "🐦"),
+    "leisure/bowling_alley": ("Bowling Alley", "🎳"),
+    "leisure/common": ("Common Land", "🌳"),
+    "leisure/dance": ("Dance Venue", "💃"),
+    "leisure/dog_park": ("Dog Park", "🐕"),
+    "leisure/escape_game": ("Escape Room", "🔓"),
+    "leisure/fitness_centre": ("Gym", "🏋️"),
+    "leisure/fitness_station": ("Outdoor Gym", "💪"),
+    "leisure/garden": ("Garden", "🌷"),
+    "leisure/golf_course": ("Golf Course", "⛳"),
+    "leisure/hackerspace": ("Hackerspace", "💻"),
+    "leisure/horse_riding": ("Horse Riding", "🐎"),
+    "leisure/indoor_play": ("Indoor Play Area", "🧒"),
+    "leisure/marina": ("Marina", "⚓"),
+    "leisure/miniature_golf": ("Mini Golf", "⛳"),
+    "leisure/nature_reserve": ("Nature Reserve", "🦔"),
+    "leisure/outdoor_seating": ("Outdoor Seating", "🪑"),
+    "leisure/park": ("Park", "🌳"),
+    "leisure/pitch": ("Sports Pitch", "⚽"),
+    "leisure/playground": ("Playground", "🛝"),
+    "leisure/sauna": ("Sauna", "🧖"),
+    "leisure/slipway": ("Slipway", "🚤"),
+    "leisure/social_club": ("Social Club", "🍻"),
+    "leisure/sports_centre": ("Sports Centre", "🏟️"),
+    "leisure/sports_hall": ("Sports Hall", "🏀"),
+    "leisure/swimming_pool": ("Swimming Pool", "🏊"),
+    "leisure/tanning_salon": ("Tanning Salon", "☀️"),
+    "leisure/track": ("Running Track", "🏃"),
+    "leisure/trampoline_park": ("Trampoline Park", "🤸"),
+    "leisure/water_park": ("Water Park", "🌊"),
+    "leisure/wildlife_hide": ("Wildlife Hide", "🦌"),
+    "leisure/yes": ("Leisure Facility", "🎉"),
+    # office
+    "office/accountant": ("Accountant", "🧮"),
+    "office/advertising_agency": ("Advertising Agency", "📢"),
+    "office/architect": ("Architect", "📐"),
+    "office/association": ("Association", "🏛️"),
+    "office/charity": ("Charity", "❤️"),
+    "office/construction_company": ("Construction Company", "🏗️"),
+    "office/consulting": ("Consulting Firm", "📊"),
+    "office/courier": ("Courier Service", "📦"),
+    "office/coworking": ("Co-working Space", "💻"),
+    "office/design": ("Design Studio", "🎨"),
+    "office/diplomatic": ("Diplomatic Office", "🏛️"),
+    "office/educational_institution": ("Education Office", "🎓"),
+    "office/employment_agency": ("Employment Agency", "💼"),
+    "office/energy_supplier": ("Energy Supplier", "⚡"),
+    "office/engineer": ("Engineering Firm", "⚙️"),
+    "office/estate_agent": ("Estate Agent", "🏠"),
+    "office/financial": ("Financial Services", "💰"),
+    "office/financial_advisor": ("Financial Advisor", "📈"),
+    "office/foundation": ("Foundation", "🏛️"),
+    "office/government": ("Government Office", "🏛️"),
+    "office/graphic_design": ("Graphic Design", "🖌️"),
+    "office/healthcare": ("Healthcare Office", "🏥"),
+    "office/home_care": ("Home Care Service", "🏠"),
+    "office/insurance": ("Insurance", "🛡️"),
+    "office/interior_design": ("Interior Design", "🛋️"),
+    "office/it": ("IT Company", "💻"),
+    "office/lawyer": ("Lawyer", "⚖️"),
+    "office/logistics": ("Logistics", "🚚"),
+    "office/marketing": ("Marketing Agency", "📣"),
+    "office/mortgage": ("Mortgage Broker", "🏦"),
+    "office/moving_company": ("Moving Company", "📦"),
+    "office/newspaper": ("Newspaper Office", "📰"),
+    "office/ngo": ("NGO", "🌍"),
+    "office/notary": ("Notary", "📜"),
+    "office/political_party": ("Political Party", "🗳️"),
+    "office/politician": ("Politician Office", "🏛️"),
+    "office/property_management": ("Property Management", "🏘️"),
+    "office/recruitment": ("Recruitment Agency", "👥"),
+    "office/religion": ("Religious Office", "✝️"),
+    "office/research": ("Research Office", "🔬"),
+    "office/security": ("Security Company", "🔒"),
+    "office/solicitor": ("Solicitor", "⚖️"),
+    "office/surveyor": ("Surveyor", "📏"),
+    "office/tax_advisor": ("Tax Advisor", "🧾"),
+    "office/taxi": ("Taxi Office", "🚕"),
+    "office/telecommunication": ("Telecoms Office", "📡"),
+    "office/therapist": ("Therapist", "🧠"),
+    "office/travel_agent": ("Travel Agent", "✈️"),
+    "office/union": ("Trade Union", "✊"),
+    "office/university": ("University Office", "🎓"),
+    "office/vacant": ("Vacant Office", "🏚️"),
+    "office/web_design": ("Web Design", "🌐"),
+    # public_transport
+    "public_transport/entrance": ("Transport Entrance", "🚪"),
+    "public_transport/platform": ("Platform", "🚉"),
+    "public_transport/station": ("Station", "🚉"),
+    "public_transport/stop_position": ("Stop", "🚏"),
+    # shop
+    "shop/accessories": ("Accessories Shop", "👜"),
+    "shop/agrarian": ("Farm Supply Shop", "🌾"),
+    "shop/alcohol": ("Off-Licence", "🍷"),
+    "shop/antiques": ("Antiques Shop", "🏺"),
+    "shop/appliance": ("Appliance Shop", "🔌"),
+    "shop/art": ("Art Shop", "🎨"),
+    "shop/baby_goods": ("Baby Shop", "🍼"),
+    "shop/bag": ("Bag Shop", "👜"),
+    "shop/bakery": ("Bakery", "🥐"),
+    "shop/bathroom": ("Bathroom Shop", "🛁"),
+    "shop/bathroom_furnishing": ("Bathroom Furnishings", "🚿"),
+    "shop/beauty": ("Beauty Shop", "💄"),
+    "shop/bed": ("Bed Shop", "🛏️"),
+    "shop/beverages": ("Drinks Shop", "🥤"),
+    "shop/bicycle": ("Bike Shop", "🚲"),
+    "shop/boat": ("Boat Shop", "⛵"),
+    "shop/bookmaker": ("Bookmaker", "🏇"),
+    "shop/books": ("Bookshop", "📚"),
+    "shop/boutique": ("Boutique", "👗"),
+    "shop/building_materials": ("Building Materials", "🧱"),
+    "shop/butcher": ("Butcher", "🥩"),
+    "shop/camera": ("Camera Shop", "📷"),
+    "shop/candles": ("Candle Shop", "🕯️"),
+    "shop/car": ("Car Dealership", "🚗"),
+    "shop/car;car_repair": ("Car Sales & Repair", "🚗"),
+    "shop/car_parts": ("Car Parts", "🔩"),
+    "shop/car_repair": ("Car Repair", "🔧"),
+    "shop/caravan": ("Caravan Dealer", "🚐"),
+    "shop/carpet": ("Carpet Shop", "🧶"),
+    "shop/catalogue": ("Catalogue Shop", "📋"),
+    "shop/charity": ("Charity Shop", "❤️"),
+    "shop/cheese": ("Cheese Shop", "🧀"),
+    "shop/chemist": ("Chemist", "🧪"),
+    "shop/chocolate": ("Chocolate Shop", "🍫"),
+    "shop/clothes": ("Clothes Shop", "👕"),
+    "shop/coffee": ("Coffee Shop", "☕"),
+    "shop/collector": ("Collector Shop", "🏆"),
+    "shop/computer": ("Computer Shop", "🖥️"),
+    "shop/confectionery": ("Sweet Shop", "🍬"),
+    "shop/convenience": ("Convenience Store", "🏪"),
+    "shop/copyshop": ("Copy Shop", "🖨️"),
+    "shop/cosmetics": ("Cosmetics Shop", "💅"),
+    "shop/country_store": ("Country Store", "🏡"),
+    "shop/craft": ("Craft Shop", "✂️"),
+    "shop/curtain": ("Curtain Shop", "🪟"),
+    "shop/dairy": ("Dairy Shop", "🥛"),
+    "shop/deli": ("Delicatessen", "🧆"),
+    "shop/department_store": ("Department Store", "🏬"),
+    "shop/discount": ("Discount Store", "💲"),
+    "shop/doityourself": ("DIY Store", "🔨"),
+    "shop/doors": ("Door Shop", "🚪"),
+    "shop/dry_cleaning": ("Dry Cleaner", "👔"),
+    "shop/e-cigarette": ("Vape Shop", "💨"),
+    "shop/electrical": ("Electrical Shop", "⚡"),
+    "shop/electronics": ("Electronics Shop", "📱"),
+    "shop/erotic": ("Adult Shop", "🔞"),
+    "shop/esoteric": ("Esoteric Shop", "🔮"),
+    "shop/estate_agent": ("Estate Agent", "🏠"),
+    "shop/fabric": ("Fabric Shop", "🧵"),
+    "shop/fan": ("Fan Shop", "🏅"),
+    "shop/farm": ("Farm Shop", "🥕"),
+    "shop/fashion_accessories": ("Fashion Accessories", "👒"),
+    "shop/fireplace": ("Fireplace Shop", "🔥"),
+    "shop/fishing": ("Fishing Shop", "🎣"),
+    "shop/flooring": ("Flooring Shop", "🪵"),
+    "shop/florist": ("Florist", "💐"),
+    "shop/food": ("Food Shop", "🍞"),
+    "shop/frame": ("Framing Shop", "🖼️"),
+    "shop/frozen_food": ("Frozen Food Shop", "🧊"),
+    "shop/fuel": ("Fuel Shop", "⛽"),
+    "shop/funeral_directors": ("Funeral Director", "⚰️"),
+    "shop/furniture": ("Furniture Shop", "🪑"),
+    "shop/games": ("Games Shop", "🎮"),
+    "shop/garden_centre": ("Garden Centre", "🌻"),
+    "shop/gas": ("Gas Shop", "🔥"),
+    "shop/general": ("General Store", "🏪"),
+    "shop/gift": ("Gift Shop", "🎁"),
+    "shop/glaziery": ("Glazier", "🪟"),
+    "shop/greengrocer": ("Greengrocer", "🥬"),
+    "shop/grocery": ("Grocery Shop", "🛒"),
+    "shop/haberdashery": ("Haberdashery", "🧵"),
+    "shop/hairdresser": ("Hairdresser", "💇"),
+    "shop/hairdresser_supply": ("Hairdresser Supply", "💇"),
+    "shop/hardware": ("Hardware Shop", "🔩"),
+    "shop/health": ("Health Shop", "🌿"),
+    "shop/health_food": ("Health Food Shop", "🥗"),
+    "shop/hearing_aids": ("Hearing Aid Shop", "👂"),
+    "shop/herbalist": ("Herbalist", "🌿"),
+    "shop/hifi": ("Hi-Fi Shop", "🔊"),
+    "shop/household": ("Household Shop", "🏠"),
+    "shop/household_linen": ("Linen Shop", "🛏️"),
+    "shop/houseware": ("Houseware Shop", "🍳"),
+    "shop/ice_cream": ("Ice Cream Shop", "🍦"),
+    "shop/interior_decoration": ("Interior Decoration", "🖼️"),
+    "shop/jewelry": ("Jewellery Shop", "💍"),
+    "shop/kiosk": ("Kiosk", "🏪"),
+    "shop/kitchen": ("Kitchen Shop", "🍳"),
+    "shop/laundry": ("Laundry", "🧺"),
+    "shop/leather": ("Leather Shop", "🧳"),
+    "shop/lighting": ("Lighting Shop", "💡"),
+    "shop/locksmith": ("Locksmith", "🔐"),
+    "shop/mall": ("Shopping Centre", "🏬"),
+    "shop/massage": ("Massage Parlour", "💆"),
+    "shop/medical_supply": ("Medical Supply", "🩺"),
+    "shop/military_surplus": ("Military Surplus", "🎖️"),
+    "shop/mobile_phone": ("Mobile Phone Shop", "📱"),
+    "shop/mobile_phone_accessories": ("Phone Accessories", "📱"),
+    "shop/mobility": ("Mobility Shop", "♿"),
+    "shop/mobility_scooter": ("Mobility Scooter Shop", "🦽"),
+    "shop/model": ("Model Shop", "✈️"),
+    "shop/money_lender": ("Money Lender", "💰"),
+    "shop/motorcycle": ("Motorcycle Shop", "🏍️"),
+    "shop/motorcycle_repair": ("Motorcycle Repair", "🔧"),
+    "shop/music": ("Music Shop", "🎵"),
+    "shop/musical_instrument": ("Musical Instrument Shop", "🎸"),
+    "shop/newsagent": ("Newsagent", "📰"),
+    "shop/nutrition_supplements": ("Nutrition Shop", "💪"),
+    "shop/optician": ("Optician", "👓"),
+    "shop/outdoor": ("Outdoor Shop", "🏕️"),
+    "shop/outpost": ("Outpost", "📦"),
+    "shop/paint": ("Paint Shop", "🎨"),
+    "shop/party": ("Party Shop", "🎈"),
+    "shop/pastry": ("Pastry Shop", "🥐"),
+    "shop/pawnbroker": ("Pawnbroker", "💰"),
+    "shop/perfumery": ("Perfumery", "🌸"),
+    "shop/pet": ("Pet Shop", "🐾"),
+    "shop/pet_grooming": ("Pet Grooming", "🐩"),
+    "shop/photo": ("Photo Shop", "📸"),
+    "shop/piercing": ("Piercing Studio", "💎"),
+    "shop/plant_hire": ("Plant Hire", "🚜"),
+    "shop/pottery": ("Pottery Shop", "🏺"),
+    "shop/printer_ink": ("Ink & Toner Shop", "🖨️"),
+    "shop/printing": ("Print Shop", "🖨️"),
+    "shop/psychic": ("Psychic", "🔮"),
+    "shop/pyrotechnics": ("Fireworks Shop", "🎆"),
+    "shop/religion": ("Religious Shop", "✝️"),
+    "shop/rental": ("Rental Shop", "🔑"),
+    "shop/repair": ("Repair Shop", "🔧"),
+    "shop/scuba_diving": ("Scuba Diving Shop", "🤿"),
+    "shop/seafood": ("Fishmonger", "🐟"),
+    "shop/second_hand": ("Second-Hand Shop", "♻️"),
+    "shop/security": ("Security Shop", "🔒"),
+    "shop/sewing": ("Sewing Shop", "🪡"),
+    "shop/shoe_repair": ("Shoe Repair", "👞"),
+    "shop/shoes": ("Shoe Shop", "👟"),
+    "shop/sports": ("Sports Shop", "⚽"),
+    "shop/stationery": ("Stationery Shop", "✏️"),
+    "shop/storage_rental": ("Self Storage", "📦"),
+    "shop/supermarket": ("Supermarket", "🛒"),
+    "shop/swimming_pool": ("Pool Supplies", "🏊"),
+    "shop/tailor": ("Tailor", "🧵"),
+    "shop/tattoo": ("Tattoo Studio", "🖋️"),
+    "shop/taxi": ("Taxi Booking", "🚕"),
+    "shop/tea": ("Tea Shop", "🫖"),
+    "shop/telecommunication": ("Telecoms Shop", "📡"),
+    "shop/ticket": ("Ticket Office", "🎫"),
+    "shop/tiles": ("Tile Shop", "🔲"),
+    "shop/tobacco": ("Tobacconist", "🚬"),
+    "shop/tool_hire": ("Tool Hire", "🧰"),
+    "shop/toys": ("Toy Shop", "🧸"),
+    "shop/trade": ("Trade Supplier", "🏭"),
+    "shop/travel_agency": ("Travel Agency", "✈️"),
+    "shop/trophy": ("Trophy Shop", "🏆"),
+    "shop/tyres": ("Tyre Shop", "🛞"),
+    "shop/vacant": ("Vacant Shop", "🏚️"),
+    "shop/variety_store": ("Variety Store", "🏪"),
+    "shop/video": ("Video Shop", "📀"),
+    "shop/video_games": ("Video Game Shop", "🎮"),
+    "shop/watches": ("Watch Shop", "⌚"),
+    "shop/water_sports": ("Water Sports Shop", "🏄"),
+    "shop/weapons": ("Weapons Shop", "🗡️"),
+    "shop/wedding": ("Wedding Shop", "💒"),
+    "shop/wholesale": ("Wholesaler", "📦"),
+    "shop/wigs": ("Wig Shop", "💇"),
+    "shop/window_blind": ("Blinds Shop", "🪟"),
+    "shop/windows": ("Window Shop", "🪟"),
+    "shop/wine": ("Wine Shop", "🍷"),
+    "shop/wool": ("Wool Shop", "🧶"),
+    "shop/yes": ("Shop", "🛍️"),
+    # tourism
+    "tourism/artwork": ("Public Artwork", "🎨"),
+    "tourism/attraction": ("Tourist Attraction", "📸"),
+    "tourism/camp_site": ("Campsite", "⛺"),
+    "tourism/caravan_site": ("Caravan Site", "🚐"),
+    "tourism/chalet": ("Chalet", "🏔️"),
+    "tourism/gallery": ("Gallery", "🖼️"),
+    "tourism/guest_house": ("Guest House", "🏡"),
+    "tourism/hostel": ("Hostel", "🛏️"),
+    "tourism/hotel": ("Hotel", "🏨"),
+    "tourism/motel": ("Motel", "🏨"),
+    "tourism/museum": ("Museum", "🏛️"),
+    "tourism/picnic_site": ("Picnic Site", "🧺"),
+    "tourism/preserved_railway": ("Heritage Railway", "🚂"),
+    "tourism/theme_park": ("Theme Park", "🎢"),
+    "tourism/viewpoint": ("Viewpoint", "🔭"),
+    "tourism/zoo": ("Zoo", "🦁"),
+}
+
+
+def transform(input_path: Path) -> pl.LazyFrame:
+    """Filter raw POIs and swap raw category keys for friendly names and emojis.
+
+    Args:
+        input_path: Parquet file of raw POIs; it is read through its
+            ``category`` column.
+
+    Returns:
+        A lazy frame without the rows whose category is listed in
+        ``DROP_CATEGORIES``, where ``category`` holds the friendly name from
+        ``CATEGORY_MAP`` and a new ``emoji`` column holds the matching emoji.
+
+    Raises:
+        ValueError: If the data contains a category that is neither dropped
+            nor mapped, if ``CATEGORY_MAP`` has a key that never occurs in the
+            data, or if a mapped friendly name or emoji is empty.
+    """
+    raw = pl.scan_parquet(input_path)
+
+    # Only the distinct category values are materialised here; the frame
+    # itself stays lazy.
+    present = raw.select("category").unique().collect().to_series().to_list()
+
+    # Every category found in the data must be either dropped or mapped.
+    unmapped = [
+        cat
+        for cat in present
+        if not (cat in DROP_CATEGORIES or cat in CATEGORY_MAP)
+    ]
+    if unmapped:
+        raise ValueError(
+            f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}"
+        )
+
+    # Every mapped key must occur in the data, so a typo in a key is caught.
+    present_set = set(present)
+    mapped_but_absent = [cat for cat in CATEGORY_MAP if cat not in present_set]
+    if mapped_but_absent:
+        raise ValueError(
+            f"CATEGORY_MAP contains categories not in data: {sorted(mapped_but_absent)}"
+        )
+
+    # Remove the rows whose category is not wanted in the output.
+    kept = raw.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))
+
+    # Split the (friendly name, emoji) pairs into one lookup table each.
+    name_mapping = {key: pair[0] for key, pair in CATEGORY_MAP.items()}
+    emoji_mapping = {key: pair[1] for key, pair in CATEGORY_MAP.items()}
+
+    # Defensive: refuse blank display values.
+    missing_names = [key for key, name in name_mapping.items() if not name]
+    if missing_names:
+        raise ValueError(f"Empty friendly names for: {missing_names}")
+    missing_emojis = [key for key, emoji in emoji_mapping.items() if not emoji]
+    if missing_emojis:
+        raise ValueError(f"Empty emojis for: {missing_emojis}")
+
+    # Both expressions are built from the raw "category" column and applied in
+    # a single with_columns call.
+    friendly_name = pl.col("category").replace_strict(name_mapping).alias("category")
+    emoji = pl.col("category").replace_strict(emoji_mapping).alias("emoji")
+    return kept.with_columns(friendly_name, emoji)
+
+
+def main():
+    """Command-line entry point: transform the raw POIs and write the result.
+
+    Reads the parquet file given by ``--input``, runs ``transform`` on it,
+    writes the collected frame to ``--output`` and prints the output size,
+    the row count and a per-category breakdown sorted by descending count.
+    """
+    cli = argparse.ArgumentParser(description="Transform raw POIs to filtered version with friendly names")
+    cli.add_argument("--input", type=Path, required=True, help="Raw POIs parquet file")
+    cli.add_argument("--output", type=Path, required=True, help="Output filtered POIs parquet file")
+    opts = cli.parse_args()
+
+    pois = transform(opts.input).collect()
+    pois.write_parquet(opts.output)
+
+    # Report the size of the file that was just written.
+    bytes_per_mb = 1024 * 1024
+    size_mb = opts.output.stat().st_size / bytes_per_mb
+    print(f"Wrote {opts.output} ({size_mb:.1f} MB, {len(pois):,} POIs)")
+
+    print(f"\nCategories ({pois['category'].n_unique()}):")
+    per_category = (
+        pois.group_by("category", "emoji")
+        .len()
+        .sort("len", descending=True)
+    )
+    for entry in per_category.iter_rows(named=True):
+        print(f"  {entry['emoji']} {entry['category']}: {entry['len']:,}")
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
--- a/pipeline/utils/test_fuzzy_join.py
+++ b/pipeline/utils/test_fuzzy_join.py
@ -6,7 +6,7 @@ POSTCODE = "E14 2DG"

 # Price paid: unique addresses for this postcode
 pp = (
-    pl.scan_parquet("data_sources/pp-complete.parquet")
+    pl.scan_parquet("data/price-paid-complete.parquet")
    .filter(pl.col("postcode") == POSTCODE)
    .select("paon", "saon", "street", "postcode")
    .unique()
@ -22,7 +22,7 @@ pp = (

 # EPC: latest inspection per address for this postcode
 epc = (
-    pl.scan_csv("data_sources/epc/certificates.csv")
+    pl.scan_csv("data/epc/certificates.csv")
    .select("ADDRESS", "POSTCODE", "INSPECTION_DATE")
    .filter(pl.col("POSTCODE").str.strip_chars() == POSTCODE)
    .sort("INSPECTION_DATE", descending=True)
--- a/pipeline/wide.py
+++ b/pipeline/wide.py
@ -1,143 +0,0 @@
-"""Build a wide property dataframe and H3 aggregates from epc_pp output."""
-
-import polars as pl
-import h3
-
-from pipeline.config import AGGREGATES_DIR, H3_RESOLUTIONS, DATA_DIR, PROCESSED_DIR
-
-
-def _build_wide() -> pl.DataFrame:
-    """Build the wide dataframe by joining epc_pp with all auxiliary data."""
-    print("Loading epc_pp...")
-    wide = pl.read_parquet(PROCESSED_DIR / "epc_pp.parquet")
-    print(f"  {wide.shape[0]:,} rows")
-
-    # GPS coordinates + LSOA from ArcGIS
-    print("Joining GPS coordinates...")
-    arcgis = pl.read_parquet(DATA_DIR / "arcgis_data.parquet").select(
-        pl.col("pcds").alias("postcode"),
-        "lat",
-        pl.col("long").alias("lon"),
-        "lsoa21",
-    )
-    wide = wide.join(arcgis, on="postcode", how="inner")
-    print(f"  {wide.shape[0]:,} rows after GPS join")
-
-    # Journey times (optional)
-    journey_path = PROCESSED_DIR / "journey_times_bank_checkpoint.parquet"
-    if journey_path.exists():
-        print("Joining journey times...")
-        journey_times = pl.read_parquet(journey_path).select(
-            "postcode",
-            "public_transport_easy_minutes",
-            "public_transport_quick_minutes",
-            "cycling_minutes",
-        )
-        wide = wide.join(journey_times, on="postcode", how="left")
-
-    # Index of Deprivation
-    iod_path = DATA_DIR / "IoD2025_Scores.parquet"
-    if iod_path.exists():
-        print("Joining IoD scores...")
-        iod = pl.read_parquet(iod_path).drop(
-            "LSOA name (2021)",
-            "Local Authority District code (2024)",
-            "Local Authority District name (2024)",
-        )
-        # Rename IoD columns to clean snake_case
-        iod = iod.rename(_IOD_RENAMES)
-        wide = wide.join(
-            iod, left_on="lsoa21", right_on="lsoa_code", how="left"
-        )
-
-    return wide
-
-
-_IOD_RENAMES = {
-    "LSOA code (2021)": "lsoa_code",
-    "Index of Multiple Deprivation (IMD) Score": "imd_score",
-    "Income Score (rate)": "income_score",
-    "Employment Score (rate)": "employment_score",
-    "Education, Skills and Training Score": "education_score",
-    "Health Deprivation and Disability Score": "health_score",
-    "Crime Score": "crime_score",
-    "Barriers to Housing and Services Score": "housing_barriers_score",
-    "Living Environment Score": "living_environment_score",
-    "Income Deprivation Affecting Children Index (IDACI) Score (rate)": "idaci_score",
-    "Income Deprivation Affecting Older People (IDAOPI) Score (rate)": "idaopi_score",
-    "Children and Young People Sub-domain Score": "children_young_people_score",
-    "Adult Skills Sub-domain Score": "adult_skills_score",
-    "Geographical Barriers Sub-domain Score": "geographical_barriers_score",
-    "Wider Barriers Sub-domain Score": "wider_barriers_score",
-    "Indoors Sub-domain Score": "indoors_score",
-    "Outdoors Sub-domain Score": "outdoors_score",
-}
-
-
-def _add_h3_indices(df: pl.DataFrame) -> pl.DataFrame:
-    """Compute H3 indices from lat/lon for all configured resolutions."""
-    print("Computing H3 indices...")
-    # Compute per unique postcode for efficiency, then join back
-    postcodes = df.select("postcode", "lat", "lon").unique(subset=["postcode"])
-
-    for res in H3_RESOLUTIONS:
-        col_name = f"h3_res{res}"
-        postcodes = postcodes.with_columns(
-            pl.struct(["lat", "lon"])
-            .map_elements(
-                lambda x, r=res: h3.latlng_to_cell(x["lat"], x["lon"], r),
-                return_dtype=pl.Utf8,
-            )
-            .alias(col_name)
-        )
-        print(f"  res{res}: {postcodes[col_name].n_unique():,} unique cells")
-
-    h3_cols = [f"h3_res{res}" for res in H3_RESOLUTIONS]
-    return df.join(
-        postcodes.select("postcode", *h3_cols), on="postcode", how="left"
-    )
-
-
-def _aggregate_to_h3(df: pl.DataFrame) -> None:
-    """Aggregate min/max of every numeric feature per H3 cell at each resolution."""
-    AGGREGATES_DIR.mkdir(parents=True, exist_ok=True)
-
-    exclude = {"lat", "lon"}
-    numeric_cols = [
-        col
-        for col, dtype in zip(df.columns, df.dtypes)
-        if dtype.is_numeric() and not col.startswith("h3_res") and col not in exclude
-    ]
-
-    agg_exprs = [pl.len().alias("count")]
-    for col in numeric_cols:
-        agg_exprs.append(pl.col(col).min().alias(f"min_{col}"))
-        agg_exprs.append(pl.col(col).max().alias(f"max_{col}"))
-
-    print("Aggregating to H3 cells...")
-    for res in H3_RESOLUTIONS:
-        h3_col = f"h3_res{res}"
-        result = df.group_by(h3_col).agg(agg_exprs).rename({h3_col: "h3"})
-        path = AGGREGATES_DIR / f"res{res}.parquet"
-        result.write_parquet(path)
-        size_mb = path.stat().st_size / (1024 * 1024)
-        print(f"  {path.name}: {result.shape[0]:,} cells ({size_mb:.1f} MB)")
-
-
-def run():
-    """Run the full wide pipeline: build wide df, compute H3, aggregate."""
-    wide = _build_wide()
-
-    wide_path = PROCESSED_DIR / "wide.parquet"
-    wide.write_parquet(wide_path)
-    size_mb = wide_path.stat().st_size / (1024 * 1024)
-    print(f"Wrote {wide_path} ({size_mb:.1f} MB)")
-
-    wide = _add_h3_indices(wide)
-    _aggregate_to_h3(wide)
-
-    print("Done.")
-
-
-if __name__ == "__main__":
-    run()