Update H3 pipeline

This commit is contained in:
Andras Schmelczer 2026-01-30 18:33:48 +00:00
parent 68b6dcf65e
commit 6122ee44da
13 changed files with 291 additions and 420 deletions

View file

@ -10,13 +10,3 @@ AGGREGATES_DIR = PROCESSED_DIR / "aggregates"
# https://h3geo.org/docs/core-library/restable/#average-area-in-m2
H3_RESOLUTIONS = [7, 8, 9, 10, 11]
DEFAULT_H3_RESOLUTION = 8
# Year filters
MIN_YEAR = 1995
MAX_YEAR = 2024
DEFAULT_MIN_YEAR = 2020
DEFAULT_MAX_YEAR = 2024
# Price filters
DEFAULT_MIN_PRICE = 0
DEFAULT_MAX_PRICE = 100_000_000

View file

@ -18,7 +18,7 @@ epc = pl.scan_csv('data_sources/epc/certificates.csv').select(
'NUMBER_HABITABLE_ROOMS',
'FLOOR_HEIGHT',
'CONSTRUCTION_AGE_BAND'
).sort('INSPECTION_DATE', descending=True).group_by('epc_address').first()
).filter(pl.col('epc_address').is_not_null()).sort('INSPECTION_DATE', descending=True).group_by('epc_address', 'POSTCODE').first()
print("EPC dataset")
@ -39,7 +39,8 @@ price_paid = (pl.scan_parquet('data_sources/pp-complete.parquet').select(
'locality',
'town_city',
pl.col('duration').replace(duration_map)
).filter(pl.col('pp_property_type') != 'Other').with_columns(
)
.filter(pl.col('pp_property_type') != 'Other').with_columns(
pl.concat_str(
[pl.col('saon'), pl.col('paon'), pl.col('street')],
separator=' ',
@ -58,30 +59,27 @@ price_paid = (pl.scan_parquet('data_sources/pp-complete.parquet').select(
pl.col('price').last().alias('latest_price'),
pl.col('date_of_transfer').last(),
)
)
).filter(pl.col('pp_address').is_not_null())
print("Price paid dataset")
print(price_paid.head().collect())
price_paid_df = price_paid.collect()
epc_df = epc.collect()
joined = fuzzy_join_on_postcode(
left=price_paid_df,
right=epc_df,
left=price_paid,
right=epc,
left_address_col='pp_address',
right_address_col='epc_address',
left_postcode_col='postcode',
right_postcode_col='POSTCODE',
score_threshold=80,
).drop('POSTCODE')
).drop('POSTCODE').collect()
matched_count = joined.filter(pl.col('epc_address').is_not_null() & pl.col('pp_address').is_not_null()).height
print(f"Unique properties: {price_paid_df.height}")
print(f"Matched: {matched_count} ({100 * matched_count / price_paid_df.height:.1f}%)")
print(f"Unmatched: {price_paid_df.height - matched_count}")
matched = joined.filter(pl.col('epc_address').is_not_null() & pl.col('pp_address').is_not_null())
total = joined.height
print(f"Unique properties: {total}")
print(f"Matched: {matched.height} ({100 * matched.height / total:.1f}%)")
print(f"Unmatched: {total - matched.height}")
joined = joined.rename({col: col.lower() for col in joined.columns})
matched = matched.rename({col: col.lower() for col in joined.columns})
print(joined.head())
joined.write_parquet('data_sources/processed/epc_pp.parquet')
print(matched.head())
matched.write_parquet('data_sources/processed/epc_pp.parquet')

View file

@ -1,6 +1,9 @@
import re
import shutil
import tempfile
from concurrent.futures import ProcessPoolExecutor
from os import cpu_count
from pathlib import Path
import polars as pl
from thefuzz import fuzz
@ -9,105 +12,143 @@ from tqdm import tqdm
_NUMBER_RE = re.compile(r'\d+')
def _normalize(s: pl.Expr) -> pl.Expr:
return (
s.str.to_uppercase()
.str.replace_all(r'[,.\-]', ' ')
.str.replace_all(r'\s+', ' ')
.str.strip_chars()
)
def fuzzy_join_on_postcode(
left: pl.DataFrame,
right: pl.DataFrame,
left: pl.LazyFrame,
right: pl.LazyFrame,
left_address_col: str,
right_address_col: str,
left_postcode_col: str,
right_postcode_col: str,
score_threshold: int = 80,
) -> pl.DataFrame:
"""Fuzzy join two DataFrames by matching addresses within postcode buckets.
) -> pl.LazyFrame:
"""Fuzzy join two LazyFrames by matching addresses within postcode buckets.
Returns the left DataFrame with all right columns appended.
Unmatched rows have null right columns.
Sinks each side to a temporary parquet file so the upstream pipeline
executes only once. The matching phase collects just three narrow
columns (index, address, postcode) via projection pushdown, and the
final join reads the remaining columns lazily.
Returns a LazyFrame with all left and right columns. Unmatched rows
have null right columns.
"""
def _normalize(s: pl.Expr) -> pl.Expr:
return (
s.str.to_uppercase()
.str.replace_all(r'[,.\-]', ' ')
.str.replace_all(r'\s+', ' ')
.str.strip_chars()
tmpdir = tempfile.mkdtemp(prefix='fuzzy_join_')
left_path = Path(tmpdir) / 'left.parquet'
right_path = Path(tmpdir) / 'right.parquet'
try:
# Materialise each side exactly once, with a row index, to temp parquet.
left.with_row_index('_left_idx').sink_parquet(left_path)
right.with_row_index('_right_idx').sink_parquet(right_path)
# Collect only the narrow columns needed for matching (projection pushdown).
left_match = (
pl.scan_parquet(left_path)
.select(
'_left_idx',
_normalize(pl.col(left_address_col)).alias('_left_address'),
pl.col(left_postcode_col).str.strip_chars().str.to_uppercase().alias('_left_postcode'),
)
.collect()
)
left = left.with_columns(
_normalize(pl.col(left_address_col)).alias('_left_address'),
pl.col(left_postcode_col).str.strip_chars().str.to_uppercase().alias('_left_postcode'),
)
right = right.with_columns(
_normalize(pl.col(right_address_col)).alias('_right_address'),
pl.col(right_postcode_col).str.strip_chars().str.to_uppercase().alias('_right_postcode'),
)
right_match = (
pl.scan_parquet(right_path)
.select(
'_right_idx',
_normalize(pl.col(right_address_col)).alias('_right_address'),
pl.col(right_postcode_col).str.strip_chars().str.to_uppercase().alias('_right_postcode'),
)
.unique(subset=['_right_address', '_right_postcode'], keep='first')
.collect()
)
# Deduplicate right side on normalized address + postcode so that
# variant spellings of the same address don't consume multiple slots.
right = right.unique(subset=['_right_address', '_right_postcode'], keep='first')
# Group right side by postcode for fast lookup
right_by_postcode: dict[str, list[tuple[int, str]]] = {}
for i, (postcode, address) in enumerate(
zip(right['_right_postcode'], right['_right_address'])
):
if postcode is not None:
right_by_postcode.setdefault(postcode, []).append((i, address))
# Group left side by postcode
left_by_postcode: dict[str, list[tuple[int, str]]] = {}
for left_row, (postcode, address) in enumerate(
zip(left['_left_postcode'], left['_left_address'])
):
if address is not None and postcode is not None:
left_by_postcode.setdefault(postcode, []).append((left_row, address))
# Build tasks for each postcode bucket
tasks = [
(left_entries, right_by_postcode[postcode], score_threshold)
for postcode, left_entries in left_by_postcode.items()
if postcode in right_by_postcode
]
# Score all pairwise matches in parallel, then greedily assign from
# highest score downward so best pairs lock in first.
all_pairs: list[tuple[int, int, int]] = [] # (score, left_row, right_row)
with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
for pairs in tqdm(
executor.map(_score_bucket, tasks, chunksize=64),
total=len(tasks),
desc='Fuzzy matching',
# Group right side by postcode for fast lookup
right_by_postcode: dict[str, list[tuple[int, str]]] = {}
for idx, postcode, address in zip(
right_match['_right_idx'], right_match['_right_postcode'], right_match['_right_address']
):
all_pairs.extend(pairs)
if postcode is not None:
right_by_postcode.setdefault(postcode, []).append((idx, address))
# Sort descending by score so best matches are assigned first
all_pairs.sort(key=lambda t: (t[0], -t[1]), reverse=True)
# Group left side by postcode
left_by_postcode: dict[str, list[tuple[int, str]]] = {}
for idx, postcode, address in zip(
left_match['_left_idx'], left_match['_left_postcode'], left_match['_left_address']
):
if address is not None and postcode is not None:
left_by_postcode.setdefault(postcode, []).append((idx, address))
match_indices: list[int | None] = [None] * len(left)
matched_left: set[int] = set()
matched_right: set[int] = set()
del left_match, right_match
for score, left_row, right_row in all_pairs:
if left_row in matched_left or right_row in matched_right:
continue
match_indices[left_row] = right_row
matched_left.add(left_row)
matched_right.add(right_row)
# Build tasks for each postcode bucket
tasks = [
(left_entries, right_by_postcode[postcode])
for postcode, left_entries in left_by_postcode.items()
if postcode in right_by_postcode
]
# Select right columns (excluding internal helpers)
right_cols = right.select(pl.exclude('_right_address', '_right_postcode'))
right_matched = right_cols[
[i if i is not None else 0 for i in match_indices]
]
# Score all pairwise matches in parallel, then greedily assign from
# highest score downward so best pairs lock in first.
all_pairs: list[tuple[int, int, int]] = [] # (score, left_idx, right_idx)
with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
for pairs in tqdm(
executor.map(_score_bucket, tasks, chunksize=64),
total=len(tasks),
desc='Fuzzy matching',
):
all_pairs.extend(pairs)
# Null out unmatched rows
mask = pl.Series('_matched', [i is not None for i in match_indices])
right_matched = right_matched.with_columns(
pl.when(mask).then(pl.col(c)).otherwise(pl.lit(None)).alias(c)
for c in right_matched.columns
)
del tasks, left_by_postcode, right_by_postcode
left_clean = left.select(pl.exclude('_left_address', '_left_postcode'))
return pl.concat([left_clean, right_matched], how='horizontal')
# Sort descending by score so best matches are assigned first
all_pairs.sort(key=lambda t: (t[0], -t[1]), reverse=True)
matches: list[tuple[int, int]] = []
matched_left: set[int] = set()
matched_right: set[int] = set()
for _score, left_idx, right_idx in all_pairs:
if left_idx in matched_left or right_idx in matched_right:
continue
matches.append((left_idx, right_idx))
matched_left.add(left_idx)
matched_right.add(right_idx)
del all_pairs, matched_left, matched_right
# Build a small mapping LazyFrame and join back to the cached parquets.
if matches:
mapping = pl.LazyFrame({
'_left_idx': pl.Series([m[0] for m in matches], dtype=pl.UInt32),
'_right_idx': pl.Series([m[1] for m in matches], dtype=pl.UInt32),
})
else:
mapping = pl.LazyFrame({
'_left_idx': pl.Series([], dtype=pl.UInt32),
'_right_idx': pl.Series([], dtype=pl.UInt32),
})
left_cached = pl.scan_parquet(left_path)
right_cached = pl.scan_parquet(right_path)
return (
left_cached
.join(mapping, on='_left_idx', how='left')
.join(right_cached, on='_right_idx', how='left')
.drop('_left_idx', '_right_idx')
)
except BaseException:
shutil.rmtree(tmpdir, ignore_errors=True)
raise
def _numbers_compatible(a: str, b: str) -> bool:
@ -127,13 +168,12 @@ def _score_bucket(
args: tuple[list[tuple[int, str]], list[tuple[int, str]], int],
) -> list[tuple[int, int, int]]:
"""Score all address pairs within a single postcode bucket."""
left_entries, right_entries, score_threshold = args
left_entries, right_entries = args
pairs = []
for left_row, left_address in left_entries:
for right_row, right_address in right_entries:
if not _numbers_compatible(left_address, right_address):
continue
score = fuzz.token_sort_ratio(left_address, right_address)
if score >= score_threshold:
pairs.append((score, left_row, right_row))
pairs.append((score, left_row, right_row))
return pairs

View file

@ -1,42 +0,0 @@
from pathlib import Path
import polars as pl
from pipeline.config import AGGREGATES_DIR, H3_RESOLUTIONS
def aggregate(df: pl.LazyFrame, resolution: int) -> pl.LazyFrame:
"""Aggregate property data by H3 cell and year."""
h3_col = f"h3_res{resolution}"
return (
df.group_by(h3_col, "year")
.agg(
pl.len().alias("count"),
pl.col("price").mean().alias("avg_price"),
pl.col("price").median().alias("median_price"),
pl.col("price").min().alias("min_price"),
pl.col("price").max().alias("max_price"),
)
.rename({h3_col: "h3"})
)
def aggregate_all(df: pl.LazyFrame) -> dict[int, pl.LazyFrame]:
"""Aggregate at all H3 resolutions."""
return {res: aggregate(df, res) for res in H3_RESOLUTIONS}
def save_aggregates(df: pl.LazyFrame, output_dir: Path | None = None) -> list[Path]:
"""Aggregate and save at all H3 resolutions."""
output_dir = output_dir or AGGREGATES_DIR
output_dir.mkdir(parents=True, exist_ok=True)
saved_paths = []
aggregates = aggregate_all(df)
for res, agg_df in aggregates.items():
output_path = output_dir / f"res{res}.parquet"
agg_df.collect().write_parquet(output_path)
saved_paths.append(output_path)
return saved_paths

View file

@ -1,127 +0,0 @@
"""Aggregate journey times data by H3 hexagonal cells."""
from pathlib import Path
import polars as pl
from pipeline.config import AGGREGATES_DIR, H3_RESOLUTIONS, PROCESSED_DIR
JOURNEY_COLS = [
"public_transport_easy_minutes",
"public_transport_quick_minutes",
"cycling_minutes",
]
AGGREGATE_COLS = [
"median_pt_easy_minutes",
"median_pt_quick_minutes",
"median_cycling_minutes",
"median_journey_minutes",
]
def aggregate_journey_times(
journey_times_path: Path | None = None,
postcodes_h3_path: Path | None = None,
aggregates_dir: Path | None = None,
) -> list[Path]:
"""
Add journey times to existing H3 aggregate parquet files.
Joins journey_times_bank_checkpoint.parquet with postcodes_h3.parquet on postcode,
aggregates by H3 cell, then merges into existing res{N}.parquet files.
"""
journey_times_path = (
journey_times_path
or PROCESSED_DIR / "journey_times_bank_checkpoint.parquet"
)
postcodes_h3_path = postcodes_h3_path or PROCESSED_DIR / "postcodes_h3.parquet"
aggregates_dir = aggregates_dir or AGGREGATES_DIR
# Load journey times data
journey_df = pl.read_parquet(journey_times_path).select(
["postcode"] + JOURNEY_COLS
)
# Filter out rows where all journey time columns are null
journey_df = journey_df.filter(
pl.any_horizontal(pl.col(c).is_not_null() for c in JOURNEY_COLS)
)
if journey_df.height == 0:
print("No valid journey times found")
return []
# Load postcodes with H3 indices
postcodes_df = pl.read_parquet(postcodes_h3_path)
# Join on postcode to get H3 indices
joined_df = journey_df.join(postcodes_df, on="postcode", how="inner")
if joined_df.height == 0:
print("No matching postcodes found")
return []
print(f"Joined {joined_df.height} postcodes with journey times")
updated_paths = []
for resolution in H3_RESOLUTIONS:
h3_col = f"h3_res{resolution}"
parquet_path = aggregates_dir / f"res{resolution}.parquet"
if not parquet_path.exists():
print(f"Skipping resolution {resolution} - {parquet_path} not found")
continue
if h3_col not in joined_df.columns:
print(f"Skipping resolution {resolution} - column {h3_col} not found")
continue
# Aggregate journey times by H3 cell
journey_agg = (
joined_df.group_by(h3_col)
.agg(
pl.col("public_transport_easy_minutes")
.median()
.alias("median_pt_easy_minutes"),
pl.col("public_transport_quick_minutes")
.median()
.alias("median_pt_quick_minutes"),
pl.col("cycling_minutes")
.median()
.alias("median_cycling_minutes"),
pl.col("public_transport_quick_minutes")
.median()
.alias("median_journey_minutes"),
)
.rename({h3_col: "h3"})
)
# Load existing parquet
existing_df = pl.read_parquet(parquet_path)
# Drop existing journey time columns if present
existing_df = existing_df.drop(
[c for c in AGGREGATE_COLS if c in existing_df.columns]
)
# Left join journey times onto existing data
updated_df = existing_df.join(journey_agg, on="h3", how="left")
# Save back to parquet
updated_df.write_parquet(parquet_path)
updated_paths.append(parquet_path)
matched = updated_df.filter(
pl.col("median_journey_minutes").is_not_null()
).height
print(
f"Updated {parquet_path.name}: {matched} rows with journey times "
f"(out of {updated_df.height} total)"
)
return updated_paths
if __name__ == "__main__":
aggregate_journey_times()

View file

@ -1,45 +1,6 @@
"""Pipeline CLI to process property data with H3 spatial indexing."""
import polars as pl
from pipeline.sources.postcodes import save_postcodes
from pipeline.sources.property_prices import PropertyPricesSource
from pipeline.processors.h3_aggregator import save_aggregates
from pipeline.processors.journey_times_aggregator import aggregate_journey_times
def run_pipeline():
"""Run the full data processing pipeline."""
print("=" * 60)
print("Property Map Data Pipeline")
print("=" * 60)
# Step 1: Process postcodes with H3 indices
print("\n[1/4] Processing postcodes with H3 indices...")
postcodes_path = save_postcodes()
print(f" Saved: {postcodes_path}")
print("\n[2/4] Processing property prices...")
postcodes = pl.scan_parquet(postcodes_path)
property_source = PropertyPricesSource()
properties = property_source.process(postcodes)
print(" Joined property prices with postcodes")
print("\n[3/4] Aggregating at H3 resolutions...")
saved_paths = save_aggregates(properties)
for path in saved_paths:
size_mb = path.stat().st_size / (1024 * 1024)
print(f" Saved: {path.name} ({size_mb:.1f} MB)")
print("\n[4/4] Adding journey times to aggregates...")
updated_paths = aggregate_journey_times()
if updated_paths:
for path in updated_paths:
size_mb = path.stat().st_size / (1024 * 1024)
print(f" Updated: {path.name} ({size_mb:.1f} MB)")
else:
print(" Skipped (no journey time data found)")
from pipeline.wide import run
if __name__ == "__main__":
run_pipeline()
run()

View file

@ -1,49 +0,0 @@
from pathlib import Path
import polars as pl
import h3
from pipeline.config import DATA_DIR, H3_RESOLUTIONS, PROCESSED_DIR
def lat_long_to_h3(lat: float, long: float, resolution: int) -> str:
"""Convert lat/long to H3 index at given resolution."""
return h3.latlng_to_cell(lat, long, resolution)
def load_postcodes() -> pl.LazyFrame:
"""Load postcode data from arcgis parquet file."""
return pl.scan_parquet(DATA_DIR / "arcgis_data.parquet").select(
pl.col("pcds").alias("postcode"),
pl.col("lat"),
pl.col("long"),
)
def process_postcodes() -> pl.LazyFrame:
"""Process postcodes and add H3 indices at multiple resolutions."""
df = load_postcodes().collect()
for res in H3_RESOLUTIONS:
col_name = f"h3_res{res}"
df = df.with_columns(
pl.struct(["lat", "long"])
.map_elements(
# Capture res by value using default argument to avoid closure bug
lambda x, res=res: lat_long_to_h3(x["lat"], x["long"], res),
return_dtype=pl.Utf8,
)
.alias(col_name)
)
return df.lazy()
def save_postcodes(output_path: Path | None = None) -> Path:
"""Process and save postcodes with H3 indices."""
output_path = output_path or PROCESSED_DIR / "postcodes_h3.parquet"
output_path.parent.mkdir(parents=True, exist_ok=True)
df = process_postcodes().collect()
df.write_parquet(output_path)
return output_path

View file

@ -1,41 +0,0 @@
import polars as pl
from pipeline.base import DataSource
from pipeline.config import DATA_DIR, H3_RESOLUTIONS
class PropertyPricesSource(DataSource):
"""Land Registry property prices data source."""
@property
def name(self) -> str:
return "property_prices"
def load(self) -> pl.LazyFrame:
"""Load raw property prices data."""
return pl.scan_parquet(DATA_DIR / "pp-complete.parquet")
def process(self, postcodes: pl.LazyFrame) -> pl.LazyFrame:
"""Process and join with postcode coordinates and H3 indices."""
prices = self.load().select(
pl.col("price"),
pl.col("date_of_transfer").dt.year().alias("year"),
pl.col("property_type"),
pl.col("postcode"),
)
joined = prices.join(
postcodes,
on="postcode",
how="inner",
)
h3_cols = [pl.col(f"h3_res{res}") for res in H3_RESOLUTIONS]
return joined.select(
pl.col("price"),
pl.col("year"),
pl.col("property_type"),
pl.col("lat"),
pl.col("long"),
*h3_cols,
)

View file

@ -9,7 +9,6 @@ pp = (
pl.scan_parquet("data_sources/pp-complete.parquet")
.filter(pl.col("postcode") == POSTCODE)
.select("paon", "saon", "street", "postcode")
.collect()
.unique()
.sort("saon")
.with_columns(
@ -27,14 +26,10 @@ epc = (
.select("ADDRESS", "POSTCODE", "INSPECTION_DATE")
.filter(pl.col("POSTCODE").str.strip_chars() == POSTCODE)
.sort("INSPECTION_DATE", descending=True)
.collect()
.unique("ADDRESS")
.sort("ADDRESS")
)
print(f"Price paid: {len(pp)} unique addresses")
print(f"EPC: {len(epc)} unique addresses")
result = fuzzy_join_on_postcode(
left=pp,
right=epc,
@ -42,9 +37,7 @@ result = fuzzy_join_on_postcode(
right_address_col="ADDRESS",
left_postcode_col="postcode",
right_postcode_col="POSTCODE",
score_threshold=80,
)
).collect()
snapshot = result.select("pp_address", "ADDRESS").sort("pp_address")

143
pipeline/wide.py Normal file
View file

@ -0,0 +1,143 @@
"""Build a wide property dataframe and H3 aggregates from epc_pp output."""
import polars as pl
import h3
from pipeline.config import AGGREGATES_DIR, H3_RESOLUTIONS, DATA_DIR, PROCESSED_DIR
def _build_wide() -> pl.DataFrame:
"""Build the wide dataframe by joining epc_pp with all auxiliary data."""
print("Loading epc_pp...")
wide = pl.read_parquet(PROCESSED_DIR / "epc_pp.parquet")
print(f" {wide.shape[0]:,} rows")
# GPS coordinates + LSOA from ArcGIS
print("Joining GPS coordinates...")
arcgis = pl.read_parquet(DATA_DIR / "arcgis_data.parquet").select(
pl.col("pcds").alias("postcode"),
"lat",
pl.col("long").alias("lon"),
"lsoa21",
)
wide = wide.join(arcgis, on="postcode", how="inner")
print(f" {wide.shape[0]:,} rows after GPS join")
# Journey times (optional)
journey_path = PROCESSED_DIR / "journey_times_bank_checkpoint.parquet"
if journey_path.exists():
print("Joining journey times...")
journey_times = pl.read_parquet(journey_path).select(
"postcode",
"public_transport_easy_minutes",
"public_transport_quick_minutes",
"cycling_minutes",
)
wide = wide.join(journey_times, on="postcode", how="left")
# Index of Deprivation
iod_path = DATA_DIR / "IoD2025_Scores.parquet"
if iod_path.exists():
print("Joining IoD scores...")
iod = pl.read_parquet(iod_path).drop(
"LSOA name (2021)",
"Local Authority District code (2024)",
"Local Authority District name (2024)",
)
# Rename IoD columns to clean snake_case
iod = iod.rename(_IOD_RENAMES)
wide = wide.join(
iod, left_on="lsoa21", right_on="lsoa_code", how="left"
)
return wide
_IOD_RENAMES = {
"LSOA code (2021)": "lsoa_code",
"Index of Multiple Deprivation (IMD) Score": "imd_score",
"Income Score (rate)": "income_score",
"Employment Score (rate)": "employment_score",
"Education, Skills and Training Score": "education_score",
"Health Deprivation and Disability Score": "health_score",
"Crime Score": "crime_score",
"Barriers to Housing and Services Score": "housing_barriers_score",
"Living Environment Score": "living_environment_score",
"Income Deprivation Affecting Children Index (IDACI) Score (rate)": "idaci_score",
"Income Deprivation Affecting Older People (IDAOPI) Score (rate)": "idaopi_score",
"Children and Young People Sub-domain Score": "children_young_people_score",
"Adult Skills Sub-domain Score": "adult_skills_score",
"Geographical Barriers Sub-domain Score": "geographical_barriers_score",
"Wider Barriers Sub-domain Score": "wider_barriers_score",
"Indoors Sub-domain Score": "indoors_score",
"Outdoors Sub-domain Score": "outdoors_score",
}
def _add_h3_indices(df: pl.DataFrame) -> pl.DataFrame:
"""Compute H3 indices from lat/lon for all configured resolutions."""
print("Computing H3 indices...")
# Compute per unique postcode for efficiency, then join back
postcodes = df.select("postcode", "lat", "lon").unique(subset=["postcode"])
for res in H3_RESOLUTIONS:
col_name = f"h3_res{res}"
postcodes = postcodes.with_columns(
pl.struct(["lat", "lon"])
.map_elements(
lambda x, r=res: h3.latlng_to_cell(x["lat"], x["lon"], r),
return_dtype=pl.Utf8,
)
.alias(col_name)
)
print(f" res{res}: {postcodes[col_name].n_unique():,} unique cells")
h3_cols = [f"h3_res{res}" for res in H3_RESOLUTIONS]
return df.join(
postcodes.select("postcode", *h3_cols), on="postcode", how="left"
)
def _aggregate_to_h3(df: pl.DataFrame) -> None:
"""Aggregate min/max of every numeric feature per H3 cell at each resolution."""
AGGREGATES_DIR.mkdir(parents=True, exist_ok=True)
exclude = {"lat", "lon"}
numeric_cols = [
col
for col, dtype in zip(df.columns, df.dtypes)
if dtype.is_numeric() and not col.startswith("h3_res") and col not in exclude
]
agg_exprs = [pl.len().alias("count")]
for col in numeric_cols:
agg_exprs.append(pl.col(col).min().alias(f"min_{col}"))
agg_exprs.append(pl.col(col).max().alias(f"max_{col}"))
print("Aggregating to H3 cells...")
for res in H3_RESOLUTIONS:
h3_col = f"h3_res{res}"
result = df.group_by(h3_col).agg(agg_exprs).rename({h3_col: "h3"})
path = AGGREGATES_DIR / f"res{res}.parquet"
result.write_parquet(path)
size_mb = path.stat().st_size / (1024 * 1024)
print(f" {path.name}: {result.shape[0]:,} cells ({size_mb:.1f} MB)")
def run():
"""Run the full wide pipeline: build wide df, compute H3, aggregate."""
wide = _build_wide()
wide_path = PROCESSED_DIR / "wide.parquet"
wide.write_parquet(wide_path)
size_mb = wide_path.stat().st_size / (1024 * 1024)
print(f"Wrote {wide_path} ({size_mb:.1f} MB)")
wide = _add_h3_indices(wide)
_aggregate_to_h3(wide)
print("Done.")
if __name__ == "__main__":
run()