Format python

This commit is contained in:
Andras Schmelczer 2026-01-31 13:07:09 +00:00
parent 85f5770e09
commit 4c258018c3
17 changed files with 348 additions and 248 deletions

View file

@ -21,47 +21,43 @@ tasks:
- cd frontend && npm install - cd frontend && npm install
download:arcgis: download:arcgis:
internal: true
desc: Download and convert ArcGIS postcode data desc: Download and convert ArcGIS postcode data
sources:
- pipeline/download/arcgis.py
generates: generates:
- "{{.ARCGIS_OUTPUT}}" - "{{.ARCGIS_OUTPUT}}"
cmds: cmds:
- uv run python -m pipeline.download.arcgis --output {{.ARCGIS_OUTPUT}} - uv run python -m pipeline.download.arcgis --output {{.ARCGIS_OUTPUT}}
download:price-paid: download:price-paid:
internal: true
desc: Download and convert Land Registry price-paid data desc: Download and convert Land Registry price-paid data
sources:
- pipeline/download/price_paid.py
generates: generates:
- "{{.PRICE_PAID_OUTPUT}}" - "{{.PRICE_PAID_OUTPUT}}"
cmds: cmds:
- uv run python -m pipeline.download.price_paid --output {{.PRICE_PAID_OUTPUT}} - uv run python -m pipeline.download.price_paid --output {{.PRICE_PAID_OUTPUT}}
download:deprivation: download:deprivation:
internal: true
desc: Download and convert Index of Deprivation data desc: Download and convert Index of Deprivation data
sources:
- pipeline/download/deprivation_data.py
generates: generates:
- "{{.IOD_OUTPUT}}" - "{{.IOD_OUTPUT}}"
cmds: cmds:
- uv run python -m pipeline.download.deprivation_data --output {{.IOD_OUTPUT}} - uv run python -m pipeline.download.deprivation_data --output {{.IOD_OUTPUT}}
download:pois: download:pois:
internal: true
desc: Download and extract POIs from OpenStreetMap desc: Download and extract POIs from OpenStreetMap
sources:
- pipeline/download/pois.py
generates: generates:
- "{{.POIS_RAW_OUTPUT}}" - "{{.POIS_RAW_OUTPUT}}"
cmds: cmds:
- uv run python -m pipeline.download.pois --output {{.POIS_RAW_OUTPUT}} - uv run python -m pipeline.download.pois --output {{.POIS_RAW_OUTPUT}}
transform:pois: transform:pois:
internal: true
desc: Transform raw POIs to filtered version with friendly names desc: Transform raw POIs to filtered version with friendly names
deps: deps:
- download:pois - download:pois
sources: sources:
- pipeline/transform/transform_poi.py
- "{{.POIS_RAW_OUTPUT}}" - "{{.POIS_RAW_OUTPUT}}"
generates: generates:
- "{{.POIS_FILTERED_OUTPUT}}" - "{{.POIS_FILTERED_OUTPUT}}"
@ -69,12 +65,11 @@ tasks:
- uv run python -m pipeline.transform.transform_poi --input {{.POIS_RAW_OUTPUT}} --output {{.POIS_FILTERED_OUTPUT}} - uv run python -m pipeline.transform.transform_poi --input {{.POIS_RAW_OUTPUT}} --output {{.POIS_FILTERED_OUTPUT}}
transform:epc-pp: transform:epc-pp:
internal: true
desc: Fuzzy join EPC and Price Paid data desc: Fuzzy join EPC and Price Paid data
deps: deps:
- download:price-paid - download:price-paid
sources: sources:
- pipeline/transform/join_epc_pp.py
- pipeline/utils/fuzzy_join.py
- "{{.PRICE_PAID_OUTPUT}}" - "{{.PRICE_PAID_OUTPUT}}"
- "{{.EPC_CSV}}" - "{{.EPC_CSV}}"
generates: generates:
@ -83,13 +78,12 @@ tasks:
- uv run python -m pipeline.transform.join_epc_pp --epc {{.EPC_CSV}} --price-paid {{.PRICE_PAID_OUTPUT}} --output {{.EPC_PP_OUTPUT}} - uv run python -m pipeline.transform.join_epc_pp --epc {{.EPC_CSV}} --price-paid {{.PRICE_PAID_OUTPUT}} --output {{.EPC_PP_OUTPUT}}
transform:poi-proximity: transform:poi-proximity:
internal: true
desc: Compute POI proximity counts per postcode desc: Compute POI proximity counts per postcode
deps: deps:
- download:arcgis - download:arcgis
- transform:pois - transform:pois
sources: sources:
- pipeline/transform/poi_proximity.py
- pipeline/utils/poi_counts.py
- "{{.ARCGIS_OUTPUT}}" - "{{.ARCGIS_OUTPUT}}"
- "{{.POIS_FILTERED_OUTPUT}}" - "{{.POIS_FILTERED_OUTPUT}}"
generates: generates:
@ -97,7 +91,7 @@ tasks:
cmds: cmds:
- uv run python -m pipeline.transform.poi_proximity --arcgis {{.ARCGIS_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}} --output {{.POI_PROXIMITY_OUTPUT}} - uv run python -m pipeline.transform.poi_proximity --arcgis {{.ARCGIS_OUTPUT}} --pois {{.POIS_FILTERED_OUTPUT}} --output {{.POI_PROXIMITY_OUTPUT}}
transform:wide: prepare:
desc: Build wide property dataframe with all joins desc: Build wide property dataframe with all joins
deps: deps:
- join:epc-pp - join:epc-pp
@ -105,7 +99,6 @@ tasks:
- download:deprivation - download:deprivation
- transform:poi-proximity - transform:poi-proximity
sources: sources:
- pipeline/transform/merge.py
- "{{.EPC_PP_OUTPUT}}" - "{{.EPC_PP_OUTPUT}}"
- "{{.ARCGIS_OUTPUT}}" - "{{.ARCGIS_OUTPUT}}"
- "{{.IOD_OUTPUT}}" - "{{.IOD_OUTPUT}}"
@ -115,36 +108,37 @@ tasks:
cmds: cmds:
- uv run python -m pipeline.transform.merge --epc-pp {{.EPC_PP_OUTPUT}} --arcgis {{.ARCGIS_OUTPUT}} --iod {{.IOD_OUTPUT}} --poi-proximity {{.POI_PROXIMITY_OUTPUT}} --journey-times {{.JOURNEY_TIMES}} --output {{.WIDE_OUTPUT}} - uv run python -m pipeline.transform.merge --epc-pp {{.EPC_PP_OUTPUT}} --arcgis {{.ARCGIS_OUTPUT}} --iod {{.IOD_OUTPUT}} --poi-proximity {{.POI_PROXIMITY_OUTPUT}} --journey-times {{.JOURNEY_TIMES}} --output {{.WIDE_OUTPUT}}
prepare:
desc: Prepare the application (install, download data, run pipeline)
deps:
- transform:wide
test: test:
cmds: cmds:
- uv run -m pipeline.utils.test_fuzzy_join - uv run -m pipeline.utils.test_fuzzy_join
- uv run pytest pipeline/utils/test_haversine.py - uv run pytest pipeline/utils/test_haversine.py
- uv run pytest pipeline/utils/test_poi_counts.py - uv run pytest pipeline/utils/test_poi_counts.py
server: dev:server:
desc: Run Rust backend on port 8001 desc: Run Rust backend on port 8001
dir: server-rs dir: server-rs
cmds: cmds:
- cargo run --release -- {{.WIDE_OUTPUT}} - cargo run --release -- {{.WIDE_OUTPUT}}
frontend: dev:frontend:
desc: Run frontend dev server on port 3030 (proxies /api to :8001) desc: Run frontend dev server on port 3030 (proxies /api to :8001)
dir: frontend dir: frontend
cmds: cmds:
- npm run dev - npm run dev
build: build:server:
desc: Build server for production
dir: frontend
cmds:
- cargo build --release
build:frontend:
desc: Build frontend for production desc: Build frontend for production
dir: frontend dir: frontend
cmds: cmds:
- npm run typecheck
- npm run build - npm run build
lint: lint:
desc: Lint all code (Python, TypeScript, and Rust) desc: Lint all code (Python, TypeScript, and Rust)
cmds: cmds:
@ -195,17 +189,13 @@ tasks:
desc: Format Rust code with cargo fmt desc: Format Rust code with cargo fmt
dir: server-rs dir: server-rs
cmds: cmds:
- cargo fmt - cargo fmt --all
check: check:
desc: Run all checks (lint, typecheck, build) desc: Run all checks (lint, typecheck, build)
cmds: cmds:
- task: lint - task: lint
- task: typecheck - task: build:server
- task: build - task: build:frontend
- task: test
typecheck:
desc: Type check frontend TypeScript code
dir: frontend
cmds:
- npm run typecheck

View file

@ -35,7 +35,6 @@ def download_with_progress(url: str, output_path: Path) -> None:
return return
def extract_zip(zip_path: Path, extract_path: Path) -> None: def extract_zip(zip_path: Path, extract_path: Path) -> None:
extract_path.mkdir(exist_ok=True) extract_path.mkdir(exist_ok=True)
@ -44,7 +43,7 @@ def extract_zip(zip_path: Path, extract_path: Path) -> None:
def convert_to_parquet(data_path: Path, parquet_path: Path) -> None: def convert_to_parquet(data_path: Path, parquet_path: Path) -> None:
df = pl.scan_csv(data_path / 'Data/NSPL_MAY_2025_UK.csv', try_parse_dates=True) df = pl.scan_csv(data_path / "Data/NSPL_MAY_2025_UK.csv", try_parse_dates=True)
print(f"Columns: {df.collect_schema().names()}") print(f"Columns: {df.collect_schema().names()}")
parquet_path.parent.mkdir(parents=True, exist_ok=True) parquet_path.parent.mkdir(parents=True, exist_ok=True)
df.sink_parquet(parquet_path, compression="zstd") df.sink_parquet(parquet_path, compression="zstd")
@ -52,8 +51,12 @@ def convert_to_parquet(data_path: Path, parquet_path: Path) -> None:
def main() -> None: def main() -> None:
parser = argparse.ArgumentParser(description="Download and convert ArcGIS postcode data") parser = argparse.ArgumentParser(
parser.add_argument("--output", type=Path, required=True, help="Output parquet file path") description="Download and convert ArcGIS postcode data"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
args = parser.parse_args() args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir: with tempfile.TemporaryDirectory() as cache_dir:
@ -64,5 +67,6 @@ def main() -> None:
extract_zip(download_path, extract_path) extract_zip(download_path, extract_path)
convert_to_parquet(extract_path, args.output) convert_to_parquet(extract_path, args.output)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View file

@ -41,8 +41,12 @@ def convert_to_parquet(xlsx_path: Path, parquet_path: Path) -> None:
def main() -> None: def main() -> None:
parser = argparse.ArgumentParser(description="Download and convert Index of Deprivation data") parser = argparse.ArgumentParser(
parser.add_argument("--output", type=Path, required=True, help="Output parquet file path") description="Download and convert Index of Deprivation data"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
args = parser.parse_args() args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir: with tempfile.TemporaryDirectory() as cache_dir:

View file

@ -8,16 +8,12 @@ import osmium
import polars as pl import polars as pl
from tqdm import tqdm from tqdm import tqdm
from pathlib import Path
BATCH_SIZE = 50_000 BATCH_SIZE = 50_000
MIN_OCCURENCE_COUNT = 20 MIN_OCCURENCE_COUNT = 20
GEOFABRIK_GB_URL = ( GEOFABRIK_GB_URL = "https://download.geofabrik.de/europe/great-britain-latest.osm.pbf"
"https://download.geofabrik.de/europe/great-britain-latest.osm.pbf"
)
UK_BBOX_WEST = -7.57 UK_BBOX_WEST = -7.57
UK_BBOX_SOUTH = 49.96 UK_BBOX_SOUTH = 49.96
@ -38,7 +34,6 @@ POI_TAG_KEYS: list[str] = [
] ]
def download_pbf(pbf_file: Path) -> None: def download_pbf(pbf_file: Path) -> None:
pbf_file.parent.mkdir(parents=True, exist_ok=True) pbf_file.parent.mkdir(parents=True, exist_ok=True)
tmp = pbf_file.with_suffix(".pbf.tmp") tmp = pbf_file.with_suffix(".pbf.tmp")
@ -91,7 +86,12 @@ class POIHandler(osmium.SimpleHandler):
self._batch.clear() self._batch.clear()
def _add_poi( def _add_poi(
self, osm_id: str, tags: osmium.osm.TagList, category: str, lat: float, lng: float self,
osm_id: str,
tags: osmium.osm.TagList,
category: str,
lat: float,
lng: float,
) -> None: ) -> None:
self._batch.append( self._batch.append(
{ {
@ -123,8 +123,12 @@ class POIHandler(osmium.SimpleHandler):
def main() -> None: def main() -> None:
parser = argparse.ArgumentParser(description="Download and extract POIs from OpenStreetMap") parser = argparse.ArgumentParser(
parser.add_argument("--output", type=Path, required=True, help="Output parquet file path") description="Download and extract POIs from OpenStreetMap"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
args = parser.parse_args() args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir: with tempfile.TemporaryDirectory() as cache_dir:

View file

@ -73,8 +73,12 @@ def convert_to_parquet(csv_path: Path, parquet_path: Path) -> None:
def main() -> None: def main() -> None:
parser = argparse.ArgumentParser(description="Download and convert Land Registry price-paid data") parser = argparse.ArgumentParser(
parser.add_argument("--output", type=Path, required=True, help="Output parquet file path") description="Download and convert Land Registry price-paid data"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
args = parser.parse_args() args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir: with tempfile.TemporaryDirectory() as cache_dir:

View file

@ -1,11 +1,7 @@
"""Configuration constants for journey times processing.""" """Configuration constants for journey times processing."""
from pathlib import Path
from .models import Destination from .models import Destination
DATA_DIR = Path("./data_sources")
OUTPUT_DIR = DATA_DIR / "processed"
MAX_DELAY = 10 MAX_DELAY = 10
REQUESTS_PER_MIN = 500 REQUESTS_PER_MIN = 500

View file

@ -99,9 +99,7 @@ async def fetch_journey_for_mode(
journeys = data.get("journeys", []) journeys = data.get("journeys", [])
if journeys: if journeys:
durations = [ durations = [
j["duration"] j["duration"] for j in journeys if j.get("duration") is not None
for j in journeys
if j.get("duration") is not None
] ]
if durations: if durations:
return min(durations) return min(durations)

View file

@ -9,79 +9,108 @@ pl.Config.set_tbl_cols(-1)
def main(): def main():
parser = argparse.ArgumentParser(description="Fuzzy join EPC and Price Paid data") parser = argparse.ArgumentParser(description="Fuzzy join EPC and Price Paid data")
parser.add_argument("--epc", type=Path, required=True, help="EPC certificates CSV file") parser.add_argument(
parser.add_argument("--price-paid", type=Path, required=True, help="Price paid parquet file") "--epc", type=Path, required=True, help="EPC certificates CSV file"
parser.add_argument("--output", type=Path, required=True, help="Output parquet file path") )
parser.add_argument(
"--price-paid", type=Path, required=True, help="Price paid parquet file"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
args = parser.parse_args() args = parser.parse_args()
epc = pl.scan_csv(args.epc).select( epc = (
pl.col('ADDRESS').alias('epc_address'), pl.scan_csv(args.epc)
'POSTCODE', .select(
'CURRENT_ENERGY_RATING', pl.col("ADDRESS").alias("epc_address"),
'POTENTIAL_ENERGY_RATING', "POSTCODE",
pl.col('PROPERTY_TYPE').alias('epc_property_type'), "CURRENT_ENERGY_RATING",
'BUILT_FORM', "POTENTIAL_ENERGY_RATING",
'INSPECTION_DATE', pl.col("PROPERTY_TYPE").alias("epc_property_type"),
'TOTAL_FLOOR_AREA', "BUILT_FORM",
'NUMBER_HABITABLE_ROOMS', "INSPECTION_DATE",
'FLOOR_HEIGHT', "TOTAL_FLOOR_AREA",
'CONSTRUCTION_AGE_BAND' "NUMBER_HABITABLE_ROOMS",
).filter(pl.col('epc_address').is_not_null()).sort('INSPECTION_DATE', descending=True).group_by('epc_address', 'POSTCODE').first() "FLOOR_HEIGHT",
"CONSTRUCTION_AGE_BAND",
)
.filter(pl.col("epc_address").is_not_null())
.sort("INSPECTION_DATE", descending=True)
.group_by("epc_address", "POSTCODE")
.first()
)
print("EPC dataset") print("EPC dataset")
print(epc.head().collect()) print(epc.head().collect())
# https://www.gov.uk/guidance/about-the-price-paid-data # https://www.gov.uk/guidance/about-the-price-paid-data
property_type_map = {"D": "Detached", "S": "Semi-Detached", "T": "Terraced", "F": "Flats/Maisonettes", "O": "Other"} property_type_map = {
"D": "Detached",
"S": "Semi-Detached",
"T": "Terraced",
"F": "Flats/Maisonettes",
"O": "Other",
}
duration_map = {"F": "Freehold", "L": "Leasehold"} duration_map = {"F": "Freehold", "L": "Leasehold"}
price_paid = (pl.scan_parquet(args.price_paid).select( price_paid = (
"price", pl.scan_parquet(args.price_paid)
"date_of_transfer", .select(
pl.col('property_type').alias("pp_property_type").replace(property_type_map), "price",
"postcode", "date_of_transfer",
'paon', pl.col("property_type")
'saon', .alias("pp_property_type")
'street', .replace(property_type_map),
'locality', "postcode",
'town_city', "paon",
pl.col('duration').replace(duration_map) "saon",
) "street",
.filter(pl.col('pp_property_type') != 'Other').with_columns( "locality",
pl.concat_str( "town_city",
[pl.col('saon'), pl.col('paon'), pl.col('street')], pl.col("duration").replace(duration_map),
separator=' ',
ignore_nulls=True,
).alias('pp_address'),
) )
.sort('date_of_transfer') .filter(pl.col("pp_property_type") != "Other")
.group_by('pp_address', 'postcode', maintain_order=True) .with_columns(
pl.concat_str(
[pl.col("saon"), pl.col("paon"), pl.col("street")],
separator=" ",
ignore_nulls=True,
).alias("pp_address"),
)
.sort("date_of_transfer")
.group_by("pp_address", "postcode", maintain_order=True)
.agg( .agg(
pl.struct( pl.struct(
pl.col('date_of_transfer').dt.year().alias('year'), pl.col("date_of_transfer").dt.year().alias("year"),
'price', "price",
).alias('historical_prices'), ).alias("historical_prices"),
pl.col('pp_property_type').last(), pl.col("pp_property_type").last(),
pl.col('duration').last(), pl.col("duration").last(),
pl.col('price').last().alias('latest_price'), pl.col("price").last().alias("latest_price"),
pl.col('date_of_transfer').last(), pl.col("date_of_transfer").last(),
) )
).filter(pl.col('pp_address').is_not_null()) ).filter(pl.col("pp_address").is_not_null())
print("Price paid dataset") print("Price paid dataset")
print(price_paid.head().collect()) print(price_paid.head().collect())
joined = fuzzy_join_on_postcode( joined = (
left=price_paid, fuzzy_join_on_postcode(
right=epc, left=price_paid,
left_address_col='pp_address', right=epc,
right_address_col='epc_address', left_address_col="pp_address",
left_postcode_col='postcode', right_address_col="epc_address",
right_postcode_col='POSTCODE', left_postcode_col="postcode",
).drop('POSTCODE').collect() right_postcode_col="POSTCODE",
)
.drop("POSTCODE")
.collect()
)
matched = joined.filter(pl.col('epc_address').is_not_null() & pl.col('pp_address').is_not_null()) matched = joined.filter(
pl.col("epc_address").is_not_null() & pl.col("pp_address").is_not_null()
)
total = joined.height total = joined.height
print(f"Unique properties: {total}") print(f"Unique properties: {total}")
print(f"Matched: {matched.height} ({100 * matched.height / total:.1f}%)") print(f"Matched: {matched.height} ({100 * matched.height / total:.1f}%)")

View file

@ -24,7 +24,9 @@ def _build_wide(
"lsoa21", "lsoa21",
) )
wide = wide.join(arcgis, on="postcode", how="inner") wide = wide.join(arcgis, on="postcode", how="inner")
print(f" {wide.shape[0]:,} rows after GPS join, {wide.estimated_size('mb'):.1f} MB") print(
f" {wide.shape[0]:,} rows after GPS join, {wide.estimated_size('mb'):.1f} MB"
)
# Journey times (optional) # Journey times (optional)
if journey_times_path and journey_times_path.exists(): if journey_times_path and journey_times_path.exists():
@ -42,9 +44,7 @@ def _build_wide(
if iod_path and iod_path.exists(): if iod_path and iod_path.exists():
print("Joining IoD scores...") print("Joining IoD scores...")
iod = pl.read_parquet(iod_path) iod = pl.read_parquet(iod_path)
wide = wide.join( wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left"
)
print(f" {wide.estimated_size('mb'):.1f} MB after IoD") print(f" {wide.estimated_size('mb'):.1f} MB after IoD")
# POI proximity counts (pre-computed per postcode) # POI proximity counts (pre-computed per postcode)
@ -66,44 +66,68 @@ def _build_wide(
) )
# Derived columns # Derived columns
wide = wide.with_columns( wide = (
(pl.col("latest_price") / pl.col("total_floor_area")).alias("Price per sqm"), wide.with_columns(
).drop( (pl.col("latest_price") / pl.col("total_floor_area")).alias(
'date_of_transfer', "Price per sqm"
'inspection_date', ),
'floor_height', )
'lsoa21', .drop(
'LSOA code (2021)', "date_of_transfer",
'Local Authority District code (2024)', "inspection_date",
'Local Authority District name (2024)', "floor_height",
'imd_score', "lsoa21",
'housing_barriers_score', "LSOA code (2021)",
'idaci_score', "Local Authority District code (2024)",
'idaopi_score', "Local Authority District name (2024)",
'children_young_people_score', "imd_score",
'adult_skills_score', "housing_barriers_score",
'geographical_barriers_score', "idaci_score",
'wider_barriers_score', "idaopi_score",
).rename({ "children_young_people_score",
'construction_age_band': "Approximate construction age", "adult_skills_score",
"income_score": "Income Score (rate)", "geographical_barriers_score",
"employment_score": "Employment Score (rate)", "wider_barriers_score",
"education_score": "Education, Skills and Training Score", )
"health_score": "Health Deprivation and Disability Score", .rename(
"crime_score": "Crime Score", {
}) "construction_age_band": "Approximate construction age",
"income_score": "Income Score (rate)",
"employment_score": "Employment Score (rate)",
"education_score": "Education, Skills and Training Score",
"health_score": "Health Deprivation and Disability Score",
"crime_score": "Crime Score",
}
)
)
return wide return wide
def main(): def main():
parser = argparse.ArgumentParser(description="Build wide property dataframe with all joins") parser = argparse.ArgumentParser(
parser.add_argument("--epc-pp", type=Path, required=True, help="EPC-Price Paid joined parquet file") description="Build wide property dataframe with all joins"
parser.add_argument("--arcgis", type=Path, required=True, help="ArcGIS postcode data parquet file") )
parser.add_argument("--iod", type=Path, help="Index of Deprivation parquet file (optional)") parser.add_argument(
parser.add_argument("--poi-proximity", type=Path, help="POI proximity counts parquet file (optional)") "--epc-pp", type=Path, required=True, help="EPC-Price Paid joined parquet file"
parser.add_argument("--journey-times", type=Path, help="Journey times parquet file (optional)") )
parser.add_argument("--output", type=Path, required=True, help="Output parquet file path") parser.add_argument(
"--arcgis", type=Path, required=True, help="ArcGIS postcode data parquet file"
)
parser.add_argument(
"--iod", type=Path, help="Index of Deprivation parquet file (optional)"
)
parser.add_argument(
"--poi-proximity",
type=Path,
help="POI proximity counts parquet file (optional)",
)
parser.add_argument(
"--journey-times", type=Path, help="Journey times parquet file (optional)"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
args = parser.parse_args() args = parser.parse_args()
wide = _build_wide( wide = _build_wide(
@ -119,7 +143,7 @@ def main():
wide.write_parquet(args.output) wide.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024) size_mb = args.output.stat().st_size / (1024 * 1024)
print(f"Wrote {args.output} ({size_mb:.1f} MB)") print(f"Wrote {args.output} ({size_mb:.1f} MB)")

View file

@ -584,9 +584,7 @@ def transform(input_path: Path) -> pl.LazyFrame:
if cat not in DROP_CATEGORIES and cat not in CATEGORY_MAP: if cat not in DROP_CATEGORIES and cat not in CATEGORY_MAP:
unmapped.append(cat) unmapped.append(cat)
if unmapped: if unmapped:
raise ValueError( raise ValueError(f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}")
f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}"
)
# Verify every CATEGORY_MAP key actually exists in the data (catch typos) # Verify every CATEGORY_MAP key actually exists in the data (catch typos)
mapped_but_absent = [] mapped_but_absent = []
@ -623,9 +621,15 @@ def transform(input_path: Path) -> pl.LazyFrame:
def main(): def main():
parser = argparse.ArgumentParser(description="Transform raw POIs to filtered version with friendly names") parser = argparse.ArgumentParser(
parser.add_argument("--input", type=Path, required=True, help="Raw POIs parquet file") description="Transform raw POIs to filtered version with friendly names"
parser.add_argument("--output", type=Path, required=True, help="Output filtered POIs parquet file") )
parser.add_argument(
"--input", type=Path, required=True, help="Raw POIs parquet file"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
)
args = parser.parse_args() args = parser.parse_args()
df = transform(args.input).collect() df = transform(args.input).collect()

View file

@ -2,4 +2,10 @@ from .fuzzy_join import fuzzy_join_on_postcode
from .haversine import haversine_km, haversine_km_expr from .haversine import haversine_km, haversine_km_expr
from .poi_counts import POI_GROUPS, count_pois_within_radius from .poi_counts import POI_GROUPS, count_pois_within_radius
__all__ = ["fuzzy_join_on_postcode", "haversine_km", "haversine_km_expr", "POI_GROUPS", "count_pois_within_radius"] __all__ = [
"fuzzy_join_on_postcode",
"haversine_km",
"haversine_km_expr",
"POI_GROUPS",
"count_pois_within_radius",
]

View file

@ -9,14 +9,14 @@ import polars as pl
from thefuzz import fuzz from thefuzz import fuzz
from tqdm import tqdm from tqdm import tqdm
_NUMBER_RE = re.compile(r'\d+') _NUMBER_RE = re.compile(r"\d+")
def _normalize(s: pl.Expr) -> pl.Expr: def _normalize(s: pl.Expr) -> pl.Expr:
return ( return (
s.str.to_uppercase() s.str.to_uppercase()
.str.replace_all(r'[,.\-]', ' ') .str.replace_all(r"[,.\-]", " ")
.str.replace_all(r'\s+', ' ') .str.replace_all(r"\s+", " ")
.str.strip_chars() .str.strip_chars()
) )
@ -40,22 +40,25 @@ def fuzzy_join_on_postcode(
have null right columns. have null right columns.
""" """
tmpdir = tempfile.mkdtemp(prefix='fuzzy_join_') tmpdir = tempfile.mkdtemp(prefix="fuzzy_join_")
left_path = Path(tmpdir) / 'left.parquet' left_path = Path(tmpdir) / "left.parquet"
right_path = Path(tmpdir) / 'right.parquet' right_path = Path(tmpdir) / "right.parquet"
try: try:
# Materialise each side exactly once, with a row index, to temp parquet. # Materialise each side exactly once, with a row index, to temp parquet.
left.with_row_index('_left_idx').sink_parquet(left_path) left.with_row_index("_left_idx").sink_parquet(left_path)
right.with_row_index('_right_idx').sink_parquet(right_path) right.with_row_index("_right_idx").sink_parquet(right_path)
# Collect only the narrow columns needed for matching (projection pushdown). # Collect only the narrow columns needed for matching (projection pushdown).
left_match = ( left_match = (
pl.scan_parquet(left_path) pl.scan_parquet(left_path)
.select( .select(
'_left_idx', "_left_idx",
_normalize(pl.col(left_address_col)).alias('_left_address'), _normalize(pl.col(left_address_col)).alias("_left_address"),
pl.col(left_postcode_col).str.strip_chars().str.to_uppercase().alias('_left_postcode'), pl.col(left_postcode_col)
.str.strip_chars()
.str.to_uppercase()
.alias("_left_postcode"),
) )
.collect() .collect()
) )
@ -63,18 +66,23 @@ def fuzzy_join_on_postcode(
right_match = ( right_match = (
pl.scan_parquet(right_path) pl.scan_parquet(right_path)
.select( .select(
'_right_idx', "_right_idx",
_normalize(pl.col(right_address_col)).alias('_right_address'), _normalize(pl.col(right_address_col)).alias("_right_address"),
pl.col(right_postcode_col).str.strip_chars().str.to_uppercase().alias('_right_postcode'), pl.col(right_postcode_col)
.str.strip_chars()
.str.to_uppercase()
.alias("_right_postcode"),
) )
.unique(subset=['_right_address', '_right_postcode'], keep='first') .unique(subset=["_right_address", "_right_postcode"], keep="first")
.collect() .collect()
) )
# Group right side by postcode for fast lookup # Group right side by postcode for fast lookup
right_by_postcode: dict[str, list[tuple[int, str]]] = {} right_by_postcode: dict[str, list[tuple[int, str]]] = {}
for idx, postcode, address in zip( for idx, postcode, address in zip(
right_match['_right_idx'], right_match['_right_postcode'], right_match['_right_address'] right_match["_right_idx"],
right_match["_right_postcode"],
right_match["_right_address"],
): ):
if postcode is not None: if postcode is not None:
right_by_postcode.setdefault(postcode, []).append((idx, address)) right_by_postcode.setdefault(postcode, []).append((idx, address))
@ -82,7 +90,9 @@ def fuzzy_join_on_postcode(
# Group left side by postcode # Group left side by postcode
left_by_postcode: dict[str, list[tuple[int, str]]] = {} left_by_postcode: dict[str, list[tuple[int, str]]] = {}
for idx, postcode, address in zip( for idx, postcode, address in zip(
left_match['_left_idx'], left_match['_left_postcode'], left_match['_left_address'] left_match["_left_idx"],
left_match["_left_postcode"],
left_match["_left_address"],
): ):
if address is not None and postcode is not None: if address is not None and postcode is not None:
left_by_postcode.setdefault(postcode, []).append((idx, address)) left_by_postcode.setdefault(postcode, []).append((idx, address))
@ -103,7 +113,7 @@ def fuzzy_join_on_postcode(
for pairs in tqdm( for pairs in tqdm(
executor.map(_score_bucket, tasks, chunksize=64), executor.map(_score_bucket, tasks, chunksize=64),
total=len(tasks), total=len(tasks),
desc='Fuzzy matching', desc="Fuzzy matching",
): ):
all_pairs.extend(pairs) all_pairs.extend(pairs)
@ -127,24 +137,27 @@ def fuzzy_join_on_postcode(
# Build a small mapping LazyFrame and join back to the cached parquets. # Build a small mapping LazyFrame and join back to the cached parquets.
if matches: if matches:
mapping = pl.LazyFrame({ mapping = pl.LazyFrame(
'_left_idx': pl.Series([m[0] for m in matches], dtype=pl.UInt32), {
'_right_idx': pl.Series([m[1] for m in matches], dtype=pl.UInt32), "_left_idx": pl.Series([m[0] for m in matches], dtype=pl.UInt32),
}) "_right_idx": pl.Series([m[1] for m in matches], dtype=pl.UInt32),
}
)
else: else:
mapping = pl.LazyFrame({ mapping = pl.LazyFrame(
'_left_idx': pl.Series([], dtype=pl.UInt32), {
'_right_idx': pl.Series([], dtype=pl.UInt32), "_left_idx": pl.Series([], dtype=pl.UInt32),
}) "_right_idx": pl.Series([], dtype=pl.UInt32),
}
)
left_cached = pl.scan_parquet(left_path) left_cached = pl.scan_parquet(left_path)
right_cached = pl.scan_parquet(right_path) right_cached = pl.scan_parquet(right_path)
return ( return (
left_cached left_cached.join(mapping, on="_left_idx", how="left")
.join(mapping, on='_left_idx', how='left') .join(right_cached, on="_right_idx", how="left")
.join(right_cached, on='_right_idx', how='left') .drop("_left_idx", "_right_idx")
.drop('_left_idx', '_right_idx')
) )
except BaseException: except BaseException:
shutil.rmtree(tmpdir, ignore_errors=True) shutil.rmtree(tmpdir, ignore_errors=True)
@ -158,7 +171,9 @@ def _numbers_compatible(a: str, b: str) -> bool:
""" """
nums_a = set(_NUMBER_RE.findall(a)) nums_a = set(_NUMBER_RE.findall(a))
nums_b = set(_NUMBER_RE.findall(b)) nums_b = set(_NUMBER_RE.findall(b))
smaller, larger = (nums_a, nums_b) if len(nums_a) <= len(nums_b) else (nums_b, nums_a) smaller, larger = (
(nums_a, nums_b) if len(nums_a) <= len(nums_b) else (nums_b, nums_a)
)
if not smaller and larger: if not smaller and larger:
return False return False
return smaller.issubset(larger) return smaller.issubset(larger)

View file

@ -6,7 +6,9 @@ import polars as pl
_EARTH_RADIUS_KM = 6371.0 _EARTH_RADIUS_KM = 6371.0
def haversine_km(lat1: np.ndarray, lon1: np.ndarray, lat2: float, lon2: float) -> np.ndarray: def haversine_km(
lat1: np.ndarray, lon1: np.ndarray, lat2: float, lon2: float
) -> np.ndarray:
"""Compute haversine distance in km between arrays (lat1, lon1) and a single point (lat2, lon2).""" """Compute haversine distance in km between arrays (lat1, lon1) and a single point (lat2, lon2)."""
lat1_rad = np.radians(lat1) lat1_rad = np.radians(lat1)
lon1_rad = np.radians(lon1) lon1_rad = np.radians(lon1)
@ -14,7 +16,10 @@ def haversine_km(lat1: np.ndarray, lon1: np.ndarray, lat2: float, lon2: float) -
lon2_rad = np.radians(lon2) lon2_rad = np.radians(lon2)
dlat = lat2_rad - lat1_rad dlat = lat2_rad - lat1_rad
dlon = lon2_rad - lon1_rad dlon = lon2_rad - lon1_rad
a = np.sin(dlat / 2) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2 a = (
np.sin(dlat / 2) ** 2
+ np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2
)
c = 2 * np.arcsin(np.sqrt(a)) c = 2 * np.arcsin(np.sqrt(a))
return _EARTH_RADIUS_KM * c return _EARTH_RADIUS_KM * c
@ -32,5 +37,7 @@ def haversine_km_expr(
dlat = pl.lit(dest_lat_rad) - lat_rad dlat = pl.lit(dest_lat_rad) - lat_rad
dlon = pl.lit(dest_lon_rad) - lon_rad dlon = pl.lit(dest_lon_rad) - lon_rad
a = (dlat / 2).sin() ** 2 + pl.lit(dest_lat_rad).cos() * lat_rad.cos() * (dlon / 2).sin() ** 2 a = (dlat / 2).sin() ** 2 + pl.lit(dest_lat_rad).cos() * lat_rad.cos() * (
dlon / 2
).sin() ** 2
return 2 * _EARTH_RADIUS_KM * a.sqrt().arcsin() return 2 * _EARTH_RADIUS_KM * a.sqrt().arcsin()

View file

@ -70,7 +70,9 @@ def _count_pois_per_postcode(
pc_codes = postcodes_df["postcode"].to_list() pc_codes = postcodes_df["postcode"].to_list()
# Initialize result arrays # Initialize result arrays
result_counts = {group: np.zeros(n_postcodes, dtype=np.int32) for group in POI_GROUPS} result_counts = {
group: np.zeros(n_postcodes, dtype=np.int32) for group in POI_GROUPS
}
# Process in batches with progress # Process in batches with progress
batch_size = 50000 batch_size = 50000
@ -83,7 +85,9 @@ def _count_pois_per_postcode(
end_idx = min(start_idx + batch_size, n_postcodes) end_idx = min(start_idx + batch_size, n_postcodes)
if batch_idx % 5 == 0: if batch_idx % 5 == 0:
print(f" Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}") print(
f" Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}"
)
# Process batch # Process batch
for i in range(start_idx, end_idx): for i in range(start_idx, end_idx):
@ -109,12 +113,7 @@ def _count_pois_per_postcode(
nearby = np.concatenate(nearby_indices) nearby = np.concatenate(nearby_indices)
# Vectorized distance calculation for all nearby POIs # Vectorized distance calculation for all nearby POIs
distances = haversine_km( distances = haversine_km(poi_lats[nearby], poi_lngs[nearby], pc_lat, pc_lon)
poi_lats[nearby],
poi_lngs[nearby],
pc_lat,
pc_lon
)
# Filter by radius # Filter by radius
within_mask = distances <= radius_km within_mask = distances <= radius_km
@ -147,13 +146,13 @@ def count_pois_within_radius(
""" """
# Get unique postcodes with coordinates # Get unique postcodes with coordinates
print("Deduplicating postcodes...") print("Deduplicating postcodes...")
unique_postcodes = ( unique_postcodes = properties.select(["postcode", "lat", "lon"]).unique(
properties subset=["postcode"]
.select(["postcode", "lat", "lon"])
.unique(subset=["postcode"])
) )
print(f" {len(properties):,} properties → {len(unique_postcodes):,} unique postcodes") print(
f" {len(properties):,} properties → {len(unique_postcodes):,} unique postcodes"
)
# Count POIs per postcode # Count POIs per postcode
postcode_counts = _count_pois_per_postcode(unique_postcodes, pois, radius_km) postcode_counts = _count_pois_per_postcode(unique_postcodes, pois, radius_km)
@ -174,11 +173,7 @@ def count_pois_within_radius(
result_lazy = ( result_lazy = (
properties.lazy() properties.lazy()
.select("postcode") .select("postcode")
.join( .join(pl.scan_parquet(tmp_path), on="postcode", how="left")
pl.scan_parquet(tmp_path),
on="postcode",
how="left"
)
.select(count_cols) .select(count_cols)
.fill_null(0) .fill_null(0)
) )

View file

@ -41,6 +41,6 @@ result = fuzzy_join_on_postcode(
snapshot = result.select("pp_address", "ADDRESS").sort("pp_address") snapshot = result.select("pp_address", "ADDRESS").sort("pp_address")
print('Testing the matching between EPC and PP addresses') print("Testing the matching between EPC and PP addresses")
with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=80): with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=80):
print(snapshot) print(snapshot)

View file

@ -73,29 +73,39 @@ class TestHaversineKmExpr:
def test_same_point(self): def test_same_point(self):
"""Distance from a point to itself should be zero.""" """Distance from a point to itself should be zero."""
df = pl.DataFrame({"lat": [51.5074], "lon": [-0.1278]}) df = pl.DataFrame({"lat": [51.5074], "lon": [-0.1278]})
result = df.select(haversine_km_expr("lat", "lon", 51.5074, -0.1278).alias("dist")) result = df.select(
haversine_km_expr("lat", "lon", 51.5074, -0.1278).alias("dist")
)
assert result["dist"][0] == pytest.approx(0.0, abs=1e-10) assert result["dist"][0] == pytest.approx(0.0, abs=1e-10)
def test_known_distance_london_to_paris(self): def test_known_distance_london_to_paris(self):
"""Test distance from London to Paris (~344 km).""" """Test distance from London to Paris (~344 km)."""
df = pl.DataFrame({"lat": [51.5074], "lon": [-0.1278]}) df = pl.DataFrame({"lat": [51.5074], "lon": [-0.1278]})
result = df.select(haversine_km_expr("lat", "lon", 48.8566, 2.3522).alias("dist")) result = df.select(
haversine_km_expr("lat", "lon", 48.8566, 2.3522).alias("dist")
)
assert result["dist"][0] == pytest.approx(344, rel=0.01) assert result["dist"][0] == pytest.approx(344, rel=0.01)
def test_known_distance_new_york_to_london(self): def test_known_distance_new_york_to_london(self):
"""Test distance from New York to London (~5570 km).""" """Test distance from New York to London (~5570 km)."""
df = pl.DataFrame({"lat": [40.7128], "lon": [-74.0060]}) df = pl.DataFrame({"lat": [40.7128], "lon": [-74.0060]})
result = df.select(haversine_km_expr("lat", "lon", 51.5074, -0.1278).alias("dist")) result = df.select(
haversine_km_expr("lat", "lon", 51.5074, -0.1278).alias("dist")
)
assert result["dist"][0] == pytest.approx(5570, rel=0.01) assert result["dist"][0] == pytest.approx(5570, rel=0.01)
def test_multiple_points(self): def test_multiple_points(self):
"""Test calculating distances from multiple points to a single destination.""" """Test calculating distances from multiple points to a single destination."""
df = pl.DataFrame({ df = pl.DataFrame(
"lat": [51.5074, 48.8566, 40.7128], # London, Paris, NYC {
"lon": [-0.1278, 2.3522, -74.0060], "lat": [51.5074, 48.8566, 40.7128], # London, Paris, NYC
}) "lon": [-0.1278, 2.3522, -74.0060],
}
)
# Distance to Edinburgh # Distance to Edinburgh
result = df.select(haversine_km_expr("lat", "lon", 55.9533, -3.1883).alias("dist")) result = df.select(
haversine_km_expr("lat", "lon", 55.9533, -3.1883).alias("dist")
)
dists = result["dist"].to_numpy() dists = result["dist"].to_numpy()
# All distances should be positive # All distances should be positive
@ -128,7 +138,9 @@ class TestHaversineConsistency:
# Polars version # Polars version
df = pl.DataFrame({"lat": lats, "lon": lons}) df = pl.DataFrame({"lat": lats, "lon": lons})
polars_result = df.select(haversine_km_expr("lat", "lon", dest_lat, dest_lon).alias("dist")) polars_result = df.select(
haversine_km_expr("lat", "lon", dest_lat, dest_lon).alias("dist")
)
polars_dists = polars_result["dist"].to_numpy() polars_dists = polars_result["dist"].to_numpy()
# Should be identical (or at least very close due to floating point) # Should be identical (or at least very close due to floating point)

View file

@ -7,28 +7,32 @@ from pipeline.utils.poi_counts import POI_GROUPS, count_pois_within_radius
@pytest.fixture @pytest.fixture
def pois(): def pois():
"""POIs clustered around two locations: central London and 10km away.""" """POIs clustered around two locations: central London and 10km away."""
return pl.DataFrame({ return pl.DataFrame(
"lat": [51.5074, 51.5075, 51.5080, 51.5076, 51.5073, 51.60], {
"lng": [-0.1278, -0.1280, -0.1275, -0.1279, -0.1277, -0.20], "lat": [51.5074, 51.5075, 51.5080, 51.5076, 51.5073, 51.60],
"category": [ "lng": [-0.1278, -0.1280, -0.1275, -0.1279, -0.1277, -0.20],
"Restaurant", "category": [
"Fast Food", "Restaurant",
"Supermarket", "Fast Food",
"Park", "Supermarket",
"Station", "Park",
"Restaurant", # too far from any property "Station",
], "Restaurant", # too far from any property
}) ],
}
)
@pytest.fixture @pytest.fixture
def properties(): def properties():
"""Two properties at the same postcode near central London, one at a distant postcode.""" """Two properties at the same postcode near central London, one at a distant postcode."""
return pl.DataFrame({ return pl.DataFrame(
"postcode": ["EC1A 1BB", "EC1A 1BB", "ZZ99 9ZZ"], {
"lat": [51.5074, 51.5074, 55.0], "postcode": ["EC1A 1BB", "EC1A 1BB", "ZZ99 9ZZ"],
"lon": [-0.1278, -0.1278, -3.0], "lat": [51.5074, 51.5074, 55.0],
}) "lon": [-0.1278, -0.1278, -3.0],
}
)
def test_counts_pois_within_radius(properties, pois): def test_counts_pois_within_radius(properties, pois):
@ -41,9 +45,9 @@ def test_counts_pois_within_radius(properties, pois):
assert len(series) == 3, f"{col} has {len(series)} rows, expected 3" assert len(series) == 3, f"{col} has {len(series)} rows, expected 3"
# First two rows share a postcode near the central London cluster # First two rows share a postcode near the central London cluster
assert result["restaurants_2km"][0] == 2 # Restaurant + Fast Food assert result["restaurants_2km"][0] == 2 # Restaurant + Fast Food
assert result["groceries_2km"][0] == 1 # Supermarket assert result["groceries_2km"][0] == 1 # Supermarket
assert result["parks_2km"][0] == 1 # Park assert result["parks_2km"][0] == 1 # Park
assert result["public_transport_2km"][0] == 1 # Station assert result["public_transport_2km"][0] == 1 # Station
# Second row is the same postcode, so same counts # Second row is the same postcode, so same counts
@ -55,11 +59,13 @@ def test_counts_pois_within_radius(properties, pois):
def test_no_pois_returns_zeros(properties): def test_no_pois_returns_zeros(properties):
empty_pois = pl.DataFrame({ empty_pois = pl.DataFrame(
"lat": pl.Series([], dtype=pl.Float64), {
"lng": pl.Series([], dtype=pl.Float64), "lat": pl.Series([], dtype=pl.Float64),
"category": pl.Series([], dtype=pl.String), "lng": pl.Series([], dtype=pl.Float64),
}) "category": pl.Series([], dtype=pl.String),
}
)
result = count_pois_within_radius(properties, empty_pois, radius_km=2.0) result = count_pois_within_radius(properties, empty_pois, radius_km=2.0)
for group in POI_GROUPS: for group in POI_GROUPS:
@ -70,11 +76,13 @@ def test_no_pois_returns_zeros(properties):
def test_custom_radius(pois): def test_custom_radius(pois):
"""A tiny radius should exclude POIs that are even slightly away.""" """A tiny radius should exclude POIs that are even slightly away."""
properties = pl.DataFrame({ properties = pl.DataFrame(
"postcode": ["EC1A 1BB"], {
"lat": [51.5074], "postcode": ["EC1A 1BB"],
"lon": [-0.1278], "lat": [51.5074],
}) "lon": [-0.1278],
}
)
# 0.01 km = 10m — only the POI at the exact same location should match # 0.01 km = 10m — only the POI at the exact same location should match
result = count_pois_within_radius(properties, pois, radius_km=0.01) result = count_pois_within_radius(properties, pois, radius_km=0.01)