Can't even keep track anymore

This commit is contained in:
Andras Schmelczer 2026-02-13 09:16:28 +00:00
parent dccc1e439d
commit 3a3f899ea2
50 changed files with 1144 additions and 560 deletions

View file

@ -99,26 +99,26 @@ def main():
"--output", type=Path, required=True, help="Output parquet file path"
)
parser.add_argument(
"--pbf", type=Path, default=None, help="Path to existing PBF file"
"--pbf", type=Path, required=True, help="Path to existing PBF file"
)
args = parser.parse_args()
if args.pbf and args.pbf.exists():
if args.pbf.exists():
pbf_file = args.pbf
print(f"Using existing PBF: {pbf_file}")
else:
pbf_file = Path("data/great-britain-latest.osm.pbf")
if not pbf_file.exists():
download_pbf(pbf_file)
else:
print(f"Using cached PBF: {pbf_file}")
download_pbf(args.pbf)
print("Extracting greenspace/water areas from PBF (two-pass area assembly)...")
with tqdm(unit=" areas", unit_scale=True, desc="Processing", smoothing=0.05) as progress:
with tqdm(
unit=" areas", unit_scale=True, desc="Processing", smoothing=0.05
) as progress:
handler = GreenspaceHandler(progress)
handler.apply_file(str(pbf_file), locations=True)
print(f"Found {len(handler.geometries)} greenspace/water polygons >= {MIN_AREA_SQM} sqm")
print(
f"Found {len(handler.geometries)} greenspace/water polygons >= {MIN_AREA_SQM} sqm"
)
# Merge overlapping geometries per 10km grid cell for efficiency
if handler.geometries:

View file

@ -0,0 +1,99 @@
"""Extract place=* nodes from OSM PBF → data/places.parquet.
Extracts named place nodes (cities, towns, suburbs, etc.) for typeahead search.
Reuses the same great-britain-latest.osm.pbf as pois.py.
"""
import argparse
from pathlib import Path
import osmium
import polars as pl
from tqdm import tqdm
from .pois import UK_BBOX_EAST, UK_BBOX_NORTH, UK_BBOX_SOUTH, UK_BBOX_WEST, download_pbf
# Values of the OSM ``place`` tag that are kept by the extractor; nodes whose
# ``place`` value is not in this set are ignored.
PLACE_TYPES = {
"city",
"borough",
"town",
"suburb",
"neighbourhood",
"village",
"hamlet",
"locality",
"isolated_dwelling",
}
class PlaceHandler(osmium.SimpleHandler):
    """Collect named ``place=*`` nodes that fall inside the UK bounding box.

    Every kept node is appended to ``self.places`` as a dict with the keys
    ``name``, ``place_type``, ``lat`` and ``lon``.
    """

    def __init__(self, progress: tqdm) -> None:
        """Remember the progress bar and start with an empty result list."""
        super().__init__()
        self._progress = progress
        self.places: list[dict] = []

    def node(self, n: osmium.osm.Node) -> None:
        """Record ``n`` when it is a named place node of an accepted type.

        The progress bar advances once per node seen, whether or not the node
        is kept.
        """
        self._progress.update(1)

        # ``Location.valid`` is a method: referencing it without calling it is
        # always truthy, which would silently disable this guard.
        if not n.location.valid():
            return

        lat, lon = n.location.lat, n.location.lon
        inside_bbox = (
            UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
            and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
        )
        if not inside_bbox:
            return

        place_type = n.tags.get("place")
        if place_type not in PLACE_TYPES:
            return

        # Prefer the English name; fall back to the plain name when ``name:en``
        # is missing or blank.
        name = n.tags.get("name:en") or n.tags.get("name")
        if not name:
            return

        self.places.append(
            {"name": name, "place_type": place_type, "lat": lat, "lon": lon}
        )
        self._progress.set_postfix(places=f"{len(self.places):,}", refresh=False)
def main() -> None:
    """Extract place nodes from an OSM PBF file and write them to parquet.

    Command-line arguments:
        --output  Destination parquet path (required).
        --pbf     Optional path to an existing PBF file. When it is omitted or
                  does not exist, ``data/great-britain-latest.osm.pbf`` is used
                  and downloaded first if it is not already cached.

    Nothing is written when no place nodes are found.
    """
    parser = argparse.ArgumentParser(
        description="Extract place names from OSM PBF"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    parser.add_argument(
        "--pbf", type=Path, default=None, help="Path to existing PBF file (skips download)"
    )
    args = parser.parse_args()

    if args.pbf and args.pbf.exists():
        pbf_file = args.pbf
        print(f"Using existing PBF: {pbf_file}")
    else:
        if args.pbf:
            # Make the fallback visible: a mistyped path would otherwise be
            # ignored silently and could trigger a large download.
            print(f"Warning: --pbf {args.pbf} does not exist; using the default PBF instead")
        pbf_file = Path("data/great-britain-latest.osm.pbf")
        if not pbf_file.exists():
            download_pbf(pbf_file)
        else:
            print(f"Using cached PBF: {pbf_file}")

    print(f"Extracting place nodes: {sorted(PLACE_TYPES)}")
    with tqdm(
        unit=" elements",
        unit_scale=True,
        desc="Streaming",
        smoothing=0.05,
        mininterval=1.0,
    ) as progress:
        handler = PlaceHandler(progress)
        handler.apply_file(str(pbf_file), locations=True)

    print(f"Extracted {len(handler.places):,} place nodes")
    if handler.places:
        df = pl.DataFrame(handler.places)
        args.output.parent.mkdir(parents=True, exist_ok=True)
        df.write_parquet(args.output)
        print(f"Saved to {args.output}")
    else:
        print("No places found — skipping output")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,89 @@
import argparse
import tempfile
import polars as pl
from pathlib import Path
from pipeline.utils import download
# Download location of the ONS private rental market statistics workbook
# (October 2022 to September 2023 edition, per the URL path).
URL = "https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/housing/datasets/privaterentalmarketsummarystatisticsinengland/october2022toseptember2023/privaterentalmarketstatistics231220.xls"
# Maps workbook sheet id -> bedroom count stored in the output "bedrooms" column.
# Sheets 12-16 are LA-level breakdowns: Studio, 1 Bed, 2 Bed, 3 Bed, 4+ Bed
# (Sheet 11 is "Room" — shared house rooms, not self-contained, so skip it)
BEDROOM_SHEETS = {
12: 0, # Studio
13: 1, # One Bedroom
14: 2, # Two Bedrooms
15: 3, # Three Bedrooms
16: 4, # Four or more Bedrooms
}
# Area-code prefixes that identify local authority districts in England
LA_PREFIXES = ("E06", "E07", "E08", "E09")
def _read_sheet(xls_path: Path, sheet_id: int, bedrooms: int) -> pl.DataFrame:
    """Read one bedroom-category sheet and extract LA-level median rents.

    Args:
        xls_path: Path to the downloaded workbook.
        sheet_id: Id of the sheet to read.
        bedrooms: Bedroom count to attach to every returned row.

    Returns:
        A frame with the columns ``area_code``, ``median_monthly_rent``
        (Float32, null where the value cannot be parsed as a number) and
        ``bedrooms`` (UInt8), restricted to rows whose area code starts with
        one of ``LA_PREFIXES``.
    """
    df = pl.read_excel(xls_path, sheet_id=sheet_id)

    # Columns are unnamed; positional:
    # 0=LA Code, 1=Area Code, 2=Area Name, 3=Count, 4=Mean, 5=LQ, 6=Median, 7=UQ
    # First 4 rows are headers (title, notes, bedroom label, column headers)
    df = df.slice(4)
    area_code_col = df.columns[1]
    median_col = df.columns[6]

    # Build the prefix test from LA_PREFIXES and group it explicitly: ``&``
    # binds tighter than ``|``, so an unparenthesised chain would apply the
    # null check to the first prefix only.
    area_code = pl.col("area_code")
    is_local_authority = pl.any_horizontal(
        *(area_code.str.starts_with(prefix) for prefix in LA_PREFIXES)
    )

    return (
        df.select(
            pl.col(area_code_col).alias("area_code"),
            pl.col(median_col).alias("median_monthly_rent"),
        )
        .filter(area_code.is_not_null() & is_local_authority)
        .with_columns(
            # Suppressed values are ".." — cast will turn them to null
            pl.col("median_monthly_rent").cast(pl.Float32, strict=False),
            pl.lit(bedrooms).cast(pl.UInt8).alias("bedrooms"),
        )
    )
def convert_to_parquet(xls_path: Path, parquet_path: Path) -> None:
    """Combine every sheet listed in ``BEDROOM_SHEETS`` into one parquet file.

    Args:
        xls_path: Path to the downloaded workbook.
        parquet_path: Destination parquet file; missing parent directories are
            created before writing.
    """
    frames = []
    for sheet_id, bedrooms in BEDROOM_SHEETS.items():
        df = _read_sheet(xls_path, sheet_id, bedrooms)
        print(f"  Sheet {sheet_id} (bedrooms={bedrooms}): {df.height} rows")
        frames.append(df)

    combined = pl.concat(frames)
    print(f"Combined: {combined.shape}")
    print(f"Non-null medians: {combined['median_monthly_rent'].drop_nulls().len()}")
    print(combined.head(10))

    # Create the output directory up front so the write cannot fail on a
    # missing folder after the download and parsing work is already done.
    parquet_path.parent.mkdir(parents=True, exist_ok=True)
    combined.write_parquet(parquet_path, compression="zstd")
    print(f"Saved to {parquet_path}")
def main() -> None:
    """Download the ONS rental workbook and convert it to parquet.

    Command-line arguments:
        --output  Destination parquet path (required).

    The workbook is fetched into a temporary directory that is removed once
    the conversion has finished.
    """
    cli = argparse.ArgumentParser(
        description="Download and convert ONS private rental market statistics"
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    output_path = cli.parse_args().output

    with tempfile.TemporaryDirectory() as scratch_dir:
        workbook = Path(scratch_dir, "rental_prices.xls")
        download(URL, workbook, timeout=60)
        convert_to_parquet(workbook, output_path)


if __name__ == "__main__":
    main()