Can't even keep track anymore
This commit is contained in:
parent
dccc1e439d
commit
3a3f899ea2
50 changed files with 1144 additions and 560 deletions
|
|
@ -99,26 +99,26 @@ def main():
|
|||
"--output", type=Path, required=True, help="Output parquet file path"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pbf", type=Path, default=None, help="Path to existing PBF file"
|
||||
"--pbf", type=Path, required=True, help="Path to existing PBF file"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.pbf and args.pbf.exists():
|
||||
if args.pbf.exists():
|
||||
pbf_file = args.pbf
|
||||
print(f"Using existing PBF: {pbf_file}")
|
||||
else:
|
||||
pbf_file = Path("data/great-britain-latest.osm.pbf")
|
||||
if not pbf_file.exists():
|
||||
download_pbf(pbf_file)
|
||||
else:
|
||||
print(f"Using cached PBF: {pbf_file}")
|
||||
download_pbf(args.pbf)
|
||||
|
||||
print("Extracting greenspace/water areas from PBF (two-pass area assembly)...")
|
||||
with tqdm(unit=" areas", unit_scale=True, desc="Processing", smoothing=0.05) as progress:
|
||||
with tqdm(
|
||||
unit=" areas", unit_scale=True, desc="Processing", smoothing=0.05
|
||||
) as progress:
|
||||
handler = GreenspaceHandler(progress)
|
||||
handler.apply_file(str(pbf_file), locations=True)
|
||||
|
||||
print(f"Found {len(handler.geometries)} greenspace/water polygons >= {MIN_AREA_SQM} sqm")
|
||||
print(
|
||||
f"Found {len(handler.geometries)} greenspace/water polygons >= {MIN_AREA_SQM} sqm"
|
||||
)
|
||||
|
||||
# Merge overlapping geometries per 10km grid cell for efficiency
|
||||
if handler.geometries:
|
||||
|
|
|
|||
99
pipeline/download/places.py
Normal file
99
pipeline/download/places.py
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
"""Extract place=* nodes from OSM PBF → data/places.parquet.
|
||||
|
||||
Extracts named place nodes (cities, towns, suburbs, etc.) for typeahead search.
|
||||
Reuses the same great-britain-latest.osm.pbf as pois.py.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import osmium
|
||||
import polars as pl
|
||||
from tqdm import tqdm
|
||||
|
||||
from .pois import UK_BBOX_EAST, UK_BBOX_NORTH, UK_BBOX_SOUTH, UK_BBOX_WEST, download_pbf
|
||||
|
||||
PLACE_TYPES = {
|
||||
"city",
|
||||
"borough",
|
||||
"town",
|
||||
"suburb",
|
||||
"neighbourhood",
|
||||
"village",
|
||||
"hamlet",
|
||||
"locality",
|
||||
"isolated_dwelling",
|
||||
}
|
||||
|
||||
|
||||
class PlaceHandler(osmium.SimpleHandler):
    """Osmium handler that collects named ``place=*`` nodes inside the UK bbox.

    Every accepted node is appended to ``self.places`` as a dict with the
    keys ``name``, ``place_type``, ``lat`` and ``lon``.
    """

    def __init__(self, progress: tqdm) -> None:
        """Store the progress bar and start with an empty result list."""
        super().__init__()
        self._progress = progress
        self.places: list[dict] = []

    def node(self, n: osmium.osm.Node) -> None:
        """Tick the progress bar and record the node if it is a named place.

        A node is skipped when its location is invalid, when it lies outside
        the UK bounding box, when its ``place`` tag is not in ``PLACE_TYPES``,
        or when it has neither a ``name:en`` nor a ``name`` tag value.
        """
        self._progress.update(1)

        # ``Location.valid`` is a method in pyosmium; referencing it without
        # calling it yields a bound method that is always truthy, which made
        # this guard a no-op before.
        if not n.location.valid():
            return
        lat, lon = n.location.lat, n.location.lon
        if not (
            UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
            and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
        ):
            return

        place_type = n.tags.get("place")
        if place_type not in PLACE_TYPES:
            return

        # Prefer the English name, fall back to the default name tag.
        name = n.tags.get("name:en", n.tags.get("name", ""))
        if not name:
            return

        self.places.append(
            {"name": name, "place_type": place_type, "lat": lat, "lon": lon}
        )
        # refresh=False: only update the postfix text, let tqdm redraw on its
        # own schedule.
        self._progress.set_postfix(places=f"{len(self.places):,}", refresh=False)
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: stream a PBF file and save place nodes as parquet.

    The file passed via ``--pbf`` is used when it exists. Otherwise the
    default ``data/great-britain-latest.osm.pbf`` is used, and downloaded
    first if it is not on disk yet. When no place nodes are extracted, no
    output file is written.
    """
    cli = argparse.ArgumentParser(description="Extract place names from OSM PBF")
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    cli.add_argument(
        "--pbf",
        type=Path,
        default=None,
        help="Path to existing PBF file (skips download)",
    )
    opts = cli.parse_args()

    supplied = opts.pbf
    if supplied and supplied.exists():
        pbf_file = supplied
        print(f"Using existing PBF: {pbf_file}")
    else:
        pbf_file = Path("data/great-britain-latest.osm.pbf")
        if pbf_file.exists():
            print(f"Using cached PBF: {pbf_file}")
        else:
            download_pbf(pbf_file)

    print(f"Extracting place nodes: {sorted(PLACE_TYPES)}")
    bar_options = {
        "unit": " elements",
        "unit_scale": True,
        "desc": "Streaming",
        "smoothing": 0.05,
        "mininterval": 1.0,
    }
    with tqdm(**bar_options) as progress:
        handler = PlaceHandler(progress)
        handler.apply_file(str(pbf_file), locations=True)

    found = handler.places
    print(f"Extracted {len(found):,} place nodes")

    # Nothing to persist: report and leave without touching the output path.
    if not found:
        print("No places found — skipping output")
        return

    frame = pl.DataFrame(found)
    destination = opts.output
    destination.parent.mkdir(parents=True, exist_ok=True)
    frame.write_parquet(destination)
    print(f"Saved to {destination}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
89
pipeline/download/rental_prices.py
Normal file
89
pipeline/download/rental_prices.py
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
import argparse
|
||||
import tempfile
|
||||
|
||||
import polars as pl
|
||||
from pathlib import Path
|
||||
|
||||
from pipeline.utils import download
|
||||
|
||||
URL = "https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/housing/datasets/privaterentalmarketsummarystatisticsinengland/october2022toseptember2023/privaterentalmarketstatistics231220.xls"
|
||||
|
||||
# Sheets 12-16 are LA-level breakdowns: Studio, 1 Bed, 2 Bed, 3 Bed, 4+ Bed
|
||||
# (Sheet 11 is "Room" — shared house rooms, not self-contained, so skip it)
|
||||
BEDROOM_SHEETS = {
|
||||
12: 0, # Studio
|
||||
13: 1, # One Bedroom
|
||||
14: 2, # Two Bedrooms
|
||||
15: 3, # Three Bedrooms
|
||||
16: 4, # Four or more Bedrooms
|
||||
}
|
||||
|
||||
# Local authority district codes in England
|
||||
LA_PREFIXES = ("E06", "E07", "E08", "E09")
|
||||
|
||||
|
||||
def _read_sheet(xls_path: Path, sheet_id: int, bedrooms: int) -> pl.DataFrame:
    """Read one bedroom-category sheet and extract LA-level median rents.

    Args:
        xls_path: Path to the downloaded workbook.
        sheet_id: Sheet number handed to ``pl.read_excel``.
        bedrooms: Bedroom count stamped onto every returned row.

    Returns:
        A frame with the columns ``area_code``, ``median_monthly_rent``
        (Float32, null where the cell is not numeric) and ``bedrooms``
        (UInt8), restricted to rows whose area code starts with one of
        ``LA_PREFIXES``.
    """
    df = pl.read_excel(xls_path, sheet_id=sheet_id)

    # Columns are unnamed; positional:
    #   0=LA Code, 1=Area Code, 2=Area Name, 3=Count, 4=Mean, 5=LQ, 6=Median, 7=UQ
    # First 4 rows are headers (title, notes, bedroom label, column headers)
    df = df.slice(4)

    area_code_col = df.columns[1]
    median_col = df.columns[6]

    # Build the prefix test from the shared constant and keep it in its own
    # expression so the null guard applies to every prefix, not just the
    # first one (``&`` binds tighter than ``|``).
    has_la_prefix = pl.any_horizontal(
        *(pl.col("area_code").str.starts_with(prefix) for prefix in LA_PREFIXES)
    )

    return (
        df.select(
            pl.col(area_code_col).alias("area_code"),
            pl.col(median_col).alias("median_monthly_rent"),
        )
        .filter(pl.col("area_code").is_not_null() & has_la_prefix)
        .with_columns(
            # Suppressed values are ".." — the non-strict cast turns them to null
            pl.col("median_monthly_rent").cast(pl.Float32, strict=False),
            pl.lit(bedrooms).cast(pl.UInt8).alias("bedrooms"),
        )
    )
|
||||
|
||||
|
||||
def convert_to_parquet(xls_path: Path, parquet_path: Path) -> None:
    """Combine every bedroom sheet of the workbook into one parquet file.

    Reads each sheet listed in ``BEDROOM_SHEETS`` via ``_read_sheet``,
    concatenates the results, prints a short summary and writes the combined
    frame to ``parquet_path`` with zstd compression.

    Args:
        xls_path: Path to the downloaded workbook.
        parquet_path: Destination parquet file; missing parent directories
            are created.
    """
    frames = []
    for sheet_id, bedrooms in BEDROOM_SHEETS.items():
        df = _read_sheet(xls_path, sheet_id, bedrooms)
        print(f"  Sheet {sheet_id} (bedrooms={bedrooms}): {df.height} rows")
        frames.append(df)

    combined = pl.concat(frames)
    print(f"Combined: {combined.shape}")
    print(f"Non-null medians: {combined['median_monthly_rent'].drop_nulls().len()}")
    print(combined.head(10))

    # Create the target directory up front so a fresh checkout does not fail
    # at the very last step, after the download and parsing are done.
    parquet_path.parent.mkdir(parents=True, exist_ok=True)
    combined.write_parquet(parquet_path, compression="zstd")
    print(f"Saved to {parquet_path}")
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: fetch the ONS workbook and convert it to parquet.

    The workbook is downloaded into a temporary directory that is removed
    once the conversion has finished.
    """
    cli = argparse.ArgumentParser(
        description="Download and convert ONS private rental market statistics",
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    opts = cli.parse_args()

    with tempfile.TemporaryDirectory() as scratch_dir:
        workbook = Path(scratch_dir, "rental_prices.xls")
        download(URL, workbook, timeout=60)
        convert_to_parquet(workbook, opts.output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -6,6 +6,8 @@ from ..utils import fuzzy_join_on_postcode
|
|||
|
||||
pl.Config.set_tbl_cols(-1)
|
||||
|
||||
RATING_RANK = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Fuzzy join EPC and Price Paid data")
|
||||
|
|
@ -20,7 +22,7 @@ def main():
|
|||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
epc = (
|
||||
epc_base = (
|
||||
pl.scan_csv(args.epc)
|
||||
.select(
|
||||
pl.col("ADDRESS").alias("epc_address"),
|
||||
|
|
@ -42,11 +44,90 @@ def main():
|
|||
.otherwise(pl.col("NUMBER_HABITABLE_ROOMS"))
|
||||
.alias("NUMBER_HABITABLE_ROOMS"),
|
||||
)
|
||||
.sort("INSPECTION_DATE", descending=True)
|
||||
)
|
||||
|
||||
# Dedup fork: keep latest certificate per property (existing logic)
|
||||
epc = (
|
||||
epc_base.sort("INSPECTION_DATE", descending=True)
|
||||
.group_by("epc_address", "POSTCODE")
|
||||
.first()
|
||||
)
|
||||
|
||||
# Events fork: detect renovation events between consecutive certificates
|
||||
# Collect eagerly because .over() window functions don't work in streaming
|
||||
# engine (fuzzy_join.py:50 uses sink_parquet which requires streaming).
|
||||
events = (
|
||||
epc_base.sort("INSPECTION_DATE")
|
||||
.with_columns(
|
||||
pl.col("CURRENT_ENERGY_RATING")
|
||||
.replace_strict(RATING_RANK, default=None, return_dtype=pl.Int32)
|
||||
.alias("_rating_rank"),
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("NUMBER_HABITABLE_ROOMS")
|
||||
.shift(1)
|
||||
.over("epc_address", "POSTCODE")
|
||||
.alias("_prev_rooms"),
|
||||
pl.col("TOTAL_FLOOR_AREA")
|
||||
.shift(1)
|
||||
.over("epc_address", "POSTCODE")
|
||||
.alias("_prev_area"),
|
||||
pl.col("_rating_rank")
|
||||
.shift(1)
|
||||
.over("epc_address", "POSTCODE")
|
||||
.alias("_prev_rating_rank"),
|
||||
)
|
||||
.with_columns(
|
||||
pl.when(
|
||||
pl.col("NUMBER_HABITABLE_ROOMS").is_not_null()
|
||||
& pl.col("_prev_rooms").is_not_null()
|
||||
& (pl.col("NUMBER_HABITABLE_ROOMS") != pl.col("_prev_rooms"))
|
||||
)
|
||||
.then(pl.lit("Remodeling"))
|
||||
.when(
|
||||
pl.col("TOTAL_FLOOR_AREA").is_not_null()
|
||||
& pl.col("_prev_area").is_not_null()
|
||||
& (pl.col("TOTAL_FLOOR_AREA") > pl.col("_prev_area"))
|
||||
)
|
||||
.then(pl.lit("Extension"))
|
||||
.when(
|
||||
pl.col("_rating_rank").is_not_null()
|
||||
& pl.col("_prev_rating_rank").is_not_null()
|
||||
& (pl.col("_rating_rank") < pl.col("_prev_rating_rank"))
|
||||
)
|
||||
.then(pl.lit("Renovation"))
|
||||
.otherwise(pl.lit(None, dtype=pl.String))
|
||||
.alias("_event"),
|
||||
)
|
||||
.filter(pl.col("_event").is_not_null())
|
||||
.with_columns(
|
||||
pl.col("INSPECTION_DATE")
|
||||
.cast(pl.String)
|
||||
.str.slice(0, 4)
|
||||
.cast(pl.Int32)
|
||||
.alias("_event_year"),
|
||||
)
|
||||
.group_by("epc_address", "POSTCODE")
|
||||
.agg(
|
||||
pl.struct(
|
||||
pl.col("_event_year").alias("year"),
|
||||
pl.col("_event").alias("event"),
|
||||
).alias("renovation_history"),
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
|
||||
event_counts = events["renovation_history"].explode().struct.field("event").value_counts()
|
||||
print(f"Renovation events: {events.height} properties with events")
|
||||
print(event_counts)
|
||||
|
||||
# Left-join events back onto dedup EPC
|
||||
epc = epc.join(
|
||||
events.lazy(),
|
||||
on=["epc_address", "POSTCODE"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
print("EPC dataset")
|
||||
print(epc.head().collect())
|
||||
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ def _build_wide(
|
|||
school_proximity_path: Path,
|
||||
broadband_path: Path,
|
||||
geosure_path: Path,
|
||||
rental_prices_path: Path,
|
||||
) -> pl.DataFrame:
|
||||
"""Build the wide dataframe by joining epc_pp with all auxiliary data."""
|
||||
wide = (
|
||||
|
|
@ -94,6 +95,21 @@ def _build_wide(
|
|||
how="left",
|
||||
)
|
||||
|
||||
# Derive bedroom count: habitable rooms - 1 (assuming 1 reception room), clipped to 0..4
|
||||
wide = wide.with_columns(
|
||||
(pl.col("number_habitable_rooms") - 1)
|
||||
.clip(0, 4)
|
||||
.cast(pl.UInt8)
|
||||
.alias("_bedrooms"),
|
||||
)
|
||||
rental = pl.scan_parquet(rental_prices_path)
|
||||
wide = wide.join(
|
||||
rental,
|
||||
left_on=["Local Authority District code (2024)", "_bedrooms"],
|
||||
right_on=["area_code", "bedrooms"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
crime = pl.scan_parquet(crime_path)
|
||||
wide = wide.join(crime, left_on="lsoa21", right_on="LSOA code", how="left")
|
||||
|
||||
|
|
@ -208,6 +224,7 @@ def _build_wide(
|
|||
.drop(
|
||||
"inspection_date",
|
||||
"floor_height",
|
||||
"_bedrooms",
|
||||
"LSOA name (2021)",
|
||||
"Local Authority District code (2024)",
|
||||
"Local Authority District name (2024)",
|
||||
|
|
@ -258,6 +275,7 @@ def _build_wide(
|
|||
"running_sand_risk": "Running sand risk",
|
||||
"shrink_swell_risk": "Shrink-swell risk",
|
||||
"soluble_rocks_risk": "Soluble rocks risk",
|
||||
"median_monthly_rent": "Estimated monthly rent",
|
||||
}
|
||||
)
|
||||
)
|
||||
|
|
@ -332,6 +350,12 @@ def main():
|
|||
required=True,
|
||||
help="GeoSure ground stability parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--rental-prices",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="ONS rental prices by LA and bedroom count parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output parquet file path"
|
||||
)
|
||||
|
|
@ -350,6 +374,7 @@ def main():
|
|||
school_proximity_path=args.school_proximity,
|
||||
broadband_path=args.broadband,
|
||||
geosure_path=args.geosure,
|
||||
rental_prices_path=args.rental_prices,
|
||||
)
|
||||
|
||||
print(f"Columns: {wide.columns}")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue