This commit is contained in:
Andras Schmelczer 2026-02-15 22:39:49 +00:00
parent 03445188ea
commit 524580eb25
102 changed files with 36625 additions and 1295 deletions

View file

@ -12,7 +12,7 @@ import osmium
import polars as pl
from tqdm import tqdm
from .pois import UK_BBOX_EAST, UK_BBOX_NORTH, UK_BBOX_SOUTH, UK_BBOX_WEST, download_pbf
from .pois import UK_BBOX_EAST, UK_BBOX_NORTH, UK_BBOX_SOUTH, UK_BBOX_WEST
PLACE_TYPES = {
"city",
@ -74,9 +74,17 @@ class PlaceHandler(osmium.SimpleHandler):
self._progress = progress
self.places: list[dict] = []
def _add(self, name: str, place_type: str, lat: float, lon: float, population: int) -> None:
def _add(
self, name: str, place_type: str, lat: float, lon: float, population: int
) -> None:
self.places.append(
{"name": name, "place_type": place_type, "lat": lat, "lon": lon, "population": population}
{
"name": name,
"place_type": place_type,
"lat": lat,
"lon": lon,
"population": population,
}
)
self._progress.set_postfix(places=f"{len(self.places):,}", refresh=False)
@ -85,7 +93,10 @@ class PlaceHandler(osmium.SimpleHandler):
if not n.location.valid:
return
lat, lon = n.location.lat, n.location.lon
if not (UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH and UK_BBOX_WEST <= lon <= UK_BBOX_EAST):
if not (
UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
):
return
name = n.tags.get("name:en", n.tags.get("name", ""))
@ -112,27 +123,16 @@ class PlaceHandler(osmium.SimpleHandler):
def main() -> None:
parser = argparse.ArgumentParser(
description="Extract place names from OSM PBF"
)
parser = argparse.ArgumentParser(description="Extract place names from OSM PBF")
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
parser.add_argument(
"--pbf", type=Path, default=None, help="Path to existing PBF file (skips download)"
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
)
args = parser.parse_args()
if args.pbf and args.pbf.exists():
pbf_file = args.pbf
print(f"Using existing PBF: {pbf_file}")
else:
pbf_file = Path("data/great-britain-latest.osm.pbf")
if not pbf_file.exists():
download_pbf(pbf_file)
else:
print(f"Using cached PBF: {pbf_file}")
pbf_file = args.pbf
print(f"Extracting place nodes: {sorted(PLACE_TYPES)} + railway=station")
with tqdm(
unit=" elements",

View file

@ -1,6 +1,4 @@
import argparse
import tempfile
import urllib.request
from pathlib import Path
from tempfile import mkdtemp
@ -13,8 +11,6 @@ BATCH_SIZE = 50_000
MIN_OCCURENCE_COUNT = 20
GEOFABRIK_GB_URL = "https://download.geofabrik.de/europe/great-britain-latest.osm.pbf"
UK_BBOX_WEST = -7.57
UK_BBOX_SOUTH = 49.96
UK_BBOX_EAST = 1.68
@ -34,27 +30,6 @@ POI_TAG_KEYS: list[str] = [
]
def download_pbf(pbf_file: Path) -> None:
    """Download the Geofabrik Great Britain extract to ``pbf_file``.

    The response is streamed in 1 MiB chunks into a sibling temporary file,
    which is moved into place only after the whole body has been read, so
    ``pbf_file`` never holds a partially written download. Missing parent
    directories are created.

    Args:
        pbf_file: Destination path for the downloaded PBF file.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
        OSError: If the file cannot be written or moved into place.
    """
    pbf_file.parent.mkdir(parents=True, exist_ok=True)
    tmp = pbf_file.with_suffix(".pbf.tmp")
    print(f"Downloading {GEOFABRIK_GB_URL}")
    try:
        with (
            tqdm(unit="B", unit_scale=True, desc="Downloading") as bar,
            urllib.request.urlopen(GEOFABRIK_GB_URL) as resp,
            open(tmp, "wb") as f,
        ):
            # The total is only known when the server sends Content-Length.
            length = resp.headers.get("Content-Length")
            if length:
                bar.total = int(length)
            while chunk := resp.read(1 << 20):
                f.write(chunk)
                bar.update(len(chunk))
        # replace() overwrites an existing target on every platform, whereas
        # rename() raises on Windows if the destination already exists.
        tmp.replace(pbf_file)
    except BaseException:
        # Do not leave a partial download behind on failure or Ctrl-C.
        tmp.unlink(missing_ok=True)
        raise
    print(f"Saved to {pbf_file}")
class POIHandler(osmium.SimpleHandler):
def __init__(self, progress: tqdm, tmp_dir: Path) -> None:
super().__init__()
@ -130,51 +105,41 @@ def main() -> None:
"--output", type=Path, required=True, help="Output parquet file path"
)
parser.add_argument(
"--pbf", type=Path, default=None, help="Path to existing PBF file (skips download)"
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
)
args = parser.parse_args()
with tempfile.TemporaryDirectory() as cache_dir:
if args.pbf and args.pbf.exists():
pbf_file = args.pbf
print(f"Using provided PBF file at {pbf_file}")
else:
pbf_file = Path(cache_dir) / "great-britain-latest.osm.pbf"
if not pbf_file.exists():
download_pbf(pbf_file)
else:
print(f"Using cached PBF file at {pbf_file}")
pbf_file = args.pbf
print(f"Tag keys: {POI_TAG_KEYS}")
print(f"Tag keys: {POI_TAG_KEYS}")
tmp_dir = Path(mkdtemp(prefix="pois_"))
with tqdm(
unit=" elements",
unit_scale=True,
desc="Streaming",
smoothing=0.05,
mininterval=1.0,
) as progress:
handler = POIHandler(progress, tmp_dir)
handler.apply_file(str(pbf_file), locations=True)
handler._flush_batch() # write any remaining POIs
tmp_dir = Path(mkdtemp(prefix="pois_"))
with tqdm(
unit=" elements",
unit_scale=True,
desc="Streaming",
smoothing=0.05,
mininterval=1.0,
) as progress:
handler = POIHandler(progress, tmp_dir)
handler.apply_file(str(pbf_file), locations=True)
handler._flush_batch() # write any remaining POIs
print(f"Extracted {handler.poi_count:,} POIs")
print(f"Extracted {handler.poi_count:,} POIs")
batch_files = sorted(tmp_dir.glob("batch_*.parquet"))
df = pl.concat([pl.scan_parquet(f) for f in batch_files])
batch_files = sorted(tmp_dir.glob("batch_*.parquet"))
df = pl.concat([pl.scan_parquet(f) for f in batch_files])
# Only keep categories with enough occurrences
valid_categories = (
df.group_by("category")
.agg(pl.len().alias("count"))
.filter(pl.col("count") >= MIN_OCCURENCE_COUNT)
)
df = df.join(valid_categories.select("category"), on="category", how="semi")
# Only keep categories with enough occurrences
valid_categories = (
df.group_by("category")
.agg(pl.len().alias("count"))
.filter(pl.col("count") >= MIN_OCCURENCE_COUNT)
)
df = df.join(valid_categories.select("category"), on="category", how="semi")
print(f"Total POIs: {handler.poi_count:,}")
df.sink_parquet(args.output)
print(f"Saved to {args.output}")
print(f"Total POIs: {handler.poi_count:,}")
df.sink_parquet(args.output)
print(f"Saved to {args.output}")
if __name__ == "__main__":

View file

@ -18,7 +18,7 @@ BEDROOM_SHEETS = {
16: 4, # Four or more Bedrooms
}
# Local authority district codes in England
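# GSS code prefixes of English local authority districts:
# E06 unitary authorities, E07 non-metropolitan districts,
# E08 metropolitan districts, E09 London boroughs.
# See https://en.wikipedia.org/wiki/ONS_coding_system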
LA_PREFIXES = ("E06", "E07", "E08", "E09")
@ -41,10 +41,9 @@ def _read_sheet(xls_path: Path, sheet_id: int, bedrooms: int) -> pl.DataFrame:
)
.filter(
pl.col("area_code").is_not_null()
& pl.col("area_code").str.starts_with("E06")
| pl.col("area_code").str.starts_with("E07")
| pl.col("area_code").str.starts_with("E08")
| pl.col("area_code").str.starts_with("E09")
& pl.any_horizontal(
pl.col("area_code").str.starts_with(p) for p in LA_PREFIXES
)
)
.with_columns(
# Suppressed values are ".." — cast will turn them to null