lmao
This commit is contained in:
parent
03445188ea
commit
524580eb25
102 changed files with 36625 additions and 1295 deletions
|
|
@ -12,7 +12,7 @@ import osmium
|
|||
import polars as pl
|
||||
from tqdm import tqdm
|
||||
|
||||
from .pois import UK_BBOX_EAST, UK_BBOX_NORTH, UK_BBOX_SOUTH, UK_BBOX_WEST, download_pbf
|
||||
from .pois import UK_BBOX_EAST, UK_BBOX_NORTH, UK_BBOX_SOUTH, UK_BBOX_WEST
|
||||
|
||||
PLACE_TYPES = {
|
||||
"city",
|
||||
|
|
@ -74,9 +74,17 @@ class PlaceHandler(osmium.SimpleHandler):
|
|||
self._progress = progress
|
||||
self.places: list[dict] = []
|
||||
|
||||
def _add(self, name: str, place_type: str, lat: float, lon: float, population: int) -> None:
|
||||
def _add(
|
||||
self, name: str, place_type: str, lat: float, lon: float, population: int
|
||||
) -> None:
|
||||
self.places.append(
|
||||
{"name": name, "place_type": place_type, "lat": lat, "lon": lon, "population": population}
|
||||
{
|
||||
"name": name,
|
||||
"place_type": place_type,
|
||||
"lat": lat,
|
||||
"lon": lon,
|
||||
"population": population,
|
||||
}
|
||||
)
|
||||
self._progress.set_postfix(places=f"{len(self.places):,}", refresh=False)
|
||||
|
||||
|
|
@ -85,7 +93,10 @@ class PlaceHandler(osmium.SimpleHandler):
|
|||
if not n.location.valid:
|
||||
return
|
||||
lat, lon = n.location.lat, n.location.lon
|
||||
if not (UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH and UK_BBOX_WEST <= lon <= UK_BBOX_EAST):
|
||||
if not (
|
||||
UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
|
||||
and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
|
||||
):
|
||||
return
|
||||
|
||||
name = n.tags.get("name:en", n.tags.get("name", ""))
|
||||
|
|
@ -112,27 +123,16 @@ class PlaceHandler(osmium.SimpleHandler):
|
|||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Extract place names from OSM PBF"
|
||||
)
|
||||
parser = argparse.ArgumentParser(description="Extract place names from OSM PBF")
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output parquet file path"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pbf", type=Path, default=None, help="Path to existing PBF file (skips download)"
|
||||
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.pbf and args.pbf.exists():
|
||||
pbf_file = args.pbf
|
||||
print(f"Using existing PBF: {pbf_file}")
|
||||
else:
|
||||
pbf_file = Path("data/great-britain-latest.osm.pbf")
|
||||
if not pbf_file.exists():
|
||||
download_pbf(pbf_file)
|
||||
else:
|
||||
print(f"Using cached PBF: {pbf_file}")
|
||||
|
||||
pbf_file = args.pbf
|
||||
print(f"Extracting place nodes: {sorted(PLACE_TYPES)} + railway=station")
|
||||
with tqdm(
|
||||
unit=" elements",
|
||||
|
|
|
|||
|
|
@ -1,6 +1,4 @@
|
|||
import argparse
|
||||
import tempfile
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from tempfile import mkdtemp
|
||||
|
||||
|
|
@ -13,8 +11,6 @@ BATCH_SIZE = 50_000
|
|||
|
||||
MIN_OCCURENCE_COUNT = 20
|
||||
|
||||
GEOFABRIK_GB_URL = "https://download.geofabrik.de/europe/great-britain-latest.osm.pbf"
|
||||
|
||||
UK_BBOX_WEST = -7.57
|
||||
UK_BBOX_SOUTH = 49.96
|
||||
UK_BBOX_EAST = 1.68
|
||||
|
|
@ -34,27 +30,6 @@ POI_TAG_KEYS: list[str] = [
|
|||
]
|
||||
|
||||
|
||||
def download_pbf(pbf_file: Path) -> None:
|
||||
pbf_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
tmp = pbf_file.with_suffix(".pbf.tmp")
|
||||
print(f"Downloading {GEOFABRIK_GB_URL}")
|
||||
|
||||
with (
|
||||
tqdm(unit="B", unit_scale=True, desc="Downloading") as bar,
|
||||
urllib.request.urlopen(GEOFABRIK_GB_URL) as resp,
|
||||
open(tmp, "wb") as f,
|
||||
):
|
||||
length = resp.headers.get("Content-Length")
|
||||
if length:
|
||||
bar.total = int(length)
|
||||
while chunk := resp.read(1 << 20):
|
||||
f.write(chunk)
|
||||
bar.update(len(chunk))
|
||||
|
||||
tmp.rename(pbf_file)
|
||||
print(f"Saved to {pbf_file}")
|
||||
|
||||
|
||||
class POIHandler(osmium.SimpleHandler):
|
||||
def __init__(self, progress: tqdm, tmp_dir: Path) -> None:
|
||||
super().__init__()
|
||||
|
|
@ -130,51 +105,41 @@ def main() -> None:
|
|||
"--output", type=Path, required=True, help="Output parquet file path"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pbf", type=Path, default=None, help="Path to existing PBF file (skips download)"
|
||||
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
with tempfile.TemporaryDirectory() as cache_dir:
|
||||
if args.pbf and args.pbf.exists():
|
||||
pbf_file = args.pbf
|
||||
print(f"Using provided PBF file at {pbf_file}")
|
||||
else:
|
||||
pbf_file = Path(cache_dir) / "great-britain-latest.osm.pbf"
|
||||
if not pbf_file.exists():
|
||||
download_pbf(pbf_file)
|
||||
else:
|
||||
print(f"Using cached PBF file at {pbf_file}")
|
||||
pbf_file = args.pbf
|
||||
print(f"Tag keys: {POI_TAG_KEYS}")
|
||||
|
||||
print(f"Tag keys: {POI_TAG_KEYS}")
|
||||
tmp_dir = Path(mkdtemp(prefix="pois_"))
|
||||
with tqdm(
|
||||
unit=" elements",
|
||||
unit_scale=True,
|
||||
desc="Streaming",
|
||||
smoothing=0.05,
|
||||
mininterval=1.0,
|
||||
) as progress:
|
||||
handler = POIHandler(progress, tmp_dir)
|
||||
handler.apply_file(str(pbf_file), locations=True)
|
||||
handler._flush_batch() # write any remaining POIs
|
||||
|
||||
tmp_dir = Path(mkdtemp(prefix="pois_"))
|
||||
with tqdm(
|
||||
unit=" elements",
|
||||
unit_scale=True,
|
||||
desc="Streaming",
|
||||
smoothing=0.05,
|
||||
mininterval=1.0,
|
||||
) as progress:
|
||||
handler = POIHandler(progress, tmp_dir)
|
||||
handler.apply_file(str(pbf_file), locations=True)
|
||||
handler._flush_batch() # write any remaining POIs
|
||||
print(f"Extracted {handler.poi_count:,} POIs")
|
||||
|
||||
print(f"Extracted {handler.poi_count:,} POIs")
|
||||
batch_files = sorted(tmp_dir.glob("batch_*.parquet"))
|
||||
df = pl.concat([pl.scan_parquet(f) for f in batch_files])
|
||||
|
||||
batch_files = sorted(tmp_dir.glob("batch_*.parquet"))
|
||||
df = pl.concat([pl.scan_parquet(f) for f in batch_files])
|
||||
# Only keep categories with enough occurrences
|
||||
valid_categories = (
|
||||
df.group_by("category")
|
||||
.agg(pl.len().alias("count"))
|
||||
.filter(pl.col("count") >= MIN_OCCURENCE_COUNT)
|
||||
)
|
||||
df = df.join(valid_categories.select("category"), on="category", how="semi")
|
||||
|
||||
# Only keep categories with enough occurrences
|
||||
valid_categories = (
|
||||
df.group_by("category")
|
||||
.agg(pl.len().alias("count"))
|
||||
.filter(pl.col("count") >= MIN_OCCURENCE_COUNT)
|
||||
)
|
||||
df = df.join(valid_categories.select("category"), on="category", how="semi")
|
||||
|
||||
print(f"Total POIs: {handler.poi_count:,}")
|
||||
df.sink_parquet(args.output)
|
||||
print(f"Saved to {args.output}")
|
||||
print(f"Total POIs: {handler.poi_count:,}")
|
||||
df.sink_parquet(args.output)
|
||||
print(f"Saved to {args.output}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ BEDROOM_SHEETS = {
|
|||
16: 4, # Four or more Bedrooms
|
||||
}
|
||||
|
||||
# Local authority district codes in England
|
||||
# Local authority district codes in England, https://en.wikipedia.org/wiki/ONS_coding_system
|
||||
LA_PREFIXES = ("E06", "E07", "E08", "E09")
|
||||
|
||||
|
||||
|
|
@ -41,10 +41,9 @@ def _read_sheet(xls_path: Path, sheet_id: int, bedrooms: int) -> pl.DataFrame:
|
|||
)
|
||||
.filter(
|
||||
pl.col("area_code").is_not_null()
|
||||
& pl.col("area_code").str.starts_with("E06")
|
||||
| pl.col("area_code").str.starts_with("E07")
|
||||
| pl.col("area_code").str.starts_with("E08")
|
||||
| pl.col("area_code").str.starts_with("E09")
|
||||
& pl.any_horizontal(
|
||||
pl.col("area_code").str.starts_with(p) for p in LA_PREFIXES
|
||||
)
|
||||
)
|
||||
.with_columns(
|
||||
# Suppressed values are ".." — cast will turn them to null
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue