England only

This commit is contained in:
Andras Schmelczer 2026-03-15 14:03:38 +00:00
parent 4d08f5d08d
commit 02712f41e8
8 changed files with 294 additions and 60 deletions

View file

@ -3,6 +3,8 @@ from pathlib import Path
import polars as pl
from pipeline.utils.england_geometry import in_england_mask
DROP_CATEGORIES = {
# Street furniture & infrastructure
@ -1056,7 +1058,11 @@ NAPTAN_EMOJIS: dict[str, str] = {
}
def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame:
def transform(
input_path: Path,
naptan_path: Path | None = None,
boundary_path: Path | None = None,
) -> pl.LazyFrame:
lf = pl.scan_parquet(input_path)
# Get all unique categories present in the data
@ -1072,16 +1078,14 @@ def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame
if unmapped:
raise ValueError(f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}")
# Verify every CATEGORY_MAP key actually exists in the data (catch typos)
# Warn about CATEGORY_MAP keys not in data (may be absent in regional extracts)
mapped_but_absent = []
all_set = set(all_categories)
for cat in CATEGORY_MAP:
if cat not in all_set:
mapped_but_absent.append(cat)
if mapped_but_absent:
raise ValueError(
f"CATEGORY_MAP contains categories not in data: {sorted(mapped_but_absent)}"
)
print(f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}")
# Drop unwanted categories
lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))
@ -1105,7 +1109,15 @@ def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame
pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
)
naptan = pl.scan_parquet(naptan_path).with_columns(
naptan_df = pl.scan_parquet(naptan_path).collect()
if boundary_path is not None:
mask = in_england_mask(
boundary_path,
naptan_df["lat"].to_numpy(),
naptan_df["lng"].to_numpy(),
)
naptan_df = naptan_df.filter(pl.Series(mask))
naptan = naptan_df.lazy().with_columns(
pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
pl.lit("Public Transport").alias("group"),
)
@ -1122,12 +1134,18 @@ def main():
parser.add_argument(
"--naptan", type=Path, required=True, help="NaPTAN stations parquet file"
)
parser.add_argument(
"--boundary",
type=Path,
required=True,
help="England boundary GeoJSON file",
)
parser.add_argument(
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
)
args = parser.parse_args()
df = transform(args.input, args.naptan).collect(engine="streaming")
df = transform(args.input, args.naptan, args.boundary).collect(engine="streaming")
df.write_parquet(args.output)