England only
This commit is contained in:
parent
4d08f5d08d
commit
02712f41e8
8 changed files with 294 additions and 60 deletions
|
|
@ -3,6 +3,8 @@ from pathlib import Path
|
|||
|
||||
import polars as pl
|
||||
|
||||
from pipeline.utils.england_geometry import in_england_mask
|
||||
|
||||
|
||||
DROP_CATEGORIES = {
|
||||
# Street furniture & infrastructure
|
||||
|
|
@ -1056,7 +1058,11 @@ NAPTAN_EMOJIS: dict[str, str] = {
|
|||
}
|
||||
|
||||
|
||||
def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame:
|
||||
def transform(
|
||||
input_path: Path,
|
||||
naptan_path: Path | None = None,
|
||||
boundary_path: Path | None = None,
|
||||
) -> pl.LazyFrame:
|
||||
lf = pl.scan_parquet(input_path)
|
||||
|
||||
# Get all unique categories present in the data
|
||||
|
|
@ -1072,16 +1078,14 @@ def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame
|
|||
if unmapped:
|
||||
raise ValueError(f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}")
|
||||
|
||||
# Verify every CATEGORY_MAP key actually exists in the data (catch typos)
|
||||
# Warn about CATEGORY_MAP keys not in data (may be absent in regional extracts)
|
||||
mapped_but_absent = []
|
||||
all_set = set(all_categories)
|
||||
for cat in CATEGORY_MAP:
|
||||
if cat not in all_set:
|
||||
mapped_but_absent.append(cat)
|
||||
if mapped_but_absent:
|
||||
raise ValueError(
|
||||
f"CATEGORY_MAP contains categories not in data: {sorted(mapped_but_absent)}"
|
||||
)
|
||||
print(f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}")
|
||||
|
||||
# Drop unwanted categories
|
||||
lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))
|
||||
|
|
@ -1105,7 +1109,15 @@ def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame
|
|||
pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
|
||||
)
|
||||
|
||||
naptan = pl.scan_parquet(naptan_path).with_columns(
|
||||
naptan_df = pl.scan_parquet(naptan_path).collect()
|
||||
if boundary_path is not None:
|
||||
mask = in_england_mask(
|
||||
boundary_path,
|
||||
naptan_df["lat"].to_numpy(),
|
||||
naptan_df["lng"].to_numpy(),
|
||||
)
|
||||
naptan_df = naptan_df.filter(pl.Series(mask))
|
||||
naptan = naptan_df.lazy().with_columns(
|
||||
pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
|
||||
pl.lit("Public Transport").alias("group"),
|
||||
)
|
||||
|
|
@ -1122,12 +1134,18 @@ def main():
|
|||
parser.add_argument(
|
||||
"--naptan", type=Path, required=True, help="NaPTAN stations parquet file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--boundary",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="England boundary GeoJSON file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
df = transform(args.input, args.naptan).collect(engine="streaming")
|
||||
df = transform(args.input, args.naptan, args.boundary).collect(engine="streaming")
|
||||
|
||||
df.write_parquet(args.output)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue