diff --git a/pipeline/transform/transform_poi.py b/pipeline/transform/transform_poi.py index 157c548..3b1257f 100644 --- a/pipeline/transform/transform_poi.py +++ b/pipeline/transform/transform_poi.py @@ -1712,13 +1712,13 @@ def transform( lf.select("category").unique().collect(engine="streaming").to_series().to_list() ) - # Verify every non-dropped category has a mapping + # Warn about (and ignore) any category lacking a mapping unmapped = [] for cat in all_categories: if cat not in DROP_CATEGORIES and cat not in CATEGORY_MAP: unmapped.append(cat) if unmapped: - raise ValueError(f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}") + print(f"Ignoring categories missing from CATEGORY_MAP: {sorted(unmapped)}") # Warn about CATEGORY_MAP keys not in data (may be absent in regional extracts) mapped_but_absent = [] @@ -1731,8 +1731,8 @@ def transform( f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}" ) - # Drop unwanted categories - lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES))) + # Drop unwanted and unmapped categories + lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES) + unmapped)) # Drop UNNAMED instances of private-dominated tags (gardens, pitches, # pools) so they don't inflate Park / Sports Centre proximity counts. Done