has issues

This commit is contained in:
Andras Schmelczer 2026-05-25 13:20:17 +01:00
parent 2e112d7398
commit c645b0f1d4
96 changed files with 2147083 additions and 5787 deletions

View file

@ -182,6 +182,19 @@ DROP_CATEGORIES = {
"public_transport/platform",
"public_transport/station",
"public_transport/stop_position",
# Education amenities — schools come from GIAS instead. OSM coverage for
# tertiary education, tutoring, and childcare is too noisy/incomplete to be
# useful on a property-search map.
"amenity/school",
"amenity/prep_school",
"amenity/language_school",
"amenity/music_school",
"amenity/university",
"amenity/college",
"building/university",
"amenity/kindergarten",
"amenity/childcare",
"office/tutoring",
}
@ -943,23 +956,10 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"tourism/chalet",
],
),
(
"Education",
"School",
"🏫",
[
"amenity/school",
"amenity/prep_school",
"amenity/language_school",
"amenity/music_school",
"amenity/university",
"amenity/college",
"building/university",
"amenity/kindergarten",
"amenity/childcare",
"office/tutoring",
],
),
# Note: schools come from the GIAS register (see transform_gias_schools).
# Niche/tertiary education amenities that GIAS does not cover are dropped
# rather than mixed in with state-funded schools.
(
"Local Businesses",
"Hotel",
@ -1316,11 +1316,45 @@ def transform_grocery_retail_points(
).select("id", "name", "category", "icon_category", "group", "lat", "lng", "emoji")
def transform_gias_schools(gias_path: Path) -> pl.LazyFrame:
    """Lazily project the GIAS register parquet onto POI rows.

    Each output row carries the common POI columns (``id``, ``name``,
    ``category``, ``icon_category``, ``group``, ``lat``, ``lng``, ``emoji``)
    followed by the register's descriptive fields, each re-exposed under a
    ``school_`` prefix.

    Args:
        gias_path: Location of the GIAS register parquet file.

    Returns:
        A lazy frame; nothing is read until the caller collects it.
    """

    def as_school(column: str) -> pl.Expr:
        # Pass a register column through unchanged under its "school_" name.
        return pl.col(column).alias(f"school_{column}")

    def as_school_cast(column: str, dtype) -> pl.Expr:
        # Non-strict cast: values that cannot be converted become null
        # instead of aborting the whole query.
        return pl.col(column).cast(dtype, strict=False).alias(f"school_{column}")

    # The id is namespaced with "gias-" in front of the stringified URN.
    poi_id = pl.concat_str(
        [pl.lit("gias-"), pl.col("urn").cast(pl.String)]
    ).alias("id")

    core_columns = [
        poi_id,
        pl.col("name"),
        pl.lit("School").alias("category"),
        pl.lit("School").alias("icon_category"),
        pl.lit("Education").alias("group"),
        pl.col("lat").cast(pl.Float64),
        pl.col("lng").cast(pl.Float64),
        pl.lit("🏫").alias("emoji"),
    ]

    descriptive_columns = [
        as_school(column)
        for column in (
            "phase",
            "type",
            "type_group",
            "age_range",
            "gender",
            "religious_character",
            "admissions_policy",
            "nursery_provision",
            "sixth_form",
        )
    ]

    numeric_columns = [
        as_school_cast("capacity", pl.Int32),
        as_school_cast("pupils", pl.Int32),
        as_school_cast("fsm_percent", pl.Float32),
    ]

    contact_columns = [
        as_school("trust"),
        as_school("address"),
        as_school("postcode"),
        as_school("local_authority"),
        as_school("website"),
        as_school_cast("telephone", pl.String),
        as_school("head_name"),
    ]

    # Column order is kept exactly as listed: core, descriptive, numeric, contact.
    register = pl.scan_parquet(gias_path)
    return register.select(
        core_columns + descriptive_columns + numeric_columns + contact_columns
    )
def transform(
input_path: Path,
naptan_path: Path | None = None,
boundary_path: Path | None = None,
grocery_retail_points_path: Path | None = None,
naptan_path: Path,
boundary_path: Path,
grocery_retail_points_path: Path,
gias_path: Path,
) -> pl.LazyFrame:
lf = pl.scan_parquet(input_path)
@ -1372,24 +1406,21 @@ def transform(
)
naptan_df = pl.scan_parquet(naptan_path).collect()
if boundary_path is not None:
mask = in_england_mask(
boundary_path,
naptan_df["lat"].to_numpy(),
naptan_df["lng"].to_numpy(),
)
naptan_df = naptan_df.filter(pl.Series(mask))
mask = in_england_mask(
boundary_path,
naptan_df["lat"].to_numpy(),
naptan_df["lng"].to_numpy(),
)
naptan_df = naptan_df.filter(pl.Series(mask))
naptan = naptan_df.lazy().with_columns(
pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
pl.lit("Public Transport").alias("group"),
pl.col("category").alias("icon_category"),
)
frames = [lf, naptan]
if grocery_retail_points_path is not None:
grocery_df = pl.read_parquet(grocery_retail_points_path)
grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
frames.append(grocery_pois.lazy())
grocery_df = pl.read_parquet(grocery_retail_points_path)
grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
frames = [lf, naptan, grocery_pois.lazy(), transform_gias_schools(gias_path)]
return pl.concat(frames, how="diagonal_relaxed")
@ -1413,8 +1444,15 @@ def main():
parser.add_argument(
"--grocery-retail-points",
type=Path,
required=True,
help="GEOLYTIX Grocery Retail Points parquet",
)
parser.add_argument(
"--gias",
type=Path,
required=True,
help="GIAS schools register parquet (replaces OSM schools)",
)
parser.add_argument(
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
)
@ -1425,6 +1463,7 @@ def main():
args.naptan,
args.boundary,
args.grocery_retail_points,
args.gias,
).collect(engine="streaming")
df.write_parquet(args.output)