diff --git a/Makefile.data b/Makefile.data
index bfcc286..660b9e1 100644
--- a/Makefile.data
+++ b/Makefile.data
@@ -29,10 +29,11 @@ PROPERTIES_PQ := $(DATA_DIR)/properties.parquet
MERGE_STAMP := $(DATA_DIR)/.merge_done
PRICE_INDEX := $(DATA_DIR)/price_index.parquet
PRICES_STAMP := $(DATA_DIR)/.prices_done
-EPC := $(MANUAL_DATA)/certificates.csv
+EPC := $(MANUAL_DATA)/domestic-csv.zip
ETHNICITY := $(DATA_DIR)/ethnicity_by_la.parquet
CRIME_DIR := $(MANUAL_DATA)/crime
CRIME := $(DATA_DIR)/crime_by_lsoa.parquet
+CRIME_STAMP := $(CRIME_DIR)/.downloaded
NOISE := $(DATA_DIR)/road_noise.parquet
OFSTED := $(DATA_DIR)/ofsted.parquet
NAPTAN := $(DATA_DIR)/naptan.parquet
@@ -65,7 +66,7 @@ PMTILES_VERSION := 1.22.3
.PHONY: prepare merge tiles \
download-arcgis download-price-paid download-deprivation download-ethnicity \
download-naptan download-pois download-grocery-retail-points download-ofsted download-broadband download-rental-prices \
- download-postcodes download-noise download-inspire \
+ download-postcodes download-noise download-inspire download-crime \
download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-os-greenspace download-pbf download-places download-lsoa-population download-median-age download-england-boundary download-rightmove-outcodes \
transform-pois transform-epc-pp transform-crime transform-poi-proximity \
transform-school-proximity transform-postcode-boundaries \
@@ -78,6 +79,7 @@ download-arcgis: $(ARCGIS)
download-price-paid: $(PRICE_PAID)
download-deprivation: $(IOD)
download-ethnicity: $(ETHNICITY)
+download-crime: $(CRIME_STAMP)
download-naptan: $(NAPTAN)
download-pois: $(POIS_RAW)
download-grocery-retail-points: $(GROCERY_RETAIL_POINTS)
@@ -121,10 +123,10 @@ $(TILES):
$(EPC):
@echo ""
@echo "=== EPC dataset not found ==="
- @echo "The EPC certificates file is required: $@"
+ @echo "The EPC certificates archive is required: $@"
@echo ""
- @echo "To obtain it, register at https://epc.opendatacommunities.org/login"
- @echo "and place certificates.csv in manual-data/"
+ @echo "To obtain it, register at https://get-energy-performance-data.communities.gov.uk/filter-properties?property_type=domestic"
+ @echo "and place domestic-csv.zip in manual-data/"
@echo ""
@exit 1
@@ -140,6 +142,10 @@ $(IOD):
$(ETHNICITY):
uv run python -m pipeline.download.ethnicity --output $@
+$(CRIME_STAMP):
+ uv run python -m pipeline.download.crime --output $(CRIME_DIR)
+ @touch $@
+
$(NAPTAN):
uv run python -m pipeline.download.naptan --output $@
@@ -216,15 +222,7 @@ $(POIS_FILTERED): $(POIS_RAW) $(NAPTAN) $(GROCERY_RETAIL_POINTS) $(ENGLAND_BOUND
$(EPC_PP): $(PRICE_PAID) $(EPC)
uv run python -m pipeline.transform.join_epc_pp --epc $(EPC) --price-paid $(PRICE_PAID) --output $@
-$(CRIME):
- @if [ ! -d "$(CRIME_DIR)" ]; then \
- echo ""; \
- echo "=== Crime dataset not found ==="; \
- echo "Place police.uk crime CSVs in $(CRIME_DIR)/"; \
- echo "Download from https://data.police.uk/data/"; \
- echo ""; \
- exit 1; \
- fi
+$(CRIME): $(CRIME_STAMP)
uv run python -m pipeline.transform.crime --input $(CRIME_DIR) --output $@
$(POI_PROXIMITY): $(ARCGIS) $(POIS_FILTERED) $(OS_GREENSPACE)
diff --git a/manual-data/fixed_broadband_coverage.zip b/manual-data/fixed_broadband_coverage.zip
deleted file mode 100644
index b42cf58..0000000
Binary files a/manual-data/fixed_broadband_coverage.zip and /dev/null differ
diff --git a/manual-data/journey_times_bank.parquet b/manual-data/journey_times_bank.parquet
deleted file mode 100644
index fcdf3b6..0000000
Binary files a/manual-data/journey_times_bank.parquet and /dev/null differ
diff --git a/manual-data/journey_times_fitzrovia.parquet b/manual-data/journey_times_fitzrovia.parquet
deleted file mode 100644
index 825614f..0000000
Binary files a/manual-data/journey_times_fitzrovia.parquet and /dev/null differ
diff --git a/pipeline/download/crime.py b/pipeline/download/crime.py
new file mode 100644
index 0000000..adab8e5
--- /dev/null
+++ b/pipeline/download/crime.py
@@ -0,0 +1,393 @@
+"""Download police.uk crime archive ZIPs.
+
+The archive page lists rolling monthly snapshots. Newer snapshots overlap older
+ones, so extraction keeps files already written by newer archives.
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import html
+import json
+import re
+import shutil
+import sys
+import zipfile
+from dataclasses import asdict, dataclass
+from datetime import UTC, datetime
+from pathlib import Path, PurePosixPath
+from urllib.parse import urljoin
+
+import httpx
+from tqdm import tqdm
+
+ARCHIVE_URL = "https://data.police.uk/data/archive/"
+ARCHIVE_LINK_RE = re.compile(
+ r'
\s*.*?'
+ r'
'
+ r"(?P\s*\((?P
[^)]+)\)\s*"
+ r'\s*(?P.*?)\s*
\s*'
+ r'(?P.*?)
',
+ re.DOTALL,
+)
+VALID_MD5_RE = re.compile(r"^[0-9a-fA-F]{32}$")
+MONTH_RE = re.compile(r"^\d{4}-\d{2}$")
+
+
+@dataclass(frozen=True)
+class CrimeArchive:
+    month: str  # "month" group captured from the archive link
+    label: str  # link label with tags stripped and whitespace collapsed
+    url: str  # href resolved against the index URL
+    filename: str  # last path component of the href
+    size: str  # size text as shown on the page
+    contained_range: str  # text describing the months the snapshot covers
+    md5: str | None  # lowercase digest when it is 32 hex digits, otherwise None
+    raw_md5: str  # lowercased checksum text exactly as parsed from the page
+
+
+def _clean_text(value: str) -> str:
+    untagged = html.unescape(re.sub(r"<[^>]+>", " ", value))  # tags become spaces, then entities are decoded
+    return re.sub(r"\s+", " ", untagged).strip()  # collapse whitespace runs and trim both ends
+
+
+def parse_archives(page_html: str, base_url: str = ARCHIVE_URL) -> list[CrimeArchive]:
+    """Build one CrimeArchive per archive-link match found in the index HTML."""
+    found: list[CrimeArchive] = []
+    for hit in ARCHIVE_LINK_RE.finditer(page_html):
+        link = html.unescape(hit.group("href"))
+        checksum_text = _clean_text(hit.group("md5")).lower()
+        # Only a 32-hex-digit value is usable for verification; the raw text is kept too.
+        checksum = checksum_text if VALID_MD5_RE.fullmatch(checksum_text) else None
+        found.append(
+            CrimeArchive(
+                month=hit.group("month"),
+                label=_clean_text(hit.group("label")),
+                url=urljoin(base_url, link),
+                filename=Path(link).name,
+                size=_clean_text(hit.group("size")),
+                contained_range=_clean_text(hit.group("contained_range")),
+                md5=checksum,
+                raw_md5=checksum_text,
+            )
+        )
+    return found
+
+
+def fetch_archives(archive_url: str = ARCHIVE_URL) -> list[CrimeArchive]:
+    """Download the archive index page and return the archives parsed from it."""
+    client_options = {
+        "follow_redirects": True,
+        "timeout": httpx.Timeout(30.0, read=60.0),
+        "headers": {"User-Agent": "perfect-postcode-data-pipeline/1.0"},
+    }
+    with httpx.Client(**client_options) as client:
+        page = client.get(archive_url)
+        page.raise_for_status()
+    parsed = parse_archives(page.text, archive_url)
+    if not parsed:
+        raise RuntimeError(f"No monthly archive ZIPs found at {archive_url}")
+    return parsed
+
+
+def filter_archives(
+    archives: list[CrimeArchive],
+    *,
+    from_month: str | None = None,
+    to_month: str | None = None,
+    limit: int | None = None,
+) -> list[CrimeArchive]:
+    """Keep archives whose month lies within the inclusive bounds, in input order."""
+
+    def in_range(month: str) -> bool:
+        # Months are compared as plain strings against whichever bounds were given.
+        too_early = from_month is not None and month < from_month
+        too_late = to_month is not None and month > to_month
+        return not (too_early or too_late)
+
+    selected = [entry for entry in archives if in_range(entry.month)]
+    return selected if limit is None else selected[:limit]
+
+
+def file_md5(path: Path) -> str:
+    """Return the file's lowercase hex MD5 (integrity check only, not security)."""
+    with path.open("rb") as file:
+        # usedforsecurity=False keeps this working on FIPS-restricted Python builds.
+        digest = hashlib.file_digest(file, lambda: hashlib.md5(usedforsecurity=False))
+    return digest.hexdigest()
+
+
+def download_archive(
+    archive: CrimeArchive,
+    archive_dir: Path,
+    *,
+    verify: bool,
+    force: bool,
+    timeout: float,
+) -> Path:
+    """Download one archive ZIP, resuming an existing .part file when possible."""
+    dest = archive_dir / archive.filename
+    partial = dest.with_suffix(dest.suffix + ".part")
+    # force discards both the finished ZIP and any partial download.
+    if force:
+        dest.unlink(missing_ok=True)
+        partial.unlink(missing_ok=True)
+    # An existing ZIP is reused unless verification is on and its MD5 differs.
+    if dest.exists():
+        if verify and archive.md5 is not None:
+            actual_md5 = file_md5(dest)
+            if actual_md5 == archive.md5:
+                print(f"{archive.filename}: already downloaded")
+                return dest
+            print(
+                f"{archive.filename}: checksum mismatch, downloading again",
+                file=sys.stderr,
+            )
+            dest.unlink()
+            partial.unlink(missing_ok=True)
+        else:
+            print(f"{archive.filename}: already downloaded")
+            return dest
+    # Ask the server to continue from the end of an existing, non-empty .part file.
+    resume_from = partial.stat().st_size if partial.exists() else 0
+    headers = {"Range": f"bytes={resume_from}-"} if resume_from else {}
+    # NOTE(review): any non-206 error reply to the Range request (e.g. 416 for an already complete .part) raises below - confirm that is intended.
+    with httpx.stream(
+        "GET",
+        archive.url,
+        headers=headers,
+        follow_redirects=True,
+        timeout=httpx.Timeout(30.0, read=timeout),
+    ) as response:
+        if response.status_code == 206 and resume_from:
+            mode = "ab"
+            initial = resume_from
+        else:
+            response.raise_for_status()
+            mode = "wb"
+            initial = 0
+        # Progress total = bytes already on disk + Content-Length; unknown when the header is absent or zero.
+        total_header = int(response.headers.get("content-length", 0))
+        total = initial + total_header if total_header else None
+        with (
+            partial.open(mode) as output,
+            tqdm(
+                total=total,
+                initial=initial,
+                unit="B",
+                unit_scale=True,
+                unit_divisor=1024,
+                desc=archive.filename,
+            ) as progress,
+        ):
+            for chunk in response.iter_bytes(chunk_size=1024 * 1024):
+                output.write(chunk)
+                progress.update(len(chunk))
+    # Promote the completed .part file to its final name.
+    partial.replace(dest)
+    # A freshly downloaded ZIP that fails verification is deleted before raising.
+    if verify and archive.md5 is not None:
+        actual_md5 = file_md5(dest)
+        if actual_md5 != archive.md5:
+            dest.unlink(missing_ok=True)
+            raise RuntimeError(
+                f"{archive.filename}: MD5 mismatch: expected {archive.md5}, got {actual_md5}"
+            )
+
+    return dest
+
+
+def _safe_csv_members(
+    archive: zipfile.ZipFile,
+) -> list[tuple[zipfile.ZipInfo, PurePosixPath]]:
+    """Pair each extractable CSV entry of the ZIP with its relative POSIX path."""
+
+    def acceptable(info: zipfile.ZipInfo, rel_path: PurePosixPath) -> bool:
+        # Directories, absolute paths and ".." components are never extracted.
+        if info.is_dir() or rel_path.is_absolute() or ".." in rel_path.parts:
+            return False
+        return rel_path.suffix.lower() == ".csv"
+
+    candidates = (
+        (info, PurePosixPath(info.filename)) for info in archive.infolist()
+    )
+    return [pair for pair in candidates if acceptable(*pair)]
+
+
+def extract_csvs(
+    zip_path: Path,
+    output_dir: Path,
+    *,
+    overwrite: bool = False,
+) -> tuple[int, int]:
+    """Extract CSVs from one ZIP. Returns (extracted, skipped)."""
+    extracted = skipped = 0
+    with zipfile.ZipFile(zip_path) as archive:
+        for info, rel_path in _safe_csv_members(archive):
+            dest = output_dir.joinpath(*rel_path.parts)
+            if dest.exists() and not overwrite:
+                skipped += 1  # already present, e.g. written from a newer archive
+                continue
+            dest.parent.mkdir(parents=True, exist_ok=True)
+            # Stage the copy in a sibling ".part" file and rename it into place, so an
+            # interrupted run never leaves a truncated CSV that later runs would skip.
+            staging = dest.with_name(dest.name + ".part")
+            with archive.open(info) as source, staging.open("wb") as target:
+                shutil.copyfileobj(source, target)
+            staging.replace(dest)
+            extracted += 1
+    return extracted, skipped
+
+
+def write_manifest(
+    output_dir: Path, archive_url: str, archives: list[CrimeArchive]
+) -> None:
+    """Record the source URL, fetch time (UTC) and archive metadata as JSON."""
+    fetched_at = datetime.now(UTC).isoformat()
+    entries = [asdict(archive) for archive in archives]
+    document = {"source": archive_url, "fetched_at": fetched_at, "archives": entries}
+    # Pretty-printed with a 2-space indent and a trailing newline.
+    manifest_path = output_dir / "archive_manifest.json"
+    manifest_path.write_text(json.dumps(document, indent=2) + "\n")
+
+
+def _month_arg(value: str) -> str:
+    if MONTH_RE.fullmatch(value):  # argparse type hook: pass matching text through unchanged
+        return value
+    raise argparse.ArgumentTypeError("month must be in YYYY-MM format")
+
+
+def main() -> None:  # CLI entry point: list, download and optionally extract the archives
+    parser = argparse.ArgumentParser(
+        description="Download all monthly police.uk crime archive ZIPs"
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        required=True,
+        help="Directory for extracted CSVs; ZIPs are kept under _archives/",
+    )
+    parser.add_argument(
+        "--archive-url",
+        default=ARCHIVE_URL,
+        help=f"Archive index URL (default: {ARCHIVE_URL})",
+    )
+    parser.add_argument(
+        "--from-month",
+        type=_month_arg,
+        help="Only download archives from this YYYY-MM onwards",
+    )
+    parser.add_argument(
+        "--to-month",
+        type=_month_arg,
+        help="Only download archives up to this YYYY-MM",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        help="Download at most this many archives after filtering",
+    )
+    parser.add_argument(
+        "--list",
+        action="store_true",
+        help="Print the archive URLs that would be downloaded and exit",
+    )
+    parser.add_argument(
+        "--no-extract",
+        dest="extract",
+        action="store_false",
+        help="Download ZIPs only; do not extract CSVs",
+    )
+    parser.add_argument(
+        "--overwrite-extracted",
+        action="store_true",
+        help="Overwrite CSVs when extracting overlapping archive snapshots",
+    )
+    parser.add_argument(
+        "--no-verify",
+        dest="verify",
+        action="store_false",
+        help="Skip MD5 verification",
+    )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Redownload archives even if ZIP files already exist",
+    )
+    parser.add_argument(
+        "--timeout",
+        type=float,
+        default=600.0,
+        help="Per-read timeout in seconds for large ZIP downloads",
+    )
+    args = parser.parse_args()
+    # Fetch the index and narrow it to the requested months and count.
+    print("Fetching police.uk archive index...")
+    archives = filter_archives(
+        fetch_archives(args.archive_url),
+        from_month=args.from_month,
+        to_month=args.to_month,
+        limit=args.limit,
+    )
+    if not archives:
+        raise SystemExit("No archives matched the requested filters")
+    # Archives whose listed MD5 is not 32 hex digits are still downloaded, just not verified.
+    bad_md5 = [
+        archive.filename for archive in archives if archive.raw_md5 and not archive.md5
+    ]
+    if bad_md5:
+        print(
+            "Warning: ignoring malformed MD5 values for "
+            + ", ".join(bad_md5[:5])
+            + ("..." if len(bad_md5) > 5 else ""),
+            file=sys.stderr,
+        )
+
+    print(f"Found {len(archives)} monthly archive ZIPs")
+    if args.list:
+        for archive in archives:
+            print(f"{archive.month}\t{archive.url}\t{archive.raw_md5}")
+        return
+    # ZIPs are kept in <output>/_archives; the manifest is written before any download starts.
+    args.output.mkdir(parents=True, exist_ok=True)
+    archive_dir = args.output / "_archives"
+    archive_dir.mkdir(parents=True, exist_ok=True)
+    write_manifest(args.output, args.archive_url, archives)
+    # Archives are processed in list order; without --overwrite-extracted a CSV that already exists is kept.
+    total_extracted = 0
+    total_skipped = 0
+    for index, archive in enumerate(archives, start=1):
+        print(f"[{index}/{len(archives)}] {archive.label} ({archive.size})")
+        zip_path = download_archive(
+            archive,
+            archive_dir,
+            verify=args.verify,
+            force=args.force,
+            timeout=args.timeout,
+        )
+        if args.extract:
+            extracted, skipped = extract_csvs(
+                zip_path,
+                args.output,
+                overwrite=args.overwrite_extracted,
+            )
+            total_extracted += extracted
+            total_skipped += skipped
+            print(
+                f"{archive.filename}: extracted {extracted} CSVs"
+                + (f", skipped {skipped} existing CSVs" if skipped else "")
+            )
+
+    if args.extract:
+        print(
+            f"Done. ZIPs saved in {archive_dir}; extracted {total_extracted} CSVs"
+            + (f" and skipped {total_skipped} existing CSVs" if total_skipped else "")
+            + "."
+        )
+    else:
+        print(f"Done. ZIPs saved in {archive_dir}.")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/pipeline/download/map_assets.py b/pipeline/download/map_assets.py
index 1de7d84..9456177 100644
--- a/pipeline/download/map_assets.py
+++ b/pipeline/download/map_assets.py
@@ -1,9 +1,15 @@
import argparse
+import base64
+import json
+import re
import sys
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
+from io import BytesIO
from pathlib import Path
+from PIL import Image, ImageDraw
+
from pipeline.transform.transform_poi import NAPTAN_EMOJIS, _CATEGORIES
GLYPHS_BASE = "https://protomaps.github.io/basemaps-assets/fonts"
@@ -14,53 +20,80 @@ POI_ICON_BASE = "https://geolytix.github.io/MapIcons"
# Font stacks used by @protomaps/basemaps with lang='en'
FONT_STACKS = ["Noto Sans Regular", "Noto Sans Italic", "Noto Sans Medium"]
-# Fallback emoji not in any category
-_FALLBACK_EMOJIS = ["📍"]
-
POI_ICON_PATHS = [
- "asda/asda_express_24px.svg",
- "asda/asda_green_basket_24px.svg",
- "asda/asda_green_trolley_24px.svg",
- "asda/asda_living_24px.svg",
- "asda/asda_pfs_24px.svg",
- "asda/asda_primary.svg",
- "asda/asda_superstore_green_trolley_24px.svg",
- "brands/aldi_24px.svg",
- "brands/amazon_fresh_alt_24px.svg",
- "brands/booths_24px.svg",
- "brands/budgens_24px.svg",
- "brands/centra_24px.svg",
- "brands/cook.svg",
- "brands/coop_24px.svg",
- "brands/costco_24px.svg",
- "brands/dunnes_stores_24px.svg",
- "brands/farmfoods_updated_24px.svg",
- "brands/heron_24px.svg",
- "brands/iceland_24px.svg",
- "brands/iceland_food_warehouse_24px.svg",
- "brands/lidl_24px.svg",
- "brands/little_waitrose_24px.svg",
- "brands/makro_24px.svg",
- "brands/mns_24px.svg",
- "brands/mns_food_24px.svg",
- "brands/mns_high_street_24px.svg",
- "brands/mns_hospital_24px.svg",
- "brands/mns_moto_24px.svg",
- "brands/mns_outlet_24px.svg",
- "brands/morrisons_24px.svg",
- "brands/morrisons_daily_24px.svg",
- "brands/sainsburys_24px.svg",
- "brands/sainsburys_local_24px.svg",
- "brands/spar_24px.svg",
- "brands/tesco_24px.svg",
- "brands/tesco_express_24px.svg",
- "brands/tesco_extra_24px.svg",
- "brands/waitrose_24px.svg",
- "brands/wholefoods_24px.svg",
- "logos/planet_organic_24px.svg",
+ "brands_2023/supermarkets/farmfoods.svg",
+ "brands_2023/supermarkets/heron_foods.svg",
+ "brands_2023/supermarkets/little_waitrose.svg",
+ "brands_2024/amazon_fresh.svg",
+ "brands_2024/booths.svg",
+ "brands_2024/budgens.svg",
+ "brands_2024/cook.svg",
+ "brands_2024/dunnes_stores.svg",
+ "brands_2024/iceland.svg",
+ "brands_2024/makro.svg",
+ "brands_2024/mns.svg",
+ "brands_2024/morrisons_daily.svg",
+ "brands_2024/sainsburys_local.svg",
+ "brands_2024/wholefoods.svg",
+ "logos/aldi.svg",
+ "logos/asda.svg",
+ "logos/centra.svg",
+ "logos/coop.svg",
+ "logos/lidl.svg",
+ "logos/morrisons.svg",
+ "logos/planet_organic.svg",
+ "logos/sainsburys.svg",
+ "logos/spar.svg",
+ "logos/tesco.svg",
+ "logos/tesco_express.svg",
+ "logos/tesco_extra.svg",
+ "logos/waitrose.svg",
"public_transport/london_tube.svg",
+ "visuals/mns.svg",
]
+DERIVED_POI_ICON_PATHS = [
+ ("costco_logo", "brands/costco.svg", "logos/costco.svg"),
+ (
+ "embedded_png",
+ "brands/iceland_food_warehouse_24px.svg",
+ "logos/the_food_warehouse.png",
+ ),
+]
+
+POI_ICON_SVG_CROPS = {
+ "brands_2023/supermarkets/farmfoods.svg": (1.293, 7.314, 15.48, 3.293),
+ "brands_2023/supermarkets/heron_foods.svg": (0.062, 6.68, 17.995, 5.325),
+ "brands_2023/supermarkets/little_waitrose.svg": (0.916, 5.645, 16.365, 6.719),
+ "brands_2024/amazon_fresh.svg": (3.817, 1.646, 16.367, 16.358),
+ "brands_2024/booths.svg": (1.456, 7.143, 15.313, 3.512),
+ "brands_2024/budgens.svg": (2.251, 2.278, 13.6, 13.612),
+ "brands_2024/cook.svg": (5.028, 5.493, 13.945, 9.648),
+ "brands_2024/dunnes_stores.svg": (4.375, 7.732, 15.249, 5.055),
+ "brands_2024/iceland.svg": (1.136, 6.823, 16.067, 4.302),
+ "brands_2024/makro.svg": (4.411, 6.098, 16.397, 5.428),
+ "brands_2024/mns.svg": (4.042, 6.986, 16.171, 6.724),
+ "brands_2024/morrisons_daily.svg": (3.341, 4.414, 17.317, 8.248),
+ "brands_2024/sainsburys_local.svg": (4.58, 1.61, 14.84, 14.849),
+ "brands_2024/wholefoods.svg": (4.17, 2.193, 15.659, 15.668),
+ "logos/aldi.svg": (4.813, 2.563, 14.374, 14.383),
+ "logos/asda.svg": (3.91, 7.135, 16.181, 5.442),
+ "logos/centra.svg": (3.36, 7.35, 17.28, 4.651),
+ "logos/coop.svg": (6.407, 4.658, 11.187, 11.793),
+ "logos/costco.svg": (70.61, 144.908, 256.67, 85.825),
+ "logos/lidl.svg": (4.938, 2.973, 13.985, 13.985),
+ "logos/morrisons.svg": (5.231, 2.985, 13.538, 13.398),
+ "logos/planet_organic.svg": (5.528, 3.564, 12.943, 12.943),
+ "logos/sainsburys.svg": (7.502, 3.572, 8.996, 12.646),
+ "logos/spar.svg": (4.933, 2.968, 14.133, 13.853),
+ "logos/tesco.svg": (4.338, 6.865, 15.324, 5.359),
+ "logos/tesco_express.svg": (5.231, 5.933, 13.538, 8.345),
+ "logos/tesco_extra.svg": (4.933, 5.775, 14.133, 8.519),
+ "logos/waitrose.svg": (5.528, 6.09, 12.943, 9.855),
+}
+
+POI_ICON_SVG_INTRINSIC_MAX = 512
+
def collect_twemoji_codes() -> list[str]:
"""Derive twemoji hex codes from transform_poi categories.
@@ -76,9 +109,6 @@ def collect_twemoji_codes() -> list[str]:
for emoji in NAPTAN_EMOJIS.values():
emojis.add(emoji)
- for emoji in _FALLBACK_EMOJIS:
- emojis.add(emoji)
-
# First codepoint hex, matching frontend logic
return sorted({f"{ord(e[0]):x}" for e in emojis})
@@ -97,6 +127,214 @@ def download_file(url: str, dest: Path) -> tuple[bool, str]:
return False, url
+def download_text(url: str) -> str:  # fetch url and return the response body decoded as UTF-8
+    with urllib.request.urlopen(url, timeout=60) as response:  # bounded wait: a stalled server must not hang the asset build
+        return response.read().decode("utf-8")
+
+
+def build_costco_logo(marker_svg: str) -> str:
+ start = marker_svg.find('")
+ if start < 0 or end < 0:
+ raise ValueError("Costco marker SVG layout changed")
+
+ logo_group = marker_svg[start : end + 4]
+ return (
+ '\n'
+ '\n"
+ )
+
+
+def trim_white_png(png_bytes: bytes) -> bytes:
+    image = Image.open(BytesIO(png_bytes)).convert("RGBA")
+    pixels = image.load()
+    # Make every near-white pixel (R, G and B all above 245) fully transparent.
+    for y in range(image.height):
+        for x in range(image.width):
+            red, green, blue, alpha = pixels[x, y]
+            if red > 245 and green > 245 and blue > 245:
+                pixels[x, y] = (red, green, blue, 0)
+    # Crop to the bounding box of the pixels whose alpha is still non-zero, if any.
+    alpha_box = image.getchannel("A").getbbox()
+    if alpha_box:
+        image = image.crop(alpha_box)
+    # Re-encode the (possibly cropped) image as PNG bytes.
+    out = BytesIO()
+    image.save(out, format="PNG")
+    return out.getvalue()
+
+
+def extract_embedded_png(marker_svg: str) -> bytes:
+    found = re.search(r"base64,([^\"']+)", marker_svg)  # first base64 payload, up to the closing quote
+    if found is None:
+        raise ValueError("POI marker SVG did not contain an embedded PNG")
+    return trim_white_png(base64.b64decode(found.group(1)))
+
+
+def svg_intrinsic_size(width: float, height: float) -> tuple[int, int]:
+    """Scale width x height so the longer side equals POI_ICON_SVG_INTRINSIC_MAX."""
+    longest = POI_ICON_SVG_INTRINSIC_MAX
+    if width <= 0 or height <= 0:
+        # Degenerate crop: fall back to a square of the maximum size.
+        return (longest, longest)
+    if width >= height:
+        scaled_height = max(1, round(longest * height / width))
+        return (longest, scaled_height)
+    # Taller than wide: the height is pinned and the width shrinks, never below 1.
+    scaled_width = max(1, round(longest * width / height))
+    return (scaled_width, longest)
+
+
+def set_svg_geometry(svg_text: str, crop: tuple[float, float, float, float]) -> str:
+ x, y, width, height = crop
+ view_box = f"{x:g} {y:g} {width:g} {height:g}"
+ intrinsic_width, intrinsic_height = svg_intrinsic_size(width, height)
+
+ svg_text = re.sub(r'viewBox="[^"]+"', f'viewBox="{view_box}"', svg_text, count=1)
+ if 'viewBox="' not in svg_text:
+ svg_text = re.sub(r"