perfect-postcode/pipeline/download/crime.py

393 lines
12 KiB
Python

"""Download police.uk crime archive ZIPs.
The archive page lists rolling monthly snapshots. Newer snapshots overlap older
ones, so extraction keeps files already written by newer archives.
"""
from __future__ import annotations
import argparse
import hashlib
import html
import json
import re
import shutil
import sys
import zipfile
from dataclasses import asdict, dataclass
from datetime import UTC, datetime
from pathlib import Path, PurePosixPath
from urllib.parse import urljoin
import httpx
from tqdm import tqdm
# Index page that lists every downloadable monthly snapshot ZIP.
ARCHIVE_URL = "https://data.police.uk/data/archive/"
# Matches one archive entry in the index HTML. Named groups capture the ZIP
# href, its YYYY-MM month, the link label, the size text in parentheses, the
# "contained-range" paragraph and the raw contents of the "md5sum" paragraph.
# re.DOTALL lets the lazy ".*?" parts span line breaks inside an entry.
ARCHIVE_LINK_RE = re.compile(
    r'<div class="download">\s*.*?'
    r'<a href="(?P<href>/data/archive/(?P<month>\d{4}-\d{2})\.zip)">'
    r"(?P<label>[^<]+)</a>\s*\((?P<size>[^)]+)\)</span>\s*"
    r'<p class="contained-range">\s*(?P<contained_range>.*?)\s*</p>\s*'
    r'<p class="md5sum">(?P<md5>.*?)</p>',
    re.DOTALL,
)
# A well-formed MD5 digest: exactly 32 hexadecimal characters.
VALID_MD5_RE = re.compile(r"^[0-9a-fA-F]{32}$")
# Shape check for a YYYY-MM month string: four digits, a hyphen, two digits.
# The pattern alone does not restrict the month to 01-12.
MONTH_RE = re.compile(r"^\d{4}-\d{2}$")
@dataclass(frozen=True)
class CrimeArchive:
    """One monthly archive ZIP as listed on the archive index page."""

    # Month taken from the ZIP href, in YYYY-MM form.
    month: str
    # Text of the download link, with markup and extra whitespace removed.
    label: str
    # Download URL: the href resolved against the index page URL.
    url: str
    # Final path component of the href; used as the local ZIP file name.
    filename: str
    # Size text shown in parentheses after the link, kept as displayed.
    size: str
    # Cleaned text of the "contained-range" paragraph for this entry.
    contained_range: str
    # Lower-cased 32-hex-digit checksum, or None when the scraped value is malformed.
    md5: str | None
    # Cleaned, lower-cased checksum text as scraped (may be invalid or empty).
    raw_md5: str
def _clean_text(value: str) -> str:
    """Return *value* as plain text.

    Tags are replaced by spaces, HTML entities are decoded, and every run of
    whitespace is collapsed to a single space with the ends trimmed.
    """
    without_tags = re.sub(r"<[^>]+>", " ", value)
    decoded = html.unescape(without_tags)
    single_spaced = re.sub(r"\s+", " ", decoded)
    return single_spaced.strip()
def parse_archives(page_html: str, base_url: str = ARCHIVE_URL) -> list[CrimeArchive]:
    """Extract every monthly archive entry from the index page HTML.

    Entries are returned in the order they appear on the page. A checksum that
    is not a 32-character hex string is kept in ``raw_md5`` while ``md5`` is
    set to None.
    """

    def to_archive(entry: re.Match[str]) -> CrimeArchive:
        # Text fields that only need markup stripped and whitespace collapsed.
        text_fields = {
            name: _clean_text(entry.group(name))
            for name in ("label", "size", "contained_range")
        }
        checksum_text = _clean_text(entry.group("md5")).lower()
        link = html.unescape(entry.group("href"))
        return CrimeArchive(
            month=entry.group("month"),
            url=urljoin(base_url, link),
            filename=Path(link).name,
            md5=checksum_text if VALID_MD5_RE.fullmatch(checksum_text) else None,
            raw_md5=checksum_text,
            **text_fields,
        )

    return [to_archive(entry) for entry in ARCHIVE_LINK_RE.finditer(page_html)]
def fetch_archives(archive_url: str = ARCHIVE_URL) -> list[CrimeArchive]:
    """Download the archive index page and return the archives it lists.

    Raises:
        httpx.HTTPStatusError: The index request returned an error status.
        RuntimeError: The page was fetched but no archive entries were found.
    """
    request_timeout = httpx.Timeout(30.0, read=60.0)
    request_headers = {"User-Agent": "perfect-postcode-data-pipeline/1.0"}
    with httpx.Client(
        follow_redirects=True,
        timeout=request_timeout,
        headers=request_headers,
    ) as client:
        response = client.get(archive_url)
        response.raise_for_status()
        page_html = response.text

    found = parse_archives(page_html, archive_url)
    if found:
        return found
    raise RuntimeError(f"No monthly archive ZIPs found at {archive_url}")
def filter_archives(
    archives: list[CrimeArchive],
    *,
    from_month: str | None = None,
    to_month: str | None = None,
    limit: int | None = None,
) -> list[CrimeArchive]:
    """Filter archives by inclusive YYYY-MM bounds while preserving page order.

    Args:
        archives: Archives in the order they were listed.
        from_month: Keep only archives whose month is at or after this value.
        to_month: Keep only archives whose month is at or before this value.
        limit: Keep at most this many of the matching archives. Zero or a
            negative value keeps none.

    Returns:
        A new list with the matching archives, in their original order.
    """
    # YYYY-MM strings sort chronologically, so plain string comparison works.
    selected = [
        archive
        for archive in archives
        if (from_month is None or archive.month >= from_month)
        and (to_month is None or archive.month <= to_month)
    ]
    if limit is None:
        return selected
    # Clamp at zero: a negative slice bound would count from the end of the
    # list and return more than "at most limit" items.
    return selected[: max(limit, 0)]
def file_md5(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is read in 1 MiB chunks so large ZIPs are never held in memory.
    """
    # MD5 is only an integrity check against the published checksum here.
    # Declaring that keeps the call working on FIPS-restricted Python builds,
    # where a plain hashlib.md5() raises.
    digest = hashlib.md5(usedforsecurity=False)
    with path.open("rb") as file:
        while chunk := file.read(1024 * 1024):
            digest.update(chunk)
    return digest.hexdigest()
def download_archive(
    archive: CrimeArchive,
    archive_dir: Path,
    *,
    verify: bool,
    force: bool,
    timeout: float,
) -> Path:
    """Download one archive ZIP, resuming an existing .part file when possible.

    Args:
        archive: Index entry describing the ZIP to fetch.
        archive_dir: Directory that receives the ZIP and its ``.part`` file.
        verify: Check the file against ``archive.md5`` when a checksum is known.
        force: Delete any existing ZIP and partial download before starting.
        timeout: Read timeout in seconds for the streamed download.

    Returns:
        Path of the completed ZIP inside ``archive_dir``.

    Raises:
        RuntimeError: The downloaded file fails MD5 verification.
        httpx.HTTPStatusError: The server answered with an error status.
    """
    dest = archive_dir / archive.filename
    partial = dest.with_suffix(dest.suffix + ".part")
    if force:
        dest.unlink(missing_ok=True)
        partial.unlink(missing_ok=True)

    if dest.exists():
        if verify and archive.md5 is not None:
            actual_md5 = file_md5(dest)
            if actual_md5 == archive.md5:
                print(f"{archive.filename}: already downloaded")
                return dest
            print(
                f"{archive.filename}: checksum mismatch, downloading again",
                file=sys.stderr,
            )
            dest.unlink()
            partial.unlink(missing_ok=True)
        else:
            print(f"{archive.filename}: already downloaded")
            return dest

    resume_from = partial.stat().st_size if partial.exists() else 0
    while True:
        headers = {"Range": f"bytes={resume_from}-"} if resume_from else {}
        with httpx.stream(
            "GET",
            archive.url,
            headers=headers,
            follow_redirects=True,
            timeout=httpx.Timeout(30.0, read=timeout),
        ) as response:
            if response.status_code == 416 and resume_from:
                # The server cannot serve bytes from our offset. That happens
                # when the .part file is already complete (a previous run
                # stopped just before the final rename) or is stale. Without
                # this branch every re-run would fail with the same 416.
                reported_size = (
                    response.headers.get("content-range", "").rpartition("/")[2].strip()
                )
                if reported_size.isdecimal() and int(reported_size) == resume_from:
                    # "bytes */<size>" equals what we have: nothing left to fetch.
                    break
                # Stale partial: drop it and download again from the start.
                partial.unlink(missing_ok=True)
                resume_from = 0
                continue

            if response.status_code == 206 and resume_from:
                mode = "ab"
                initial = resume_from
            else:
                # A plain 200 means the server ignored the Range header, so
                # the partial file is overwritten from the beginning.
                response.raise_for_status()
                mode = "wb"
                initial = 0

            total_header = int(response.headers.get("content-length", 0))
            total = initial + total_header if total_header else None
            with (
                partial.open(mode) as output,
                tqdm(
                    total=total,
                    initial=initial,
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    desc=archive.filename,
                ) as progress,
            ):
                for chunk in response.iter_bytes(chunk_size=1024 * 1024):
                    output.write(chunk)
                    progress.update(len(chunk))
        break

    # Only a fully written download is promoted to the final file name.
    partial.replace(dest)

    if verify and archive.md5 is not None:
        actual_md5 = file_md5(dest)
        if actual_md5 != archive.md5:
            # Remove the bad file so the next run starts from scratch.
            dest.unlink(missing_ok=True)
            raise RuntimeError(
                f"{archive.filename}: MD5 mismatch: expected {archive.md5}, got {actual_md5}"
            )
    return dest
def _safe_csv_members(
    archive: zipfile.ZipFile,
) -> list[tuple[zipfile.ZipInfo, PurePosixPath]]:
    """List the CSV members of *archive* that are safe to extract.

    Directory entries, absolute paths, paths containing ``..`` and members
    whose suffix is not ``.csv`` (case-insensitive) are left out. Each result
    pairs the ZipInfo with its member name as a relative POSIX path.
    """

    def is_safe_csv(info: zipfile.ZipInfo, rel_path: PurePosixPath) -> bool:
        if info.is_dir() or rel_path.is_absolute():
            return False
        if ".." in rel_path.parts:
            return False
        return rel_path.suffix.lower() == ".csv"

    candidates = (
        (info, PurePosixPath(info.filename)) for info in archive.infolist()
    )
    return [
        (info, rel_path)
        for info, rel_path in candidates
        if is_safe_csv(info, rel_path)
    ]
def extract_csvs(
    zip_path: Path,
    output_dir: Path,
    *,
    overwrite: bool = False,
) -> tuple[int, int]:
    """Extract CSVs from one ZIP. Returns (extracted, skipped).

    Args:
        zip_path: ZIP file to read.
        output_dir: Root directory; members keep their relative paths below it.
        overwrite: Replace CSVs that already exist instead of skipping them.

    Returns:
        The number of CSVs written and the number skipped because the
        destination already existed.
    """
    extracted = 0
    skipped = 0
    with zipfile.ZipFile(zip_path) as archive:
        for info, rel_path in _safe_csv_members(archive):
            dest = output_dir.joinpath(*rel_path.parts)
            if dest.exists() and not overwrite:
                skipped += 1
                continue
            dest.parent.mkdir(parents=True, exist_ok=True)
            # Write to a sibling temp file and rename it into place. Existing
            # files are skipped on later runs, so a half-written CSV left by
            # an interrupted run would otherwise be kept forever.
            temp_dest = dest.with_name(dest.name + ".part")
            try:
                with archive.open(info) as source, temp_dest.open("wb") as target:
                    shutil.copyfileobj(source, target)
                temp_dest.replace(dest)
            except BaseException:
                # Also covers Ctrl-C: never leave the temp file behind.
                temp_dest.unlink(missing_ok=True)
                raise
            extracted += 1
    return extracted, skipped
def write_manifest(
output_dir: Path, archive_url: str, archives: list[CrimeArchive]
) -> None:
manifest = {
"source": archive_url,
"fetched_at": datetime.now(UTC).isoformat(),
"archives": [asdict(archive) for archive in archives],
}
path = output_dir / "archive_manifest.json"
path.write_text(json.dumps(manifest, indent=2) + "\n")
def _month_arg(value: str) -> str:
    """Validate a ``YYYY-MM`` command-line value and return it unchanged.

    Raises:
        argparse.ArgumentTypeError: The value is not four digits, a hyphen and
            two digits, or the month is outside 01-12.
    """
    # The pattern checks only the shape. Without the range check, values such
    # as "2024-13" or "2024-00" pass and silently filter against no real month.
    if not MONTH_RE.fullmatch(value) or not 1 <= int(value[5:]) <= 12:
        raise argparse.ArgumentTypeError("month must be in YYYY-MM format")
    return value
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser with every option the downloader accepts."""
    parser = argparse.ArgumentParser(
        description="Download all monthly police.uk crime archive ZIPs"
    )
    add = parser.add_argument
    add(
        "--output",
        type=Path,
        required=True,
        help="Directory for extracted CSVs; ZIPs are kept under _archives/",
    )
    add(
        "--archive-url",
        default=ARCHIVE_URL,
        help=f"Archive index URL (default: {ARCHIVE_URL})",
    )
    add(
        "--from-month",
        type=_month_arg,
        help="Only download archives from this YYYY-MM onwards",
    )
    add(
        "--to-month",
        type=_month_arg,
        help="Only download archives up to this YYYY-MM",
    )
    add(
        "--limit",
        type=int,
        help="Download at most this many archives after filtering",
    )
    add(
        "--list",
        action="store_true",
        help="Print the archive URLs that would be downloaded and exit",
    )
    add(
        "--no-extract",
        dest="extract",
        action="store_false",
        help="Download ZIPs only; do not extract CSVs",
    )
    add(
        "--overwrite-extracted",
        action="store_true",
        help="Overwrite CSVs when extracting overlapping archive snapshots",
    )
    add(
        "--no-verify",
        dest="verify",
        action="store_false",
        help="Skip MD5 verification",
    )
    add(
        "--force",
        action="store_true",
        help="Redownload archives even if ZIP files already exist",
    )
    add(
        "--timeout",
        type=float,
        default=600.0,
        help="Per-read timeout in seconds for large ZIP downloads",
    )
    return parser


def main() -> None:
    """Command-line entry point: list, download and optionally extract archives.

    Fetches the archive index, applies the month and limit filters, then either
    prints the selection (``--list``) or writes a manifest, downloads each ZIP
    into ``<output>/_archives`` and extracts its CSVs into ``<output>``.
    """
    args = _build_parser().parse_args()

    print("Fetching police.uk archive index...")
    listed = fetch_archives(args.archive_url)
    archives = filter_archives(
        listed,
        from_month=args.from_month,
        to_month=args.to_month,
        limit=args.limit,
    )
    if not archives:
        raise SystemExit("No archives matched the requested filters")

    # Archives whose page checksum was present but not a valid MD5 string.
    bad_md5 = [
        archive.filename for archive in archives if archive.raw_md5 and not archive.md5
    ]
    if bad_md5:
        shown = ", ".join(bad_md5[:5])
        suffix = "..." if len(bad_md5) > 5 else ""
        print(
            "Warning: ignoring malformed MD5 values for " + shown + suffix,
            file=sys.stderr,
        )

    archive_count = len(archives)
    print(f"Found {archive_count} monthly archive ZIPs")

    if args.list:
        for archive in archives:
            print(f"{archive.month}\t{archive.url}\t{archive.raw_md5}")
        return

    args.output.mkdir(parents=True, exist_ok=True)
    archive_dir = args.output / "_archives"
    archive_dir.mkdir(parents=True, exist_ok=True)
    write_manifest(args.output, args.archive_url, archives)

    total_extracted = 0
    total_skipped = 0
    for position, archive in enumerate(archives, start=1):
        print(f"[{position}/{archive_count}] {archive.label} ({archive.size})")
        zip_path = download_archive(
            archive,
            archive_dir,
            verify=args.verify,
            force=args.force,
            timeout=args.timeout,
        )
        if not args.extract:
            continue
        extracted, skipped = extract_csvs(
            zip_path,
            args.output,
            overwrite=args.overwrite_extracted,
        )
        total_extracted += extracted
        total_skipped += skipped
        skipped_note = f", skipped {skipped} existing CSVs" if skipped else ""
        print(f"{archive.filename}: extracted {extracted} CSVs" + skipped_note)

    if not args.extract:
        print(f"Done. ZIPs saved in {archive_dir}.")
        return
    total_skipped_note = (
        f" and skipped {total_skipped} existing CSVs" if total_skipped else ""
    )
    print(
        f"Done. ZIPs saved in {archive_dir}; extracted {total_extracted} CSVs"
        + total_skipped_note
        + "."
    )
# Run the downloader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()