perfect-postcode/pipeline/download/test_crime.py
Andras Schmelczer f59d01227b
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 15s
CI / Check (push) Failing after 1m58s
Split up
2026-06-12 21:51:37 +01:00

196 lines
7.1 KiB
Python

from zipfile import ZipFile
from pipeline.download.crime import (
CrimeArchive,
extract_csvs,
prepare_archive_dir,
prune_unused_csvs,
select_coverage_archives,
parse_archives,
)
def test_parse_archives_reads_monthly_zip_links_only():
    """Check that ``parse_archives`` returns only the monthly archive links.

    The fixture page contains a ``latest.zip`` link, two monthly archives and
    a neighbourhood download. Only the two monthly archives are expected back,
    in page order. The second archive publishes a checksum with a ``-100``
    suffix: the test expects ``md5`` to be ``None`` for it while ``raw_md5``
    keeps the published text unchanged.
    """
    index_page = """
<p><a href="/data/archive/latest.zip">latest.zip</a></p>
<div class="archive crime">
<div class="download">
<i class="icon-file"></i> <span><a href="/data/archive/2026-03.zip">March 2026</a> (1.6&nbsp;GB)</span>
<p class="contained-range">Contains data from Apr 2023 to Mar 2026</p>
<p class="md5sum">6dde462489389445877f3988ef3f4f4b</p>
</div>
<div class="download">
<i class="icon-file"></i> <span><a href="/data/archive/2019-06.zip">June 2019</a> (1.6&nbsp;GB)</span>
<p class="contained-range">Contains data from Jul 2016 to Jun 2019</p>
<p class="md5sum">d6494297b24c1434bdb2504e95261bf8-100</p>
</div>
</div>
<div class="archive neighbourhood">
<div class="download">
<span><a href="/data/neighbourhood.zip">Neighbourhood crime</a> (2.2 MB)</span>
<small class="md5sum">6b80e2b97d87f6668b7a45953924d191</small>
</div>
</div>
"""
    parsed = parse_archives(index_page, "https://data.police.uk/data/archive/")

    # Neither latest.zip nor the neighbourhood archive may show up here.
    filenames = [entry.filename for entry in parsed]
    assert filenames == ["2026-03.zip", "2019-06.zip"]

    # Relative hrefs are expected to be resolved against the index URL.
    plain_checksum_entry = parsed[0]
    assert (
        plain_checksum_entry.url
        == "https://data.police.uk/data/archive/2026-03.zip"
    )
    assert plain_checksum_entry.md5 == "6dde462489389445877f3988ef3f4f4b"

    # A suffixed checksum yields no usable md5, but the raw text is retained.
    suffixed_checksum_entry = parsed[1]
    assert suffixed_checksum_entry.md5 is None
    assert suffixed_checksum_entry.raw_md5 == "d6494297b24c1434bdb2504e95261bf8-100"
def test_extract_csvs_preserves_existing_newer_files(tmp_path):
    """Check default extraction: keep existing files, take only street CSVs.

    The zip holds a street CSV that already exists on disk, a street CSV for
    a new month, two non-street CSVs, a path-traversal entry and a text file.
    With default arguments the test expects one file extracted, one skipped,
    the pre-existing file untouched, and nothing written outside ``output``.
    """
    out_dir = tmp_path / "crime"
    source_zip = tmp_path / "older.zip"

    # A file that is already on disk before the archive is extracted.
    already_present = out_dir / "2023-01" / "2023-01-city-street.csv"
    already_present.parent.mkdir(parents=True)
    already_present.write_text("newer\n")

    # Insertion order of the mapping is the order entries are written.
    members = {
        "2023-01/2023-01-city-street.csv": "older\n",
        "2022-12/2022-12-city-street.csv": "old\n",
        "2022-12/2022-12-city-outcomes.csv": "unused\n",
        "2022-12/2022-12-city-stop-and-search.csv": "unused\n",
        "../escape.csv": "bad\n",
        "notes.txt": "ignored\n",
    }
    with ZipFile(source_zip, "w") as bundle:
        for member_name, payload in members.items():
            bundle.writestr(member_name, payload)

    extracted_count, skipped_count = extract_csvs(source_zip, out_dir)

    assert extracted_count == 1
    assert skipped_count == 1
    assert already_present.read_text() == "newer\n"

    december = out_dir / "2022-12"
    assert (december / "2022-12-city-street.csv").read_text() == "old\n"
    assert not (december / "2022-12-city-outcomes.csv").exists()
    assert not (december / "2022-12-city-stop-and-search.csv").exists()

    # The "../escape.csv" entry must not land next to the output directory.
    assert not (tmp_path / "escape.csv").exists()
def _archive(month: str, contained_range: str) -> CrimeArchive:
    """Build a ``CrimeArchive`` fixture for the given month.

    Args:
        month: Archive month such as ``"2026-03"``; also used as the label
            and to derive the filename and URL.
        contained_range: The "Contains data from ... to ..." text to attach.

    Returns:
        A ``CrimeArchive`` with a fixed size, no parsed md5 and an empty
        raw md5.
    """
    zip_name = f"{month}.zip"
    download_url = f"https://data.police.uk/data/archive/{month}.zip"
    return CrimeArchive(
        month=month,
        label=month,
        url=download_url,
        filename=zip_name,
        size="1.0 GB",
        contained_range=contained_range,
        md5=None,
        raw_md5="",
    )
def test_select_coverage_archives_skips_overlapping_snapshots():
    """Check that overlapping snapshots are dropped from the selection.

    Of the four snapshots, the test expects the newest one plus the one
    whose range ends in the month right before the newest range starts
    (Mar 2023 against Apr 2023). The two overlapping ones are left out.
    """
    snapshots = (
        ("2026-03", "Contains data from Apr 2023 to Mar 2026"),
        ("2026-02", "Contains data from Mar 2023 to Feb 2026"),
        ("2023-04", "Contains data from May 2020 to Apr 2023"),
        ("2023-03", "Contains data from Apr 2020 to Mar 2023"),
    )
    archives = [_archive(month, window) for month, window in snapshots]

    chosen = select_coverage_archives(archives)

    chosen_months = [item.month for item in chosen]
    assert chosen_months == ["2026-03", "2023-03"]
def test_select_coverage_archives_falls_back_to_overlapping_snapshot():
    """Check that an overlapping snapshot is used when no adjacent one exists.

    The index has no snapshot ending in Mar 2023, which would sit exactly
    next to the newest range. The overlapping 2023-06 snapshot has to be
    picked instead, so the history has no hole.
    """
    snapshots = (
        ("2026-03", "Contains data from Apr 2023 to Mar 2026"),
        ("2023-06", "Contains data from Jul 2020 to Jun 2023"),
    )
    archives = [_archive(month, window) for month, window in snapshots]

    chosen = select_coverage_archives(archives)

    chosen_months = [item.month for item in chosen]
    assert chosen_months == ["2026-03", "2023-06"]
def test_select_coverage_archives_raises_on_publication_gap():
    """Check that a coverage gap raises unless ``allow_gaps=True`` is passed.

    The two snapshots leave Jan 2022 to Mar 2023 uncovered. By default a
    ``RuntimeError`` naming that range is expected; with ``allow_gaps=True``
    both snapshots are returned.
    """
    archives = [
        _archive("2026-03", "Contains data from Apr 2023 to Mar 2026"),
        _archive("2021-12", "Contains data from Jan 2019 to Dec 2021"),
    ]

    # Capture the error text; None means the call did not raise at all.
    failure_text = None
    try:
        select_coverage_archives(archives)
    except RuntimeError as exc:
        failure_text = str(exc)

    if failure_text is None:
        raise AssertionError("Expected RuntimeError for the 2022 hole")
    assert "2022-01 to 2023-03" in failure_text

    tolerant = select_coverage_archives(archives, allow_gaps=True)
    tolerant_months = [item.month for item in tolerant]
    assert tolerant_months == ["2026-03", "2021-12"]
def test_extract_csvs_newest_snapshot_wins_within_a_run(tmp_path):
    """Check overwrite semantics when archives share an extracted-set.

    Archives are processed newest first and share one ``extracted_this_run``
    set. A month written by the newer snapshot must not be replaced by the
    older overlapping snapshot. Files left over from previous runs ARE
    replaced (police.uk revises the trailing 36 months in every release).
    """
    out_dir = tmp_path / "crime"
    newer_zip = tmp_path / "newer.zip"
    older_zip = tmp_path / "older.zip"

    # Leftover from an earlier run; the newer snapshot should replace it.
    leftover = out_dir / "2023-01" / "2023-01-city-street.csv"
    leftover.parent.mkdir(parents=True)
    leftover.write_text("stale revision from a previous run\n")

    with ZipFile(newer_zip, "w") as bundle:
        bundle.writestr("2023-01/2023-01-city-street.csv", "revised\n")
    with ZipFile(older_zip, "w") as bundle:
        bundle.writestr("2023-01/2023-01-city-street.csv", "older snapshot\n")
        bundle.writestr("2022-12/2022-12-city-street.csv", "unique month\n")

    # One set is shared by both calls, newest archive first.
    written_so_far: set = set()
    for snapshot_zip in (newer_zip, older_zip):
        extract_csvs(
            snapshot_zip,
            out_dir,
            overwrite=True,
            extracted_this_run=written_so_far,
        )

    assert leftover.read_text() == "revised\n"
    only_in_older = out_dir / "2022-12" / "2022-12-city-street.csv"
    assert only_in_older.read_text() == ("unique month\n")
def test_prepare_archive_dir_removes_retained_zip_cache_by_default(tmp_path):
    """Check ``prepare_archive_dir`` with ``keep_archives=False``.

    Starting from a populated ``_archives`` cache and a ``_download_tmp``
    directory holding a partial download, the test expects the returned
    directory to be ``_download_tmp``, present and empty, and the
    ``_archives`` directory to be gone.
    """
    out_dir = tmp_path / "crime"
    kept_cache = out_dir / "_archives"
    scratch = out_dir / "_download_tmp"

    # parents=True also creates out_dir, so the plain mkdir() below works.
    kept_cache.mkdir(parents=True)
    scratch.mkdir()
    (kept_cache / "old.zip").write_text("zip\n")
    (scratch / "old.zip.part").write_text("part\n")

    result_dir = prepare_archive_dir(out_dir, keep_archives=False)

    assert result_dir == scratch
    assert result_dir.exists()
    assert not list(result_dir.iterdir())
    assert not kept_cache.exists()
def test_prune_unused_csvs_removes_non_street_csvs(tmp_path):
    """Check that ``prune_unused_csvs`` deletes only the non-street CSVs.

    A month directory holds a street, an outcomes and a stop-and-search CSV.
    The test expects a removed count of 2, with the street file kept.
    """
    out_dir = tmp_path / "crime"
    month_dir = out_dir / "2024-01"
    month_dir.mkdir(parents=True)

    street_csv = month_dir / "2024-01-city-street.csv"
    outcomes_csv = month_dir / "2024-01-city-outcomes.csv"
    stop_search_csv = month_dir / "2024-01-city-stop-and-search.csv"

    # Insertion order of the mapping is the order the files are written.
    fixture_files = {
        street_csv: "street\n",
        outcomes_csv: "outcomes\n",
        stop_search_csv: "stop\n",
    }
    for csv_path, content in fixture_files.items():
        csv_path.write_text(content)

    # The second element of the result is not checked by this test.
    removed_count, _unchecked = prune_unused_csvs(out_dir)

    assert removed_count == 2
    assert street_csv.exists()
    assert not outcomes_csv.exists()
    assert not stop_search_csv.exists()