from zipfile import ZipFile
from pipeline.download.crime import extract_csvs, parse_archives
def test_parse_archives_reads_monthly_zip_links_only():
    """Check the archives that ``parse_archives`` builds from listing text.

    The fixture mentions ``latest.zip`` plus two monthly entries.  The test
    expects exactly the two monthly archives, in listing order, with URLs
    joined onto the base URL.  It also expects a 32-character hex checksum to
    be exposed as ``md5``, while a checksum carrying a ``-100`` suffix leaves
    ``md5`` as ``None`` and is kept verbatim in ``raw_md5``.
    """
    base_url = "https://data.police.uk/data/archive/"
    html = """
latest.zip
March 2026 (1.6 GB)
Contains data from Apr 2023 to Mar 2026
6dde462489389445877f3988ef3f4f4b
June 2019 (1.6 GB)
Contains data from Jul 2016 to Jun 2019
d6494297b24c1434bdb2504e95261bf8-100
"""

    archives = parse_archives(html, base_url)

    # Only the monthly archives are expected; "latest.zip" must not appear.
    filenames = [entry.filename for entry in archives]
    assert filenames == ["2026-03.zip", "2019-06.zip"]

    # First entry: plain hex checksum, URL resolved against the base.
    first = archives[0]
    assert first.url == "https://data.police.uk/data/archive/2026-03.zip"
    assert first.md5 == "6dde462489389445877f3988ef3f4f4b"

    # Second entry: suffixed checksum is preserved raw but not reported as md5.
    second = archives[1]
    assert second.md5 is None
    assert second.raw_md5 == "d6494297b24c1434bdb2504e95261bf8-100"
def test_extract_csvs_preserves_existing_newer_files(tmp_path):
    """Check that ``extract_csvs`` leaves an already-present CSV untouched.

    A CSV is written into the output tree first, then an archive is built
    containing a member with the same relative path, one new CSV, a member
    whose name climbs out of the output directory, and a non-CSV file.  The
    test expects one extraction, one skip, the pre-existing file's content
    unchanged, the new CSV written, and nothing created beside the output
    directory.
    """
    archive_path = tmp_path / "older.zip"
    dest_dir = tmp_path / "crime"

    # Seed the destination with a file the archive will also contain.
    kept_file = dest_dir / "2023-01" / "2023-01-city-street.csv"
    kept_file.parent.mkdir(parents=True)
    kept_file.write_text("newer\n")

    # (member name, content) pairs, written in this order.
    members = (
        ("2023-01/2023-01-city-street.csv", "older\n"),
        ("2022-12/2022-12-city-street.csv", "old\n"),
        ("../escape.csv", "bad\n"),
        ("notes.txt", "ignored\n"),
    )
    with ZipFile(archive_path, "w") as bundle:
        for member_name, content in members:
            bundle.writestr(member_name, content)

    extracted, skipped = extract_csvs(archive_path, dest_dir)

    assert extracted == 1
    assert skipped == 1

    # The pre-existing file keeps its original content.
    assert kept_file.read_text() == "newer\n"

    # The CSV that was not already present is written out.
    fresh_file = dest_dir / "2022-12" / "2022-12-city-street.csv"
    assert fresh_file.read_text() == "old\n"

    # The "../" member must not land next to the output directory.
    escaped_file = tmp_path / "escape.csv"
    assert not escaped_file.exists()