Fix crime & add actual listings
This commit is contained in:
parent
017902b8e6
commit
ebe7bbb51d
34 changed files with 2014 additions and 172754 deletions
|
|
@ -1,6 +1,13 @@
|
|||
from zipfile import ZipFile
|
||||
|
||||
from pipeline.download.crime import extract_csvs, parse_archives
|
||||
from pipeline.download.crime import (
|
||||
CrimeArchive,
|
||||
extract_csvs,
|
||||
prepare_archive_dir,
|
||||
prune_unused_csvs,
|
||||
select_coverage_archives,
|
||||
parse_archives,
|
||||
)
|
||||
|
||||
|
||||
def test_parse_archives_reads_monthly_zip_links_only():
|
||||
|
|
@ -48,6 +55,8 @@ def test_extract_csvs_preserves_existing_newer_files(tmp_path):
|
|||
with ZipFile(zip_path, "w") as archive:
|
||||
archive.writestr("2023-01/2023-01-city-street.csv", "older\n")
|
||||
archive.writestr("2022-12/2022-12-city-street.csv", "old\n")
|
||||
archive.writestr("2022-12/2022-12-city-outcomes.csv", "unused\n")
|
||||
archive.writestr("2022-12/2022-12-city-stop-and-search.csv", "unused\n")
|
||||
archive.writestr("../escape.csv", "bad\n")
|
||||
archive.writestr("notes.txt", "ignored\n")
|
||||
|
||||
|
|
@ -57,4 +66,68 @@ def test_extract_csvs_preserves_existing_newer_files(tmp_path):
|
|||
assert skipped == 1
|
||||
assert existing.read_text() == "newer\n"
|
||||
assert (output / "2022-12" / "2022-12-city-street.csv").read_text() == "old\n"
|
||||
assert not (output / "2022-12" / "2022-12-city-outcomes.csv").exists()
|
||||
assert not (output / "2022-12" / "2022-12-city-stop-and-search.csv").exists()
|
||||
assert not (tmp_path / "escape.csv").exists()
|
||||
|
||||
|
||||
def _archive(month: str, contained_range: str) -> CrimeArchive:
    """Build a ``CrimeArchive`` test fixture for *month*.

    ``month`` is used for both the ``month`` and ``label`` fields and is
    interpolated into the URL and filename; ``contained_range`` is passed
    through unchanged.  The remaining fields are fixed placeholder values.
    """
    fields = {
        "month": month,
        "label": month,
        "url": f"https://data.police.uk/data/archive/{month}.zip",
        "filename": f"{month}.zip",
        "size": "1.0 GB",
        "contained_range": contained_range,
        "md5": None,
        "raw_md5": "",
    }
    return CrimeArchive(**fields)
|
||||
|
||||
|
||||
def test_select_coverage_archives_skips_overlapping_snapshots():
    """Of four overlapping snapshots, only 2026-03 and 2023-03 are selected."""
    # (month, contained_range) pairs, in the order they are handed to the selector.
    snapshots = (
        ("2026-03", "Contains data from Apr 2023 to Mar 2026"),
        ("2026-02", "Contains data from Mar 2023 to Feb 2026"),
        ("2023-04", "Contains data from May 2020 to Apr 2023"),
        ("2023-03", "Contains data from Apr 2020 to Mar 2023"),
    )
    archives = [_archive(month, span) for month, span in snapshots]

    chosen_months = [item.month for item in select_coverage_archives(archives)]

    assert chosen_months == ["2026-03", "2023-03"]
|
||||
|
||||
|
||||
def test_prepare_archive_dir_removes_retained_zip_cache_by_default(tmp_path):
    """With ``keep_archives=False`` the temp dir is returned empty and ``_archives`` is gone."""
    crime_root = tmp_path / "crime"
    zip_cache = crime_root / "_archives"
    scratch = crime_root / "_download_tmp"
    zip_cache.mkdir(parents=True)
    scratch.mkdir()

    # Seed both directories with leftovers from a pretend earlier run.
    leftovers = (
        (zip_cache / "old.zip", "zip\n"),
        (scratch / "old.zip.part", "part\n"),
    )
    for stale_path, payload in leftovers:
        stale_path.write_text(payload)

    result = prepare_archive_dir(crime_root, keep_archives=False)

    assert result == scratch
    assert result.exists()
    assert list(result.iterdir()) == []
    assert not zip_cache.exists()
|
||||
|
||||
|
||||
def test_prune_unused_csvs_removes_non_street_csvs(tmp_path):
    """Pruning keeps the street CSV and removes the outcomes and stop-and-search CSVs."""
    crime_root = tmp_path / "crime"
    january = crime_root / "2024-01"
    january.mkdir(parents=True)

    street_csv = january / "2024-01-city-street.csv"
    outcomes_csv = january / "2024-01-city-outcomes.csv"
    stop_search_csv = january / "2024-01-city-stop-and-search.csv"
    seeded = (
        (street_csv, "street\n"),
        (outcomes_csv, "outcomes\n"),
        (stop_search_csv, "stop\n"),
    )
    for csv_path, text in seeded:
        csv_path.write_text(text)

    # Only the first element of the returned pair is checked here.
    removed_count, _ = prune_unused_csvs(crime_root)

    assert removed_count == 2
    assert street_csv.exists()
    assert not outcomes_csv.exists()
    assert not stop_search_csv.exists()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue