Rerun data pipelines

2026-05-10 14:49:53 +01:00 · 2026-05-10 14:49:53 +01:00 · fc10381692
commit fc10381692
parent 4c95815dc8
27 changed files with 2143 additions and 215 deletions
--- a/pipeline/transform/crime.py
+++ b/pipeline/transform/crime.py
@@ -1,12 +1,32 @@
 import argparse
+import re
 from pathlib import Path

 import polars as pl

+STREET_CRIME_CSV_RE = re.compile(r"^\d{4}-\d{2}-.+-street\.csv$")  # file name shape: 4 digits, "-", 2 digits, "-", any text, "-street.csv"
+
+
+def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:  # returns (matching CSV paths, count of CSVs that did not match)
+    csvs = sorted(crime_dir.rglob("*.csv"))  # every *.csv anywhere under crime_dir, in sorted path order
+    street_csvs = [path for path in csvs if STREET_CRIME_CSV_RE.fullmatch(path.name)]  # filter on the file name only, not the full path
+    return street_csvs, len(csvs) - len(street_csvs)  # second item is how many CSVs were filtered out
+

 def transform_crime(crime_dir: Path, output_path: Path) -> None:
-    csvs = sorted(crime_dir.rglob("*.csv"))
-    print(f"Found {len(csvs)} CSV files across {len(list(crime_dir.iterdir()))} months")
+    csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
+    if not csvs:
+        raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}")
+
+    month_count = len({path.parent.name for path in csvs})
+    print(
+        f"Found {len(csvs)} street crime CSV files across {month_count} months"
+        + (
+            f" (ignored {ignored_csv_count} non-street CSVs)"
+            if ignored_csv_count
+            else ""
+        )
+    )

    df = pl.scan_csv(
        csvs,