Rerun data pipelines
This commit is contained in:
parent
4c95815dc8
commit
fc10381692
27 changed files with 2143 additions and 215 deletions
|
|
@ -1,12 +1,32 @@
|
|||
import argparse
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
|
||||
# File-name pattern for street crime CSVs: four digits, "-", two digits, "-",
# at least one further character, then the literal suffix "-street.csv".
# NOTE(review): the leading digits are presumably a YYYY-MM month prefix — confirm.
STREET_CRIME_CSV_RE = re.compile(r"^\d{4}-\d{2}-.+-street\.csv$")
|
||||
|
||||
|
||||
def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
    """Locate street crime CSV files beneath ``crime_dir``.

    Every ``*.csv`` path under ``crime_dir`` (searched recursively) is
    collected in sorted order; those whose file name fully matches
    ``STREET_CRIME_CSV_RE`` are kept.

    Args:
        crime_dir: Directory to search recursively.

    Returns:
        A ``(street_csvs, ignored_count)`` pair: the sorted matching paths,
        and the number of ``*.csv`` paths whose name did not match.
    """
    csvs = sorted(crime_dir.rglob("*.csv"))
    # Match on the bare file name only; parent directory names are irrelevant.
    street_csvs = [path for path in csvs if STREET_CRIME_CSV_RE.fullmatch(path.name)]
    return street_csvs, len(csvs) - len(street_csvs)
|
||||
|
||||
|
||||
def transform_crime(crime_dir: Path, output_path: Path) -> None:
|
||||
csvs = sorted(crime_dir.rglob("*.csv"))
|
||||
print(f"Found {len(csvs)} CSV files across {len(list(crime_dir.iterdir()))} months")
|
||||
csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
|
||||
if not csvs:
|
||||
raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}")
|
||||
|
||||
month_count = len({path.parent.name for path in csvs})
|
||||
print(
|
||||
f"Found {len(csvs)} street crime CSV files across {month_count} months"
|
||||
+ (
|
||||
f" (ignored {ignored_csv_count} non-street CSVs)"
|
||||
if ignored_csv_count
|
||||
else ""
|
||||
)
|
||||
)
|
||||
|
||||
df = pl.scan_csv(
|
||||
csvs,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue