LGTM
This commit is contained in:
parent
a8165249a4
commit
a4103b0896
64 changed files with 5376 additions and 3832 deletions
|
|
@@ -5,6 +5,7 @@ from pathlib import Path
|
|||
import polars as pl
|
||||
|
||||
STREET_CRIME_CSV_RE = re.compile(r"^\d{4}-\d{2}-.+-street\.csv$")
|
||||
MONTH_RE = r"^\d{4}-\d{2}$"
|
||||
|
||||
|
||||
def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
|
||||
|
|
@@ -37,16 +38,45 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
|
|||
},
|
||||
).select("LSOA code", "Crime type", "Month")
|
||||
|
||||
# Extract year, count crimes per LSOA / year / crime type
|
||||
valid_month_expr = pl.col("Month").str.contains(MONTH_RE)
|
||||
valid_months = (
|
||||
df.filter(valid_month_expr)
|
||||
.select("Month")
|
||||
.unique()
|
||||
.collect(engine="streaming")["Month"]
|
||||
.sort()
|
||||
.to_list()
|
||||
)
|
||||
if not valid_months:
|
||||
raise ValueError(f"No valid crime months found in {crime_dir}")
|
||||
|
||||
valid_month_count = len(valid_months)
|
||||
print(
|
||||
f"Using {valid_month_count} valid data months "
|
||||
f"({valid_months[0]} to {valid_months[-1]})"
|
||||
)
|
||||
|
||||
# Count monthly incidents, then annualise over every valid month in the dataset.
|
||||
yearly_counts = (
|
||||
df.filter(pl.col("LSOA code").is_not_null() & (pl.col("LSOA code") != ""))
|
||||
.with_columns(pl.col("Month").str.slice(0, 4).alias("year"))
|
||||
.group_by("LSOA code", "year", "Crime type")
|
||||
df.filter(
|
||||
valid_month_expr
|
||||
& pl.col("LSOA code").is_not_null()
|
||||
& (pl.col("LSOA code") != "")
|
||||
& pl.col("Crime type").is_not_null()
|
||||
& (pl.col("Crime type") != "")
|
||||
)
|
||||
.group_by("LSOA code", "Month", "Crime type")
|
||||
.agg(pl.len().alias("count"))
|
||||
.group_by("LSOA code", "Crime type")
|
||||
.agg(pl.col("count").mean().round(1).alias("yearly_avg"))
|
||||
.agg(
|
||||
(pl.col("count").sum() / pl.lit(valid_month_count) * 12)
|
||||
.round(1)
|
||||
.alias("yearly_avg")
|
||||
)
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
if yearly_counts.is_empty():
|
||||
raise ValueError(f"No valid crime rows found in {crime_dir}")
|
||||
|
||||
print(f"Crime types: {sorted(yearly_counts['Crime type'].unique().to_list())}")
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue