scraping and data
This commit is contained in:
parent
d98819b569
commit
8688b7475e
43 changed files with 4920 additions and 531 deletions
147
pipeline/transform/test_crime_spatial.py
Normal file
147
pipeline/transform/test_crime_spatial.py
Normal file
|
|
@ -0,0 +1,147 @@
|
|||
import json
|
||||
|
||||
import polars as pl
|
||||
from pyproj import Transformer
|
||||
|
||||
from pipeline.transform.crime_spatial import transform_crime_spatial
|
||||
|
||||
# Shared projector from British National Grid (EPSG:27700) to WGS84
# (EPSG:4326). always_xy=True fixes the axis order to (x, y) on input and
# (lon, lat) on output, which is how the helpers below unpack the result.
_TO_WGS84 = Transformer.from_crs(
    crs_from="EPSG:27700",
    crs_to="EPSG:4326",
    always_xy=True,
)
|
||||
|
||||
# Header line of the synthetic street-level crime CSVs written by these tests.
_CSV_HEADER = ",".join(
    (
        "Crime ID",
        "Month",
        "Reported by",
        "Falls within",
        "Longitude",
        "Latitude",
        "Location",
        "LSOA code",
        "LSOA name",
        "Crime type",
        "Last outcome category",
        "Context",
    )
)
|
||||
|
||||
|
||||
def _bng_to_wgs84(x: float, y: float) -> tuple[float, float]:
    """Project a British National Grid point to WGS84.

    Args:
        x: Easting in the EPSG:27700 grid.
        y: Northing in the EPSG:27700 grid.

    Returns:
        The projected point as ``(lon, lat)``.
    """
    wgs84_point = _TO_WGS84.transform(x, y)
    longitude, latitude = wgs84_point
    return longitude, latitude
|
||||
|
||||
|
||||
def _square_feature(postcode: str, x0: float, y0: float, x1: float, y1: float) -> dict:
    """Build a GeoJSON Feature for an axis-aligned rectangle around a postcode.

    The corners are given in British National Grid coordinates and are
    projected to WGS84 before being stored in the geometry.

    Args:
        postcode: Postcode stored under ``postcodes``; the same value with
            spaces removed is stored under ``mapit_code``.
        x0: Easting of the first corner.
        y0: Northing of the first corner.
        x1: Easting of the opposite corner.
        y1: Northing of the opposite corner.

    Returns:
        A GeoJSON ``Feature`` dict with a single-ring ``Polygon`` geometry.
    """
    # Closed ring: the first corner is repeated at the end.
    corners_bng = ((x0, y0), (x1, y0), (x1, y1), (x0, y1), (x0, y0))
    exterior = []
    for easting, northing in corners_bng:
        exterior.append(list(_bng_to_wgs84(easting, northing)))

    properties = {"postcodes": postcode, "mapit_code": postcode.replace(" ", "")}
    geometry = {"type": "Polygon", "coordinates": [exterior]}
    return {"type": "Feature", "properties": properties, "geometry": geometry}
|
||||
|
||||
|
||||
def _write_boundaries(units_dir, features_by_district: dict[str, list[dict]]) -> None:
    """Write one ``<district>.geojson`` FeatureCollection file per district.

    Args:
        units_dir: Directory (a ``pathlib.Path``-style object) that receives
            the files. It is created, including parents, when missing.
        features_by_district: Maps a district name to the GeoJSON features
            that go into that district's file.
    """
    # exist_ok=True lets a test pre-create the directory or call this helper
    # more than once without tripping FileExistsError.
    units_dir.mkdir(parents=True, exist_ok=True)
    for district, features in features_by_district.items():
        collection = {"type": "FeatureCollection", "features": features}
        target = units_dir / f"{district}.geojson"
        # Explicit encoding keeps the fixture independent of the locale.
        target.write_text(json.dumps(collection), encoding="utf-8")
|
||||
|
||||
|
||||
def _crime_row(month: str, x, y, crime_type: str) -> str:
    """Format one CSV data row for a synthetic crime record.

    Args:
        month: Value for the ``Month`` column.
        x: British National Grid easting, or ``None`` for no location.
        y: British National Grid northing, or ``None`` for no location.
        crime_type: Value for the ``Crime type`` column.

    Returns:
        A comma-separated row matching ``_CSV_HEADER``. Longitude and
        latitude are left blank when either coordinate is ``None``.
    """
    has_location = x is not None and y is not None
    lon, lat = _bng_to_wgs84(x, y) if has_location else ("", "")
    return f",{month},F,F,{lon},{lat},On or near X,E01000001,L,{crime_type},U,"
|
||||
|
||||
|
||||
def _write_month(crime_dir, month: str, rows: list[str]) -> None:
    """Write ``<month>/<month>-test-force-street.csv`` under ``crime_dir``.

    Args:
        crime_dir: Root directory (a ``pathlib.Path``-style object) holding
            one sub-directory per month.
        month: Month label used for both the sub-directory and the file name.
        rows: Pre-formatted CSV data rows, written after the header line.
    """
    month_dir = crime_dir / month
    # exist_ok=True keeps the helper usable when the month directory is
    # already present instead of failing with FileExistsError.
    month_dir.mkdir(parents=True, exist_ok=True)
    body = "\n".join([_CSV_HEADER, *rows]) + "\n"
    csv_path = month_dir / f"{month}-test-force-street.csv"
    # Explicit encoding keeps the fixture independent of the locale.
    csv_path.write_text(body, encoding="utf-8")
|
||||
|
||||
|
||||
def test_buffer_overlap_counts_for_each_postcode(tmp_path):
    """A crime inside several postcode buffers is counted for each of them."""
    boundaries_dir = tmp_path / "units"
    # A and B sit 70m apart; their +50m buffers overlap in x in [1030, 1060].
    postcode_a = _square_feature("AB1 1AA", 1000, 1000, 1010, 1010)
    postcode_b = _square_feature("AB1 1AB", 1080, 1000, 1090, 1010)
    postcode_c = _square_feature("AB1 1AC", 5000, 5000, 5010, 5010)
    _write_boundaries(boundaries_dir, {"AB1": [postcode_a, postcode_b, postcode_c]})

    crime_dir = tmp_path / "crime"
    month = "2024-01"
    crime_rows = [
        # In the overlap: 35m east of A, 35m west of B -> counts for both.
        _crime_row(month, 1045, 1005, "Burglary"),
        # 49m east of C's edge -> inside C's buffer.
        _crime_row(month, 5059, 5005, "Robbery"),
        # 51m east of C's edge -> outside every buffer.
        _crime_row(month, 5061, 5005, "Robbery"),
        # No coordinate -> dropped entirely.
        _crime_row(month, None, None, "Anti-social behaviour"),
    ]
    _write_month(crime_dir, month, crime_rows)

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime_dir, boundaries_dir, output, by_year)

    by_postcode = {}
    for record in pl.read_parquet(output).to_dicts():
        by_postcode[record["postcode"]] = record

    # Single month -> annualised x12.
    assert by_postcode["AB1 1AA"]["Burglary (avg/yr)"] == 12.0
    assert by_postcode["AB1 1AB"]["Burglary (avg/yr)"] == 12.0
    assert by_postcode["AB1 1AA"]["Robbery (avg/yr)"] == 0.0
    # Only the 49m robbery counts for C; the 51m one and the blank row do not.
    assert by_postcode["AB1 1AC"]["Robbery (avg/yr)"] == 12.0
    assert by_postcode["AB1 1AC"]["Burglary (avg/yr)"] == 0.0
    # Anti-social behaviour had no coordinate -> nobody gets it.
    for record in by_postcode.values():
        assert record["Anti-social behaviour (avg/yr)"] == 0.0
|
||||
|
||||
|
||||
def test_by_year_annualises_and_rolls_up(tmp_path):
    """Per-year output annualises monthly counts and rolls types into groups."""
    boundaries_dir = tmp_path / "units"
    only_postcode = _square_feature("AB1 1AA", 1000, 1000, 1010, 1010)
    _write_boundaries(boundaries_dir, {"AB1": [only_postcode]})

    crime_dir = tmp_path / "crime"
    # Point at the centre of AB1 1AA, well inside its buffer.
    centre_x, centre_y = 1005, 1005
    crimes_by_month = {
        "2023-01": ("Burglary", "Robbery"),
        "2024-01": ("Burglary",),
        "2024-02": ("Burglary", "Anti-social behaviour"),
    }
    for month, crime_types in crimes_by_month.items():
        month_rows = [
            _crime_row(month, centre_x, centre_y, crime_type)
            for crime_type in crime_types
        ]
        _write_month(crime_dir, month, month_rows)

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime_dir, boundaries_dir, output, by_year)

    yearly = pl.read_parquet(by_year)
    assert yearly.height == 1
    expected_columns = {
        "Burglary (by year)",
        "Serious crime (by year)",
        "Minor crime (by year)",
    }
    assert expected_columns.issubset(yearly.columns)

    record = yearly.row(0, named=True)
    burglary_points = list(record["Burglary (by year)"])
    burglary_points.sort(key=lambda point: point["year"])
    # 2023: 1 burglary in 1 month -> 12/yr; 2024: 2 in 2 months -> 12/yr.
    assert burglary_points == [
        {"year": 2023, "count": 12.0},
        {"year": 2024, "count": 12.0},
    ]

    serious_by_year = {}
    for point in record["Serious crime (by year)"]:
        serious_by_year[point["year"]] = point["count"]
    # 2023 serious = Burglary(12) + Robbery(12) = 24; 2024 = Burglary(12).
    assert serious_by_year[2023] == 24.0
    assert serious_by_year[2024] == 12.0
|
||||
Loading…
Add table
Add a link
Reference in a new issue