Fun changes
Some checks failed
CI / Python (lint + test) (push) Failing after 3m38s
CI / Rust (lint + test) (push) Failing after 3m32s
CI / Frontend (lint + typecheck) (push) Failing after 4m12s
Build and publish Docker image / build-and-push (push) Failing after 4m48s

This commit is contained in:
Andras Schmelczer 2026-04-04 22:59:44 +01:00
parent cd778dd088
commit 349a6c1d53
60 changed files with 1260 additions and 2600 deletions

View file

@ -0,0 +1,104 @@
import argparse
from pathlib import Path
import httpx
import polars as pl
# UK Parliament publishes candidate-level results for the 2024 General Election.
# One row per candidate per constituency — we aggregate to per-constituency stats.
URL: str = "https://electionresults.parliament.uk/general-elections/6/candidacies.csv"
# Map party names (as they appear in the "Main party name" column) to a smaller
# set of output groups used for the winning-party enum feature and the vote share
# columns. Keys are source names; values are the labels written to the output.
# Only parties that won seats in England are kept; the rest become "Other parties"
# (applied as the replace default in download_and_convert, not listed here).
PARTY_MAP: dict[str, str] = {
    "Labour": "Labour",
    "Conservative": "Conservative",
    "Liberal Democrat": "Liberal Democrat",
    "Reform UK": "Reform UK",
    "Green Party": "Green",  # output label drops the "Party" suffix
}
def download_and_convert(output_path: Path) -> None:
    """Download the 2024 General Election results and save per-constituency stats.

    Fetches the candidate-level CSV from ``URL``, keeps rows whose constituency
    code starts with "E14" (England), and writes one row per winning candidate
    with the winning party group, majority and turnout percentages, and one
    "% <party group>" vote-share column per party group.

    Args:
        output_path: Destination parquet file. Missing parent directories are
            created.

    Raises:
        httpx.HTTPStatusError: If the download responds with an error status.
    """
    print("Downloading 2024 General Election results...")
    response = httpx.get(URL, follow_redirects=True, timeout=60)
    response.raise_for_status()

    df = pl.read_csv(response.content)
    print(f"Raw shape: {df.shape}")

    # Filter to England only (constituency codes starting with E14)
    df = df.filter(pl.col("Constituency geographic code").str.starts_with("E14"))

    # Map party names to our output groups; anything unmapped is "Other parties"
    df = df.with_columns(
        pl.col("Main party name")
        .replace_strict(PARTY_MAP, default="Other parties")
        .alias("party_group"),
    )

    # ── Per-constituency winner stats ──
    # The first-placed candidate's row supplies the majority, valid vote count
    # and electorate used for the constituency-level percentages.
    winners = df.filter(pl.col("Candidate result position") == 1).select(
        pl.col("Constituency geographic code").alias("pcon"),
        pl.col("party_group").alias("winning_party"),
        (pl.col("Majority") / pl.col("Election valid vote count") * 100)
        .round(1)
        .alias("majority_pct"),
        (pl.col("Election valid vote count") / pl.col("Electorate") * 100)
        .round(1)
        .alias("turnout_pct"),
    )

    # ── Per-party vote share percentages ──
    # Sum votes per party group per constituency, then pivot to wide format
    party_votes = (
        df.group_by("Constituency geographic code", "party_group")
        .agg(pl.col("Candidate vote count").sum())
        .rename({"Constituency geographic code": "pcon"})
    )
    total_votes = (
        df.group_by("Constituency geographic code")
        .agg(pl.col("Candidate vote count").sum().alias("total_votes"))
        .rename({"Constituency geographic code": "pcon"})
    )
    party_pct = (
        party_votes.join(total_votes, on="pcon")
        .with_columns(
            (pl.col("Candidate vote count") / pl.col("total_votes") * 100)
            .round(1)
            .alias("vote_pct"),
        )
        # group_by does not guarantee row order and pivot creates columns in
        # discovery order, so sort the pivoted columns by name to keep the
        # output schema identical from run to run.
        # NOTE(review): a party group with no candidates in a constituency
        # should come out of the pivot as null rather than 0 — confirm that is
        # what downstream consumers expect.
        .pivot(
            on="party_group",
            index="pcon",
            values="vote_pct",
            sort_columns=True,
        )
    )

    # Rename columns to "% Party" format
    rename_map = {col: f"% {col}" for col in party_pct.columns if col != "pcon"}
    party_pct = party_pct.rename(rename_map)

    # Join winner stats with party vote shares
    result = winners.join(party_pct, on="pcon", how="left")

    print(f"Constituencies: {result.height}")
    print(f"Columns: {result.columns}")
    print(
        f"Party breakdown:\n{result['winning_party'].value_counts().sort('count', descending=True)}"
    )

    output_path.parent.mkdir(parents=True, exist_ok=True)
    result.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
def main() -> None:
    """Parse the command line and run the download-and-convert step."""
    cli = argparse.ArgumentParser(
        description="Download 2024 General Election results by constituency",
    )
    cli.add_argument(
        "--output",
        required=True,
        type=Path,
        help="Output parquet file path",
    )
    # --output is required, so argparse exits with a usage error if it is absent.
    destination = cli.parse_args().output
    download_and_convert(destination)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

View file

@ -57,11 +57,33 @@ def download_and_convert(output_path: Path) -> None:
pl.col("Ethnicity").replace_strict(group_map).alias("group"),
)
# Sum percentages within each group per local authority
wide = (
detailed.group_by("Geography_code", "group")
.agg(pl.col("Value1").sum().round(1))
.pivot(on="group", index="Geography_code", values="Value1")
# Sum percentages within each group per local authority (keep full precision)
grouped = detailed.group_by("Geography_code", "group").agg(pl.col("Value1").sum())
wide = grouped.pivot(on="group", index="Geography_code", values="Value1")
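# Normalize so each row sums to exactly 100%, then round to 1 decimal and let
# the largest group absorb the rounding residual so the sum is preserved (this
# is simpler than a true largest-remainder allocation).
# Independent rounding of 6 values can drift ±0.3.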
group_cols = [c for c in wide.columns if c != "Geography_code"]
row_total = sum(pl.col(c) for c in group_cols)
# Scale each group so they sum to exactly 100
wide = wide.with_columns(
[(pl.col(c) / row_total * 100.0).alias(c) for c in group_cols]
)
# Round to 1 decimal, then adjust the largest group to absorb residual
rounded_cols = [pl.col(c).round(1).alias(c) for c in group_cols]
wide = wide.with_columns(rounded_cols)
rounded_sum = sum(pl.col(c) for c in group_cols)
residual = (100.0 - rounded_sum).round(1)
# Find which group is largest per row and add the residual there
largest_col = pl.concat_list(group_cols).list.arg_max()
wide = wide.with_columns(
[
pl.when(largest_col == i)
.then(pl.col(c) + residual)
.otherwise(pl.col(c))
.alias(c)
for i, c in enumerate(group_cols)
]
)
# Rename columns to be descriptive

View file

@ -17,8 +17,8 @@ STOP_TYPES = {
"BCT": "Bus stop",
"BCE": "Bus station",
"TXR": "Taxi rank",
"TMU": "Metro or Tram stop",
"MET": "Metro or Tram stop",
"TMU": "Tube station",
"MET": "Tube station",
}

View file

@ -49,7 +49,7 @@ _AREA_COLUMNS = [
# Amenities
"Number of restaurants within 2km",
"Number of grocery shops and supermarkets within 2km",
"Number of parks within 2km",
"Number of parks within 1km",
"Distance to nearest train or tube station (km)",
"Distance to nearest park (km)",
# Environment
@ -62,6 +62,16 @@ _AREA_COLUMNS = [
"Good+ secondary schools within 2km",
# Demographics
"Median age",
# Politics
"Winning party",
"Voter turnout (%)",
"Majority (%)",
"% Labour",
"% Conservative",
"% Liberal Democrat",
"% Reform UK",
"% Green",
"% Other parties",
]
@ -78,6 +88,7 @@ def _build(
rental_prices_path: Path,
lsoa_population_path: Path,
median_age_path: Path,
election_results_path: Path,
) -> tuple[pl.DataFrame, pl.DataFrame]:
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
@ -113,6 +124,7 @@ def _build(
pl.col("long").alias("lon"),
"lsoa21",
"oa21",
"pcon",
)
)
wide = wide.join(arcgis, on="postcode", how="left")
@ -193,6 +205,9 @@ def _build(
median_age = pl.scan_parquet(median_age_path)
wide = wide.join(median_age, on="lsoa21", how="left")
election = pl.scan_parquet(election_results_path)
wide = wide.join(election, on="pcon", how="left")
poi_counts = pl.scan_parquet(poi_proximity_path)
wide = wide.join(poi_counts, on="postcode", how="left")
@ -304,6 +319,7 @@ def _build(
"Barriers to Housing and Services Score",
"lsoa21",
"oa21",
"pcon",
"epc_property_type",
"pp_property_type",
"built_form",
@ -323,7 +339,7 @@ def _build(
"property_type": "Property type",
"restaurants_2km": "Number of restaurants within 2km",
"groceries_2km": "Number of grocery shops and supermarkets within 2km",
"parks_2km": "Number of parks within 2km",
"parks_1km": "Number of parks within 1km",
"train_tube_nearest_km": "Distance to nearest train or tube station (km)",
"parks_nearest_km": "Distance to nearest park (km)",
"latest_price": "Last known price",
@ -342,6 +358,9 @@ def _build(
"floor_height": "Interior height (m)",
"was_council_house": "Former council house",
"median_age": "Median age",
"winning_party": "Winning party",
"turnout_pct": "Voter turnout (%)",
"majority_pct": "Majority (%)",
}
)
)
@ -427,6 +446,12 @@ def main():
required=True,
help="Census 2021 median age by LSOA parquet file",
)
parser.add_argument(
"--election-results",
type=Path,
required=True,
help="2024 General Election results by constituency parquet file",
)
parser.add_argument(
"--output-postcodes",
type=Path,
@ -454,6 +479,7 @@ def main():
rental_prices_path=args.rental_prices,
lsoa_population_path=args.lsoa_population,
median_age_path=args.median_age,
election_results_path=args.election_results,
)
print(f"\nPostcode columns: {postcode_df.columns}")

View file

@ -17,7 +17,7 @@ POI_GROUPS_2KM = {
# Groups for which to compute distance to nearest POI (from filtered POIs)
DISTANCE_GROUPS = {
"train_tube": ["Metro or Tram stop", "Rail station"],
"train_tube": ["Tube station", "Rail station"],
}
# OS Open Greenspace function types used for park counts and distance calculation.
@ -67,8 +67,8 @@ def main():
# Park counts and distances from OS Open Greenspace
greenspace = pl.read_parquet(args.greenspace)
park_counts_2km = count_pois_per_postcode(
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
park_counts_1km = count_pois_per_postcode(
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=1
)
park_distances = min_distance_per_postcode(
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS
@ -77,7 +77,7 @@ def main():
# Join all results on postcode
result = (
counts_2km.join(distances, on="postcode")
.join(park_counts_2km, on="postcode")
.join(park_counts_1km, on="postcode")
.join(park_distances, on="postcode")
)

View file

@ -1054,7 +1054,7 @@ NAPTAN_EMOJIS: dict[str, str] = {
"Bus stop": "🚏",
"Bus station": "🚌",
"Taxi rank": "🚕",
"Metro or Tram stop": "🚊",
"Tube station": "🚇",
}

View file

@ -8,7 +8,7 @@ POI_GROUPS = {
"restaurants": ["Restaurant", "Fast Food"],
"groceries": ["Supermarket"],
"parks": ["Park"],
"train_tube": ["Rail station", "Metro or Tram stop"],
"train_tube": ["Rail station", "Tube station"],
}