Lint
This commit is contained in:
parent
94f9c0d594
commit
5c3b87f2d5
69 changed files with 1334 additions and 213 deletions
|
|
@ -81,11 +81,7 @@ def find_bad_files(
|
|||
bad: list[BadFile] = []
|
||||
stats: dict[str, dict] = {}
|
||||
|
||||
modes = sorted(
|
||||
d
|
||||
for d in os.listdir(base_dir)
|
||||
if (base_dir / d).is_dir()
|
||||
)
|
||||
modes = sorted(d for d in os.listdir(base_dir) if (base_dir / d).is_dir())
|
||||
|
||||
for mode in modes:
|
||||
mode_dir = base_dir / mode
|
||||
|
|
@ -149,7 +145,9 @@ def find_duplicates(base_dir: Path) -> tuple[list[BadFile], dict[str, dict]]:
|
|||
# Keep the file with the most rows
|
||||
files.sort(key=lambda x: x[1], reverse=True)
|
||||
for filename, rows in files[1:]:
|
||||
dupes.append(BadFile(mode=mode, filename=filename, slug=slug, rows=rows))
|
||||
dupes.append(
|
||||
BadFile(mode=mode, filename=filename, slug=slug, rows=rows)
|
||||
)
|
||||
mode_dupes += 1
|
||||
|
||||
duped_slugs = sum(1 for fs in slug_files.values() if len(fs) > 1)
|
||||
|
|
@ -197,7 +195,9 @@ def main() -> None:
|
|||
bad_files, stats = find_bad_files(args.travel_times, args.threshold_pct)
|
||||
|
||||
print("=== Per-mode summary ===\n")
|
||||
print(f"{'Mode':<10} {'Total':>6} {'Bad':>5} {'Threshold':>10} {'Median':>8} {'Range':>20}")
|
||||
print(
|
||||
f"{'Mode':<10} {'Total':>6} {'Bad':>5} {'Threshold':>10} {'Median':>8} {'Range':>20}"
|
||||
)
|
||||
print("-" * 65)
|
||||
for mode, s in sorted(stats.items()):
|
||||
rng = f"{s['min']:,}–{s['max']:,}"
|
||||
|
|
@ -231,7 +231,9 @@ def main() -> None:
|
|||
total_removable = sum(s["removable"] for s in dupe_stats.values())
|
||||
if total_removable > 0:
|
||||
print(f"\n=== Duplicates ({total_removable} removable files) ===\n")
|
||||
print(f"{'Mode':<10} {'Total':>6} {'Unique':>7} {'Duped slugs':>12} {'Removable':>10}")
|
||||
print(
|
||||
f"{'Mode':<10} {'Total':>6} {'Unique':>7} {'Duped slugs':>12} {'Removable':>10}"
|
||||
)
|
||||
print("-" * 50)
|
||||
for mode, s in sorted(dupe_stats.items()):
|
||||
if s["removable"] > 0:
|
||||
|
|
@ -242,9 +244,15 @@ def main() -> None:
|
|||
|
||||
if args.dedup:
|
||||
# Exclude files already deleted by --delete
|
||||
deleted_set = {(bf.mode, bf.filename) for bf in bad_files} if args.delete else set()
|
||||
to_delete = [df for df in dupe_files if (df.mode, df.filename) not in deleted_set]
|
||||
print(f"\nRemoving {len(to_delete)} duplicate files (keeping largest per slug)...")
|
||||
deleted_set = (
|
||||
{(bf.mode, bf.filename) for bf in bad_files} if args.delete else set()
|
||||
)
|
||||
to_delete = [
|
||||
df for df in dupe_files if (df.mode, df.filename) not in deleted_set
|
||||
]
|
||||
print(
|
||||
f"\nRemoving {len(to_delete)} duplicate files (keeping largest per slug)..."
|
||||
)
|
||||
deleted = _delete_files(args.travel_times, to_delete)
|
||||
print(f"Deleted {deleted}/{len(to_delete)} files.")
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -42,9 +42,7 @@ def select_latest_csv_name(names: list[str]) -> str:
|
|||
match = CSV_NAME_RE.match(path.name)
|
||||
if not match:
|
||||
continue
|
||||
candidates.append(
|
||||
(match.group("release"), int(match.group("version")), name)
|
||||
)
|
||||
candidates.append((match.group("release"), int(match.group("version")), name))
|
||||
|
||||
if not candidates:
|
||||
raise ValueError("No root-level GEOLYTIX retail points CSV found")
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ from pipeline.transform.transform_poi import NAPTAN_EMOJIS, _CATEGORIES
|
|||
GLYPHS_BASE = "https://protomaps.github.io/basemaps-assets/fonts"
|
||||
SPRITES_BASE = "https://protomaps.github.io/basemaps-assets/sprites/v4"
|
||||
TWEMOJI_BASE = "https://cdn.jsdelivr.net/gh/twitter/twemoji@14.0.2/assets/72x72"
|
||||
POI_ICON_BASE = "https://geolytix.github.io/MapIcons"
|
||||
|
||||
# Font stacks used by @protomaps/basemaps with lang='en'
|
||||
FONT_STACKS = ["Noto Sans Regular", "Noto Sans Italic", "Noto Sans Medium"]
|
||||
|
|
@ -16,6 +17,50 @@ FONT_STACKS = ["Noto Sans Regular", "Noto Sans Italic", "Noto Sans Medium"]
|
|||
# Fallback emoji not in any category
|
||||
_FALLBACK_EMOJIS = ["📍"]
|
||||
|
||||
POI_ICON_PATHS = [
|
||||
"asda/asda_express_24px.svg",
|
||||
"asda/asda_green_basket_24px.svg",
|
||||
"asda/asda_green_trolley_24px.svg",
|
||||
"asda/asda_living_24px.svg",
|
||||
"asda/asda_pfs_24px.svg",
|
||||
"asda/asda_primary.svg",
|
||||
"asda/asda_superstore_green_trolley_24px.svg",
|
||||
"brands/aldi_24px.svg",
|
||||
"brands/amazon_fresh_alt_24px.svg",
|
||||
"brands/booths_24px.svg",
|
||||
"brands/budgens_24px.svg",
|
||||
"brands/centra_24px.svg",
|
||||
"brands/cook.svg",
|
||||
"brands/coop_24px.svg",
|
||||
"brands/costco_24px.svg",
|
||||
"brands/dunnes_stores_24px.svg",
|
||||
"brands/farmfoods_updated_24px.svg",
|
||||
"brands/heron_24px.svg",
|
||||
"brands/iceland_24px.svg",
|
||||
"brands/iceland_food_warehouse_24px.svg",
|
||||
"brands/lidl_24px.svg",
|
||||
"brands/little_waitrose_24px.svg",
|
||||
"brands/makro_24px.svg",
|
||||
"brands/mns_24px.svg",
|
||||
"brands/mns_food_24px.svg",
|
||||
"brands/mns_high_street_24px.svg",
|
||||
"brands/mns_hospital_24px.svg",
|
||||
"brands/mns_moto_24px.svg",
|
||||
"brands/mns_outlet_24px.svg",
|
||||
"brands/morrisons_24px.svg",
|
||||
"brands/morrisons_daily_24px.svg",
|
||||
"brands/sainsburys_24px.svg",
|
||||
"brands/sainsburys_local_24px.svg",
|
||||
"brands/spar_24px.svg",
|
||||
"brands/tesco_24px.svg",
|
||||
"brands/tesco_express_24px.svg",
|
||||
"brands/tesco_extra_24px.svg",
|
||||
"brands/waitrose_24px.svg",
|
||||
"brands/wholefoods_24px.svg",
|
||||
"logos/planet_organic_24px.svg",
|
||||
"public_transport/london_tube.svg",
|
||||
]
|
||||
|
||||
|
||||
def collect_twemoji_codes() -> list[str]:
|
||||
"""Derive twemoji hex codes from transform_poi categories.
|
||||
|
|
@ -93,6 +138,12 @@ def main():
|
|||
url = f"{TWEMOJI_BASE}/{code}.png"
|
||||
tasks.append((url, twemoji_dir / f"{code}.png"))
|
||||
|
||||
# Branded POI icons are served from this local bundle at runtime.
|
||||
poi_icons_dir = out / "poi-icons"
|
||||
for icon_path in POI_ICON_PATHS:
|
||||
url = f"{POI_ICON_BASE}/{icon_path}"
|
||||
tasks.append((url, poi_icons_dir / icon_path))
|
||||
|
||||
# Skip already-downloaded files
|
||||
remaining = [(url, dest) for url, dest in tasks]
|
||||
|
||||
|
|
|
|||
|
|
@ -23,24 +23,24 @@ PAGE_SIZE = 25000
|
|||
# Five-year age bands in order, with lower bounds for interpolation.
|
||||
# The last band (85+) is open-ended — we treat it as 85-89 for median purposes.
|
||||
AGE_BANDS = [
|
||||
(0, 5), # Aged 0 to 4 years
|
||||
(5, 5), # Aged 5 to 9 years
|
||||
(10, 5), # Aged 10 to 14 years
|
||||
(15, 5), # Aged 15 to 19 years
|
||||
(20, 5), # Aged 20 to 24 years
|
||||
(25, 5), # Aged 25 to 29 years
|
||||
(30, 5), # Aged 30 to 34 years
|
||||
(35, 5), # Aged 35 to 39 years
|
||||
(40, 5), # Aged 40 to 44 years
|
||||
(45, 5), # Aged 45 to 49 years
|
||||
(50, 5), # Aged 50 to 54 years
|
||||
(55, 5), # Aged 55 to 59 years
|
||||
(60, 5), # Aged 60 to 64 years
|
||||
(65, 5), # Aged 65 to 69 years
|
||||
(70, 5), # Aged 70 to 74 years
|
||||
(75, 5), # Aged 75 to 79 years
|
||||
(80, 5), # Aged 80 to 84 years
|
||||
(85, 5), # Aged 85 years and over
|
||||
(0, 5), # Aged 0 to 4 years
|
||||
(5, 5), # Aged 5 to 9 years
|
||||
(10, 5), # Aged 10 to 14 years
|
||||
(15, 5), # Aged 15 to 19 years
|
||||
(20, 5), # Aged 20 to 24 years
|
||||
(25, 5), # Aged 25 to 29 years
|
||||
(30, 5), # Aged 30 to 34 years
|
||||
(35, 5), # Aged 35 to 39 years
|
||||
(40, 5), # Aged 40 to 44 years
|
||||
(45, 5), # Aged 45 to 49 years
|
||||
(50, 5), # Aged 50 to 54 years
|
||||
(55, 5), # Aged 55 to 59 years
|
||||
(60, 5), # Aged 60 to 64 years
|
||||
(65, 5), # Aged 65 to 69 years
|
||||
(70, 5), # Aged 70 to 74 years
|
||||
(75, 5), # Aged 75 to 79 years
|
||||
(80, 5), # Aged 80 to 84 years
|
||||
(85, 5), # Aged 85 years and over
|
||||
]
|
||||
|
||||
|
||||
|
|
@ -110,14 +110,18 @@ def download_and_convert(output_path: Path) -> None:
|
|||
for row in rows:
|
||||
counts = [row[col] for col in band_cols]
|
||||
median = compute_median_age(counts)
|
||||
medians.append({"lsoa21": row["GEOGRAPHY_CODE"], "median_age": round(median, 1)})
|
||||
medians.append(
|
||||
{"lsoa21": row["GEOGRAPHY_CODE"], "median_age": round(median, 1)}
|
||||
)
|
||||
|
||||
result = pl.DataFrame(medians).with_columns(
|
||||
pl.col("median_age").cast(pl.Float32),
|
||||
)
|
||||
|
||||
print(f"England LSOAs: {result.height}")
|
||||
print(f"Median age range: {result['median_age'].min()} - {result['median_age'].max()}")
|
||||
print(
|
||||
f"Median age range: {result['median_age'].min()} - {result['median_age'].max()}"
|
||||
)
|
||||
print(f"Mean of medians: {result['median_age'].mean():.1f}")
|
||||
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
|
|
|||
|
|
@ -43,9 +43,7 @@ def convert_to_parquet(xlsx_path: Path, parquet_path: Path) -> None:
|
|||
|
||||
# Filter to English local authorities
|
||||
df = df.filter(
|
||||
pl.any_horizontal(
|
||||
pl.col("area_code").str.starts_with(p) for p in LA_PREFIXES
|
||||
)
|
||||
pl.any_horizontal(pl.col("area_code").str.starts_with(p) for p in LA_PREFIXES)
|
||||
)
|
||||
|
||||
# Use only the latest month
|
||||
|
|
|
|||
|
|
@ -23,7 +23,9 @@ def find_latest_build() -> str:
|
|||
for i in range(MAX_AGE_DAYS):
|
||||
d = today - timedelta(days=i)
|
||||
url = f"{PROTOMAPS_BASE}/{d:%Y%m%d}.pmtiles"
|
||||
req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": USER_AGENT})
|
||||
req = urllib.request.Request(
|
||||
url, method="HEAD", headers={"User-Agent": USER_AGENT}
|
||||
)
|
||||
try:
|
||||
urllib.request.urlopen(req)
|
||||
print(f"Found build: {d:%Y%m%d}")
|
||||
|
|
|
|||
|
|
@ -128,9 +128,7 @@ def main():
|
|||
|
||||
# Social tenure fork: flag properties that were ever social housing
|
||||
social_tenure = (
|
||||
epc_base.filter(
|
||||
pl.col("TENURE").str.to_lowercase().str.contains("social")
|
||||
)
|
||||
epc_base.filter(pl.col("TENURE").str.to_lowercase().str.contains("social"))
|
||||
.select("epc_address", "POSTCODE")
|
||||
.unique()
|
||||
.with_columns(pl.lit("Yes").alias("was_council_house"))
|
||||
|
|
@ -139,16 +137,20 @@ def main():
|
|||
print(f"Former council houses (EPC social tenure): {social_tenure.height}")
|
||||
|
||||
# Left-join events and social tenure back onto dedup EPC
|
||||
epc = epc.join(
|
||||
events.lazy(),
|
||||
on=["epc_address", "POSTCODE"],
|
||||
how="left",
|
||||
).join(
|
||||
social_tenure.lazy(),
|
||||
on=["epc_address", "POSTCODE"],
|
||||
how="left",
|
||||
).with_columns(
|
||||
pl.col("was_council_house").fill_null("No"),
|
||||
epc = (
|
||||
epc.join(
|
||||
events.lazy(),
|
||||
on=["epc_address", "POSTCODE"],
|
||||
how="left",
|
||||
)
|
||||
.join(
|
||||
social_tenure.lazy(),
|
||||
on=["epc_address", "POSTCODE"],
|
||||
how="left",
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("was_council_house").fill_null("No"),
|
||||
)
|
||||
)
|
||||
|
||||
print("EPC dataset")
|
||||
|
|
|
|||
|
|
@ -1092,6 +1092,7 @@ GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
|
|||
"Asda Living": "Asda Living",
|
||||
"Asda PFS": "Asda PFS",
|
||||
"Cooltrader": "Heron Foods",
|
||||
"Co-op Food": "Co-op",
|
||||
"Cook": "COOK",
|
||||
"Eurospar": "Spar",
|
||||
"Eurospar PFS": "Spar",
|
||||
|
|
@ -1144,9 +1145,7 @@ def transform_grocery_retail_points(
|
|||
required = {"id", "retailer", "fascia", "store_name", "long_wgs", "lat_wgs"}
|
||||
missing = required - set(grocery_df.columns)
|
||||
if missing:
|
||||
raise ValueError(
|
||||
f"GEOLYTIX retail points missing columns: {sorted(missing)}"
|
||||
)
|
||||
raise ValueError(f"GEOLYTIX retail points missing columns: {sorted(missing)}")
|
||||
|
||||
df = (
|
||||
grocery_df.select(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue