scraping and data

This commit is contained in:
Andras Schmelczer 2026-05-31 15:36:33 +01:00
parent d98819b569
commit 8688b7475e
43 changed files with 4920 additions and 531 deletions

View file

@ -57,6 +57,16 @@ def parse_args() -> argparse.Namespace:
default=DATA_DIR,
help=f"Directory for parquet output. Defaults to {DATA_DIR}.",
)
parser.add_argument(
"--outcodes",
type=str,
default=None,
help=(
"Comma-separated outcodes to scrape (e.g. 'SW9' or 'SW9,E14,BR1') "
"instead of the full Greater London set. Must fall within the "
"London-ish areas; takes precedence over --test/--limit-outcodes."
),
)
parser.add_argument(
"--limit-outcodes",
type=int,
@ -116,17 +126,32 @@ def main() -> int:
from scraper import (
build_postcode_coords,
build_postcode_index,
filter_londonish_outcodes,
load_outcodes,
run_scrape,
)
outcodes = load_outcodes()
if args.test and args.limit_outcodes is None:
preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)]
if preferred:
outcodes = preferred
if args.limit_outcodes is not None:
outcodes = outcodes[: args.limit_outcodes]
if args.outcodes is not None:
requested = [code.strip().upper() for code in args.outcodes.split(",") if code.strip()]
if not requested:
raise SystemExit("--outcodes was empty")
outcodes = filter_londonish_outcodes(requested)
dropped = sorted(set(requested) - set(outcodes))
if dropped:
log.warning("Ignoring outcodes outside the Greater London-ish areas: %s", ", ".join(dropped))
if not outcodes:
raise SystemExit(
"None of the requested outcodes are within the Greater London-ish areas "
f"({', '.join(requested)})."
)
else:
outcodes = load_outcodes()
if args.test and args.limit_outcodes is None:
preferred = [outcode for outcode in TEST_OUTCODES if outcode in set(outcodes)]
if preferred:
outcodes = preferred
if args.limit_outcodes is not None:
outcodes = outcodes[: args.limit_outcodes]
if not outcodes:
raise SystemExit("No Greater London-ish outcodes loaded; nothing to scrape.")