perfect-postcode/finder/main.py
2026-05-17 10:16:30 +01:00

166 lines
4.6 KiB
Python

import argparse
import logging
import os
import tempfile
import time
from pathlib import Path
from constants import DATA_DIR
# Values accepted by the --source option; "all" expands to every portal.
SOURCE_CHOICES = ("rightmove", "homecouk", "zoopla", "all")

# Per-source listing cap applied by --test when no explicit cap is given.
TEST_MAX_PROPERTIES_PER_SOURCE = 100

# Outcodes preferred by --test runs (the CLI help calls these
# "likely London outcodes"); only those actually loaded are used.
TEST_OUTCODES = (
    "E1", "N1", "NW1", "SE1", "SW1",
    "W1", "WC1", "BR1", "CR0", "TW1",
)

# Module-wide logger for this entry point.
log = logging.getLogger("finder")
def configure_standalone_runtime() -> None:
    """Route cache and temporary files into ``DATA_DIR/.runtime``.

    Creates ``cache`` and ``tmp`` sub-directories, exports them as
    ``XDG_CACHE_HOME`` and ``TMPDIR`` unless those variables are already
    set, and points the ``tempfile`` module at the ``tmp`` directory.
    """
    runtime_root = DATA_DIR / ".runtime"
    # Insertion order matters: cache is created/exported before tmp.
    locations = {
        "XDG_CACHE_HOME": runtime_root / "cache",
        "TMPDIR": runtime_root / "tmp",
    }

    for directory in locations.values():
        directory.mkdir(parents=True, exist_ok=True)

    # setdefault leaves any value already present in the environment alone.
    for variable, directory in locations.items():
        os.environ.setdefault(variable, str(directory))

    # Unlike the environment variables, this override is unconditional.
    tempfile.tempdir = str(locations["TMPDIR"])
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for a manual scrape run.

    Returns:
        A namespace with ``source``, ``output_dir``, ``limit_outcodes``,
        ``max_properties_per_source`` and ``test`` attributes. The integer
        options are not range-checked here.
    """
    cli = argparse.ArgumentParser(
        description="Run a manual Greater London-ish property scrape."
    )

    cli.add_argument(
        "--source",
        choices=SOURCE_CHOICES,
        default="all",
        help="Portal to scrape. 'all' runs Rightmove, home.co.uk, and Zoopla.",
    )
    cli.add_argument(
        "--output-dir",
        type=Path,
        default=DATA_DIR,
        help=f"Directory for parquet output. Defaults to {DATA_DIR}.",
    )

    # Both limits are optional integers that default to "no limit" (None).
    optional_limits = (
        ("--limit-outcodes", "Limit outcodes for a quick manual smoke test."),
        (
            "--max-properties-per-source",
            "Stop each source after this many transformed listings.",
        ),
    )
    for flag, description in optional_limits:
        cli.add_argument(flag, type=int, default=None, help=description)

    test_help = (
        "Run a small standalone smoke test: use likely London outcodes and "
        f"fetch at most {TEST_MAX_PROPERTIES_PER_SOURCE} listings per source."
    )
    cli.add_argument("--test", action="store_true", help=test_help)

    return cli.parse_args()
def configure_logging() -> None:
    """Set up INFO-level root logging and raise two HTTP loggers to WARNING."""
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(message)s",
        level=logging.INFO,
    )
    # Raise the threshold for the "httpx" and "httpcore" loggers so that
    # only warnings and above from them are emitted.
    for logger_name in ("httpx", "httpcore"):
        logging.getLogger(logger_name).setLevel(logging.WARNING)
def selected_sources(source: str) -> list[str]:
    """Expand a ``--source`` value into the list of portals to scrape.

    ``"all"`` becomes the three portal names; any other value is returned
    as a single-element list without validation.
    """
    return ["rightmove", "homecouk", "zoopla"] if source == "all" else [source]
def main() -> int:
    """Run a manual scrape from the command line.

    Parses the CLI options, prepares the runtime directories and logging,
    chooses the outcodes and sources, runs the scrape and logs the result.

    Returns:
        ``0`` when the run completes.

    Raises:
        SystemExit: if ``--limit-outcodes`` or ``--max-properties-per-source``
            is below 1, if no outcodes remain to scrape, or if a ``--test``
            run produces a result with a truthy ``"errors"`` entry.
    """
    args = parse_args()
    configure_standalone_runtime()
    configure_logging()

    if args.limit_outcodes is not None and args.limit_outcodes < 1:
        raise SystemExit("--limit-outcodes must be greater than zero")
    if (
        args.max_properties_per_source is not None
        and args.max_properties_per_source < 1
    ):
        raise SystemExit("--max-properties-per-source must be greater than zero")

    output_dir = args.output_dir.expanduser().resolve()
    # A --test run that kept the default output directory writes into a
    # "test" sub-directory of DATA_DIR instead.
    if args.test and args.output_dir == DATA_DIR:
        output_dir = (DATA_DIR / "test").expanduser().resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): imported here rather than at module top - presumably so
    # the runtime environment configured above is in place before the
    # scraper module loads; confirm before moving it.
    from scraper import (
        build_postcode_coords,
        build_postcode_index,
        load_outcodes,
        run_scrape,
    )

    outcodes = load_outcodes()
    if args.test and args.limit_outcodes is None:
        # Build the lookup set once instead of once per TEST_OUTCODES entry.
        available = set(outcodes)
        preferred = [outcode for outcode in TEST_OUTCODES if outcode in available]
        # Fall back to the full loaded list when none of the preferred
        # outcodes are available.
        if preferred:
            outcodes = preferred
    if args.limit_outcodes is not None:
        outcodes = outcodes[: args.limit_outcodes]
    if not outcodes:
        raise SystemExit("No Greater London-ish outcodes loaded; nothing to scrape.")

    sources = selected_sources(args.source)
    max_properties_per_source = args.max_properties_per_source
    # --test applies its own cap unless the caller supplied one explicitly.
    if args.test and max_properties_per_source is None:
        max_properties_per_source = TEST_MAX_PROPERTIES_PER_SOURCE

    log.info(
        "Starting sale scrape: source=%s outcodes=%d output_dir=%s test=%s",
        args.source,
        len(outcodes),
        output_dir,
        args.test,
    )

    started = time.monotonic()
    pc_index = build_postcode_index()
    # Postcode coordinates are only built when Zoopla is being scraped.
    pc_coords = build_postcode_coords() if "zoopla" in sources else None
    result = run_scrape(
        outcodes,
        pc_index,
        pc_coords=pc_coords,
        sources=sources,
        output_dir=output_dir,
        max_properties_per_source=max_properties_per_source,
    )
    elapsed = time.monotonic() - started

    log.info("Scrape finished in %.1fs", elapsed)
    log.info("Result: %s", result)

    # Only --test runs turn reported errors into a failing exit; other runs
    # still return 0 after logging the result.
    if args.test and result.get("errors"):
        raise SystemExit("Test scrape failed; see errors in the result above.")
    return 0
# Script entry point: run the scrape and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)