"""Command-line entry point for a manual Greater London-ish property scrape."""

import argparse
import logging
import os
import tempfile
import time
from pathlib import Path

from constants import DATA_DIR, REPO_DIR

SOURCE_CHOICES = ("rightmove", "zoopla", "all")
TEST_MAX_PROPERTIES_PER_SOURCE = 100
# Outcodes preferred by ``--test`` runs when they are present in the loaded set.
TEST_OUTCODES = (
    "E1",
    "N1",
    "NW1",
    "SE1",
    "SW1",
    "W1",
    "WC1",
    "BR1",
    "CR0",
    "TW1",
)

log = logging.getLogger("finder")


def configure_standalone_runtime() -> None:
    """Keep browser/cache/temp files on the project volume for local runs.

    Creates ``<REPO_DIR>/.tmp/finder/cache`` and ``<REPO_DIR>/.tmp/finder/tmp``
    and points the cache/temp environment variables (and ``tempfile``'s
    default directory) at them.
    """
    runtime_dir = REPO_DIR / ".tmp" / "finder"
    cache_dir = runtime_dir / "cache"
    temp_dir = runtime_dir / "tmp"
    cache_dir.mkdir(parents=True, exist_ok=True)
    temp_dir.mkdir(parents=True, exist_ok=True)

    os.environ["XDG_CACHE_HOME"] = str(cache_dir)
    # Set every temp-dir variable name so the redirect applies whichever one
    # a given tool or platform consults.
    os.environ["TMPDIR"] = str(temp_dir)
    os.environ["TEMP"] = str(temp_dir)
    os.environ["TMP"] = str(temp_dir)
    # Also override the module-level default directly, in addition to the
    # environment variables above.
    tempfile.tempdir = str(temp_dir)


def parse_args() -> argparse.Namespace:
    """Parse the command-line options for a manual scrape run.

    Returns:
        The parsed namespace with ``source``, ``output_dir``,
        ``limit_outcodes``, ``max_properties_per_source`` and ``test``.
    """
    parser = argparse.ArgumentParser(
        description="Run a manual Greater London-ish property scrape."
    )
    parser.add_argument(
        "--source",
        choices=SOURCE_CHOICES,
        default="all",
        help="Portal to scrape. 'all' runs Rightmove and Zoopla.",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=DATA_DIR,
        help=f"Directory for parquet output. Defaults to {DATA_DIR}.",
    )
    parser.add_argument(
        "--limit-outcodes",
        type=int,
        default=None,
        help="Limit outcodes for a quick manual smoke test.",
    )
    parser.add_argument(
        "--max-properties-per-source",
        type=int,
        default=None,
        help="Stop each source after this many transformed listings.",
    )
    parser.add_argument(
        "--test",
        action="store_true",
        help=(
            "Run a small standalone smoke test: use likely London outcodes and "
            f"fetch at most {TEST_MAX_PROPERTIES_PER_SOURCE} listings per source."
        ),
    )
    return parser.parse_args()


def configure_logging() -> None:
    """Set up INFO-level root logging and quieten the HTTP client loggers."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )
    # Raise the threshold for the HTTP libraries so only warnings and above
    # from them reach the output.
    logging.getLogger("httpx").setLevel(logging.WARNING)
    logging.getLogger("httpcore").setLevel(logging.WARNING)


def selected_sources(source: str) -> list[str]:
    """Expand the ``--source`` choice into the list of portals to scrape.

    Args:
        source: One of ``SOURCE_CHOICES``.

    Returns:
        ``["rightmove", "zoopla"]`` for ``"all"``; otherwise a single-item
        list containing ``source``.
    """
    if source == "all":
        return ["rightmove", "zoopla"]
    return [source]


def main() -> int:
    """Run one scrape according to the command-line options.

    Returns:
        ``0`` when the scrape completes.

    Raises:
        SystemExit: If a numeric limit is below one, if no outcodes are
            available, or if a ``--test`` run reports errors in its result.
    """
    args = parse_args()
    configure_standalone_runtime()
    configure_logging()

    if args.limit_outcodes is not None and args.limit_outcodes < 1:
        raise SystemExit("--limit-outcodes must be greater than zero")
    if (
        args.max_properties_per_source is not None
        and args.max_properties_per_source < 1
    ):
        raise SystemExit("--max-properties-per-source must be greater than zero")

    output_dir = args.output_dir.expanduser().resolve()
    # Test runs that did not override --output-dir write to a "test"
    # subdirectory of the default data directory instead.
    if args.test and args.output_dir == DATA_DIR:
        output_dir = (DATA_DIR / "test").expanduser().resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): imported here rather than at module top, i.e. only after
    # configure_standalone_runtime() has run -- presumably so the scraper's
    # dependencies see the redirected cache/temp locations; confirm before
    # moving this import.
    from scraper import (
        build_postcode_coords,
        build_postcode_index,
        load_outcodes,
        run_scrape,
    )

    outcodes = load_outcodes()
    if args.test and args.limit_outcodes is None:
        # Build the membership set once instead of once per candidate.
        available = set(outcodes)
        preferred = [outcode for outcode in TEST_OUTCODES if outcode in available]
        # Fall back to the full loaded list when none of the preferred
        # outcodes are present.
        if preferred:
            outcodes = preferred
    if args.limit_outcodes is not None:
        outcodes = outcodes[: args.limit_outcodes]
    if not outcodes:
        raise SystemExit("No Greater London-ish outcodes loaded; nothing to scrape.")

    sources = selected_sources(args.source)
    max_properties_per_source = args.max_properties_per_source
    # An explicit --max-properties-per-source wins over the test default.
    if args.test and max_properties_per_source is None:
        max_properties_per_source = TEST_MAX_PROPERTIES_PER_SOURCE

    log.info(
        "Starting sale scrape: source=%s outcodes=%d output_dir=%s test=%s",
        args.source,
        len(outcodes),
        output_dir,
        args.test,
    )
    started = time.monotonic()

    pc_index = build_postcode_index()
    # Postcode coordinates are only built when Zoopla is among the sources.
    pc_coords = build_postcode_coords() if "zoopla" in sources else None
    result = run_scrape(
        outcodes,
        pc_index,
        pc_coords=pc_coords,
        sources=sources,
        output_dir=output_dir,
        max_properties_per_source=max_properties_per_source,
    )

    elapsed = time.monotonic() - started
    log.info("Scrape finished in %.1fs", elapsed)
    log.info("Result: %s", result)

    # Only test runs turn reported errors into a failing exit; a regular run
    # still returns 0 after logging the result.
    if args.test and result.get("errors"):
        raise SystemExit("Test scrape failed; see errors in the result above.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())