168 lines
4.7 KiB
Python
168 lines
4.7 KiB
Python
import argparse
|
|
import logging
|
|
import os
|
|
import tempfile
|
|
import time
|
|
from pathlib import Path
|
|
|
|
from constants import DATA_DIR, REPO_DIR
|
|
|
|
|
|
# Values accepted by --source; "all" is expanded by selected_sources().
SOURCE_CHOICES = ("rightmove", "zoopla", "all")

# Per-source listing cap that --test applies when no explicit cap is given.
TEST_MAX_PROPERTIES_PER_SOURCE = 100

# Outcodes that --test prefers when they are present in the loaded outcodes.
TEST_OUTCODES = (
    "E1", "N1", "NW1", "SE1", "SW1",
    "W1", "WC1", "BR1", "CR0", "TW1",
)

log = logging.getLogger("finder")
|
|
|
|
|
|
def configure_standalone_runtime() -> None:
    """Keep browser/cache/temp files on the project volume for local runs.

    Creates ``REPO_DIR/.tmp/finder/cache`` and ``REPO_DIR/.tmp/finder/tmp``
    if they are missing, then points ``XDG_CACHE_HOME``, ``TMPDIR``, ``TEMP``,
    ``TMP`` and ``tempfile.tempdir`` at them.
    """
    base = REPO_DIR / ".tmp" / "finder"
    cache_path = base / "cache"
    tmp_path = base / "tmp"
    for directory in (cache_path, tmp_path):
        directory.mkdir(parents=True, exist_ok=True)

    tmp_str = str(tmp_path)
    os.environ["XDG_CACHE_HOME"] = str(cache_path)
    # All three temp-dir variables are set to the same location.
    for variable in ("TMPDIR", "TEMP", "TMP"):
        os.environ[variable] = tmp_str
    # Overriding tempfile.tempdir directly sets the module's default directory.
    tempfile.tempdir = tmp_str
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for a manual scrape run.

    Returns:
        Namespace with ``source``, ``output_dir``, ``limit_outcodes``,
        ``max_properties_per_source`` and ``test`` attributes.
    """
    cli = argparse.ArgumentParser(
        description="Run a manual Greater London-ish property scrape."
    )

    cli.add_argument(
        "--source",
        default="all",
        choices=SOURCE_CHOICES,
        help="Portal to scrape. 'all' runs Rightmove and Zoopla.",
    )

    output_help = f"Directory for parquet output. Defaults to {DATA_DIR}."
    cli.add_argument("--output-dir", type=Path, default=DATA_DIR, help=output_help)

    # Both numeric limits default to None, meaning "no limit requested".
    cli.add_argument(
        "--limit-outcodes",
        default=None,
        type=int,
        help="Limit outcodes for a quick manual smoke test.",
    )
    cli.add_argument(
        "--max-properties-per-source",
        default=None,
        type=int,
        help="Stop each source after this many transformed listings.",
    )

    test_help = (
        "Run a small standalone smoke test: use likely London outcodes and "
        f"fetch at most {TEST_MAX_PROPERTIES_PER_SOURCE} listings per source."
    )
    cli.add_argument("--test", action="store_true", help=test_help)

    namespace = cli.parse_args()
    return namespace
|
|
|
|
|
|
def configure_logging() -> None:
    """Configure INFO-level root logging and quieten the HTTP client loggers."""
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(message)s",
        level=logging.INFO,
    )
    # Cap the HTTP client loggers at WARNING so their INFO/DEBUG records are dropped.
    for noisy_logger in ("httpx", "httpcore"):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)
|
|
|
|
|
|
def selected_sources(source: str) -> list[str]:
    """Expand a ``--source`` value into the list of portals to scrape.

    Args:
        source: ``"all"`` or the name of a single portal.

    Returns:
        ``["rightmove", "zoopla"]`` when ``source`` is ``"all"``; otherwise a
        one-element list containing ``source`` unchanged.
    """
    return ["rightmove", "zoopla"] if source == "all" else [source]
|
|
|
|
|
|
def main() -> int:
    """Run one manual scrape from the command line.

    Steps, in order: parse arguments, redirect cache/temp directories,
    configure logging, validate the numeric limits, resolve and create the
    output directory, choose the outcodes, run the scrape, then log the
    elapsed time and the result.

    Returns:
        ``0`` when the run completes.

    Raises:
        SystemExit: if ``--limit-outcodes`` or ``--max-properties-per-source``
            is below 1, if no outcodes are available, or if ``--test`` is set
            and the scrape result reports errors.
    """
    args = parse_args()
    configure_standalone_runtime()
    configure_logging()

    if args.limit_outcodes is not None and args.limit_outcodes < 1:
        raise SystemExit("--limit-outcodes must be greater than zero")
    if (
        args.max_properties_per_source is not None
        and args.max_properties_per_source < 1
    ):
        raise SystemExit("--max-properties-per-source must be greater than zero")

    output_dir = args.output_dir.expanduser().resolve()
    # In test mode with the default output location, write into a "test"
    # subdirectory instead of DATA_DIR itself.
    if args.test and args.output_dir == DATA_DIR:
        output_dir = (DATA_DIR / "test").expanduser().resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): imported here rather than at module top, presumably so the
    # runtime environment configured above is in place first - confirm.
    from scraper import (
        build_postcode_coords,
        build_postcode_index,
        load_outcodes,
        run_scrape,
    )

    outcodes = load_outcodes()
    if args.test and args.limit_outcodes is None:
        # Build the lookup set once rather than once per TEST_OUTCODES entry.
        available = set(outcodes)
        preferred = [outcode for outcode in TEST_OUTCODES if outcode in available]
        # Keep the full loaded list if none of the preferred outcodes exist.
        if preferred:
            outcodes = preferred
    if args.limit_outcodes is not None:
        outcodes = outcodes[: args.limit_outcodes]

    if not outcodes:
        raise SystemExit("No Greater London-ish outcodes loaded; nothing to scrape.")

    sources = selected_sources(args.source)
    max_properties_per_source = args.max_properties_per_source
    # An explicit --max-properties-per-source wins over the test-mode default.
    if args.test and max_properties_per_source is None:
        max_properties_per_source = TEST_MAX_PROPERTIES_PER_SOURCE

    log.info(
        "Starting sale scrape: source=%s outcodes=%d output_dir=%s test=%s",
        args.source,
        len(outcodes),
        output_dir,
        args.test,
    )
    started = time.monotonic()

    pc_index = build_postcode_index()
    # Postcode coordinates are only built when Zoopla is among the sources.
    pc_coords = build_postcode_coords() if "zoopla" in sources else None
    result = run_scrape(
        outcodes,
        pc_index,
        pc_coords=pc_coords,
        sources=sources,
        output_dir=output_dir,
        max_properties_per_source=max_properties_per_source,
    )

    elapsed = time.monotonic() - started
    log.info("Scrape finished in %.1fs", elapsed)
    log.info("Result: %s", result)
    # Only test runs turn reported errors into a failing exit.
    if args.test and result.get("errors"):
        raise SystemExit("Test scrape failed; see errors in the result above.")
    return 0
|
|
|
|
|
|
# Script entry point: hand main()'s return value to the interpreter as the exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|