All good
Some checks failed
CI / Check (push) Has been cancelled
Build and publish Docker image / build-and-push (push) Has been cancelled

This commit is contained in:
Andras Schmelczer 2026-05-18 21:20:10 +01:00
parent 6ea544a0f6
commit 6cc7288126
45 changed files with 929 additions and 1043 deletions

View file

@ -14,12 +14,7 @@ from constants import (
LONDON_OUTCODE_PREFIXES,
)
from homecouk import CookiesExpiredError
from homecouk import load_cookies as load_homecouk_cookies
from homecouk import make_client as make_homecouk_client
from homecouk import search_outcode as homecouk_search_outcode
from http_client import make_client
from listing_filters import matches_strict_buy_listing_filter
from rightmove import resolve_outcode_id
from rightmove import search_outcode as rightmove_search_outcode
from spatial import PostcodeSpatialIndex
@ -30,7 +25,7 @@ from zoopla import search_outcode as zoopla_search_outcode
log = logging.getLogger("rightmove")
SOURCE_ORDER = ("rightmove", "homecouk", "zoopla")
SOURCE_ORDER = ("rightmove", "zoopla")
SALE_CHANNEL = CHANNELS[0]
LONDON_AREAS = sorted({prefix.upper() for prefix in LONDON_OUTCODE_PREFIXES})
OUTCODE_RE = re.compile(r"^([A-Z]{1,2}\d[A-Z0-9]?)")
@ -260,16 +255,7 @@ def _store_properties(
dropped_outside_area,
)
eligible = [prop for prop in londonish if matches_strict_buy_listing_filter(prop)]
dropped_non_matching = len(londonish) - len(eligible)
if dropped_non_matching:
log.debug(
"%s dropped %d properties outside the strict buy-listing filters",
source,
dropped_non_matching,
)
selected = eligible if remaining is None else eligible[:remaining]
selected = londonish if remaining is None else londonish[:remaining]
results[source].extend(selected)
return len(selected)
@ -290,6 +276,8 @@ def _launch_zoopla_with_retries(attempts: int = 3):
for attempt in range(1, attempts + 1):
try:
return launch_zoopla_browser()
except TurnstileError:
raise
except Exception as exc:
last_error = exc
log.warning(
@ -304,13 +292,6 @@ def _launch_zoopla_with_retries(attempts: int = 3):
raise last_error
def _new_homecouk_client():
    """Build a home.co.uk client from the locally loaded cookie data.

    Returns:
        The object produced by ``make_homecouk_client`` when
        ``load_homecouk_cookies`` yields a truthy value, otherwise ``None``.
    """
    session_args = load_homecouk_cookies()
    # The loaded value is unpacked positionally into the client factory.
    return make_homecouk_client(*session_args) if session_args else None
def _scrape_rightmove(
outcodes: list[str],
pc_index: PostcodeSpatialIndex,
@ -368,74 +349,6 @@ def _scrape_rightmove(
client.close()
def _scrape_homecouk(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    results: dict[str, list[dict]],
    errors: list[str],
    max_properties_per_source: int | None,
) -> None:
    """Scrape home.co.uk one outcode at a time and store the listings.

    Args:
        outcodes: Outcodes to search, in order.
        pc_index: Spatial index forwarded to ``homecouk_search_outcode``.
        results: Per-source accumulator; listings are added under the
            ``"homecouk"`` key via ``_store_properties``.
        errors: Error accumulator filled through ``_record_error``.
        max_properties_per_source: Cap passed to ``_source_remaining`` and
            ``_store_properties``; ``None`` is passed through unchanged.

    Each outcode gets at most two attempts: on the first
    ``CookiesExpiredError`` the client is closed and rebuilt, on the second
    the error is recorded and the outcode is abandoned. Any other exception
    is recorded and the outcode is abandoned immediately. The function
    returns early when the source cap is reached or when the session cannot
    be rebuilt.
    """
    client = _new_homecouk_client()
    if client is None:
        log.warning("home.co.uk skipped: could not bootstrap a local session")
        return

    try:
        for outcode in outcodes:
            if _source_remaining(results, "homecouk", max_properties_per_source) == 0:
                log.info("home.co.uk cap reached")
                return

            for attempt in range(2):
                try:
                    # home.co.uk cannot express the full filter set at source.
                    # Fetch the outcode page set first; _store_properties applies
                    # the strict filter and source cap after transformation.
                    props = homecouk_search_outcode(
                        client,
                        outcode,
                        pc_index,
                        max_properties=None,
                    )
                    added = _store_properties(
                        results,
                        "homecouk",
                        props,
                        max_properties_per_source,
                    )
                    log.info("home.co.uk %s: +%d", outcode, added)
                    break
                except CookiesExpiredError as exc:
                    if attempt == 1:
                        # Second expiry in a row: give up on this outcode.
                        _record_error(errors, "homecouk", outcode, exc)
                        break
                    log.warning(
                        "home.co.uk cookies expired at %s; refreshing local session",
                        outcode,
                    )
                    try:
                        client.close()
                    except Exception:
                        # Best-effort close of the stale client; a failure here
                        # must not stop the refresh, but leave a trace of it.
                        log.debug(
                            "home.co.uk stale client close failed",
                            exc_info=True,
                        )
                    client = _new_homecouk_client()
                    if client is None:
                        _record_error(
                            errors,
                            "homecouk",
                            outcode,
                            RuntimeError("could not refresh local session"),
                        )
                        return
                except Exception as exc:
                    _record_error(errors, "homecouk", outcode, exc)
                    break

            time.sleep(DELAY_BETWEEN_OUTCODES)
    finally:
        # ``client`` is None when the mid-run refresh above failed; closing it
        # unconditionally would raise AttributeError and mask the recorded error.
        if client is not None:
            client.close()
def _scrape_zoopla(
outcodes: list[str],
pc_index: PostcodeSpatialIndex,
@ -459,9 +372,8 @@ def _scrape_zoopla(
for attempt in range(2):
try:
# Zoopla source-side filters are unverified here. Fetch the
# outcode page set first; _store_properties applies the
# strict filter and source cap after transformation.
# Fetch the outcode page set first; _store_properties applies
# the London-ish postcode filter and source cap after transformation.
props, _ = zoopla_search_outcode(
page,
outcode,
@ -539,15 +451,6 @@ def run_scrape(
max_properties_per_source,
)
if "homecouk" in selected_sources:
_scrape_homecouk(
selected_outcodes,
pc_index,
results,
errors,
max_properties_per_source,
)
if "zoopla" in selected_sources:
if pc_coords is None:
pc_coords = build_postcode_coords()
@ -567,20 +470,10 @@ def run_scrape(
else:
if output_path.exists():
output_path.unlink()
log.warning("No strict properties to write to %s", output_path)
filtered = [prop for prop in merged if matches_strict_buy_listing_filter(prop)]
filtered_output_path = output_base / "online_listings_buy_filtered.parquet"
if filtered:
write_parquet(filtered, filtered_output_path)
else:
if filtered_output_path.exists():
filtered_output_path.unlink()
log.warning("No strict-filtered properties to write to %s", filtered_output_path)
log.warning("No London-ish properties to write to %s", output_path)
counts = {
"total": len(merged),
"filtered_total": len(filtered),
"deduped": deduped,
"sources": source_counts,
}
@ -588,9 +481,8 @@ def run_scrape(
f"{source}:{source_counts[source]}" for source in SOURCE_ORDER
)
log.info(
"Sale scrape complete: %d unique, %d strict-filtered (%s deduped:%d)",
"Sale scrape complete: %d unique (%s deduped:%d)",
len(merged),
len(filtered),
source_summary,
deduped,
)
@ -603,7 +495,6 @@ def run_scrape(
},
"counts": counts,
"path": str(output_path),
"filtered_path": str(filtered_output_path),
"errors": errors,
"elapsed_seconds": round(time.time() - started_at, 3),
}