All good
This commit is contained in:
parent
6ea544a0f6
commit
6cc7288126
45 changed files with 929 additions and 1043 deletions
|
|
@ -14,12 +14,7 @@ from constants import (
|
|||
LONDON_OUTCODE_PREFIXES,
|
||||
)
|
||||
|
||||
from homecouk import CookiesExpiredError
|
||||
from homecouk import load_cookies as load_homecouk_cookies
|
||||
from homecouk import make_client as make_homecouk_client
|
||||
from homecouk import search_outcode as homecouk_search_outcode
|
||||
from http_client import make_client
|
||||
from listing_filters import matches_strict_buy_listing_filter
|
||||
from rightmove import resolve_outcode_id
|
||||
from rightmove import search_outcode as rightmove_search_outcode
|
||||
from spatial import PostcodeSpatialIndex
|
||||
|
|
@ -30,7 +25,7 @@ from zoopla import search_outcode as zoopla_search_outcode
|
|||
|
||||
log = logging.getLogger("rightmove")
|
||||
|
||||
SOURCE_ORDER = ("rightmove", "homecouk", "zoopla")
|
||||
SOURCE_ORDER = ("rightmove", "zoopla")
|
||||
SALE_CHANNEL = CHANNELS[0]
|
||||
LONDON_AREAS = sorted({prefix.upper() for prefix in LONDON_OUTCODE_PREFIXES})
|
||||
OUTCODE_RE = re.compile(r"^([A-Z]{1,2}\d[A-Z0-9]?)")
|
||||
|
|
@ -260,16 +255,7 @@ def _store_properties(
|
|||
dropped_outside_area,
|
||||
)
|
||||
|
||||
eligible = [prop for prop in londonish if matches_strict_buy_listing_filter(prop)]
|
||||
dropped_non_matching = len(londonish) - len(eligible)
|
||||
if dropped_non_matching:
|
||||
log.debug(
|
||||
"%s dropped %d properties outside the strict buy-listing filters",
|
||||
source,
|
||||
dropped_non_matching,
|
||||
)
|
||||
|
||||
selected = eligible if remaining is None else eligible[:remaining]
|
||||
selected = londonish if remaining is None else londonish[:remaining]
|
||||
results[source].extend(selected)
|
||||
return len(selected)
|
||||
|
||||
|
|
@ -290,6 +276,8 @@ def _launch_zoopla_with_retries(attempts: int = 3):
|
|||
for attempt in range(1, attempts + 1):
|
||||
try:
|
||||
return launch_zoopla_browser()
|
||||
except TurnstileError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
last_error = exc
|
||||
log.warning(
|
||||
|
|
@ -304,13 +292,6 @@ def _launch_zoopla_with_retries(attempts: int = 3):
|
|||
raise last_error
|
||||
|
||||
|
||||
def _new_homecouk_client():
    """Create a home.co.uk client from the locally loaded cookie data.

    Returns:
        The object returned by ``make_homecouk_client`` when
        ``load_homecouk_cookies`` yields truthy cookie data, otherwise
        ``None`` so callers can skip the source.
    """
    session_cookies = load_homecouk_cookies()
    # The cookie data is unpacked positionally into the client factory.
    return make_homecouk_client(*session_cookies) if session_cookies else None
|
||||
|
||||
|
||||
def _scrape_rightmove(
|
||||
outcodes: list[str],
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
|
|
@ -368,74 +349,6 @@ def _scrape_rightmove(
|
|||
client.close()
|
||||
|
||||
|
||||
def _scrape_homecouk(
    outcodes: list[str],
    pc_index: PostcodeSpatialIndex,
    results: dict[str, list[dict]],
    errors: list[str],
    max_properties_per_source: int | None,
) -> None:
    """Scrape home.co.uk for each outcode and store the results in place.

    Args:
        outcodes: Outcodes to search, in order.
        pc_index: Spatial index passed through to ``homecouk_search_outcode``.
        results: Per-source result lists; the ``"homecouk"`` entry is extended
            via ``_store_properties``.
        errors: Error accumulator, appended to via ``_record_error``.
        max_properties_per_source: Cap for this source, or ``None`` for no cap.

    The function returns early when no client can be bootstrapped, when the
    source cap is reached, or when the session cannot be refreshed after a
    ``CookiesExpiredError``. Each outcode gets at most two attempts: the
    second one runs only after a successful session refresh.
    """
    client = _new_homecouk_client()
    if client is None:
        log.warning("home.co.uk skipped: could not bootstrap a local session")
        return

    try:
        for outcode in outcodes:
            if _source_remaining(results, "homecouk", max_properties_per_source) == 0:
                log.info("home.co.uk cap reached")
                return

            for attempt in range(2):
                try:
                    # home.co.uk cannot express the full filter set at source.
                    # Fetch the outcode page set first; _store_properties applies
                    # the strict filter and source cap after transformation.
                    props = homecouk_search_outcode(
                        client,
                        outcode,
                        pc_index,
                        max_properties=None,
                    )
                    added = _store_properties(
                        results,
                        "homecouk",
                        props,
                        max_properties_per_source,
                    )
                    log.info("home.co.uk %s: +%d", outcode, added)
                    break
                except CookiesExpiredError as exc:
                    if attempt == 1:
                        # Already refreshed once for this outcode; give up on it.
                        _record_error(errors, "homecouk", outcode, exc)
                        break

                    log.warning(
                        "home.co.uk cookies expired at %s; refreshing local session",
                        outcode,
                    )
                    try:
                        client.close()
                    except Exception:
                        # Best effort: the stale client is being discarded anyway.
                        log.debug(
                            "home.co.uk stale client close failed",
                            exc_info=True,
                        )
                    client = _new_homecouk_client()
                    if client is None:
                        _record_error(
                            errors,
                            "homecouk",
                            outcode,
                            RuntimeError("could not refresh local session"),
                        )
                        return
                except Exception as exc:
                    _record_error(errors, "homecouk", outcode, exc)
                    break

            time.sleep(DELAY_BETWEEN_OUTCODES)
    finally:
        # client is None when a session refresh failed; an unguarded close()
        # would raise AttributeError here and mask the clean early return.
        if client is not None:
            client.close()
|
||||
|
||||
|
||||
def _scrape_zoopla(
|
||||
outcodes: list[str],
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
|
|
@ -459,9 +372,8 @@ def _scrape_zoopla(
|
|||
|
||||
for attempt in range(2):
|
||||
try:
|
||||
# Zoopla source-side filters are unverified here. Fetch the
|
||||
# outcode page set first; _store_properties applies the
|
||||
# strict filter and source cap after transformation.
|
||||
# Fetch the outcode page set first; _store_properties applies
|
||||
# the London-ish postcode filter and source cap after transformation.
|
||||
props, _ = zoopla_search_outcode(
|
||||
page,
|
||||
outcode,
|
||||
|
|
@ -539,15 +451,6 @@ def run_scrape(
|
|||
max_properties_per_source,
|
||||
)
|
||||
|
||||
if "homecouk" in selected_sources:
|
||||
_scrape_homecouk(
|
||||
selected_outcodes,
|
||||
pc_index,
|
||||
results,
|
||||
errors,
|
||||
max_properties_per_source,
|
||||
)
|
||||
|
||||
if "zoopla" in selected_sources:
|
||||
if pc_coords is None:
|
||||
pc_coords = build_postcode_coords()
|
||||
|
|
@ -567,20 +470,10 @@ def run_scrape(
|
|||
else:
|
||||
if output_path.exists():
|
||||
output_path.unlink()
|
||||
log.warning("No strict properties to write to %s", output_path)
|
||||
|
||||
filtered = [prop for prop in merged if matches_strict_buy_listing_filter(prop)]
|
||||
filtered_output_path = output_base / "online_listings_buy_filtered.parquet"
|
||||
if filtered:
|
||||
write_parquet(filtered, filtered_output_path)
|
||||
else:
|
||||
if filtered_output_path.exists():
|
||||
filtered_output_path.unlink()
|
||||
log.warning("No strict-filtered properties to write to %s", filtered_output_path)
|
||||
log.warning("No London-ish properties to write to %s", output_path)
|
||||
|
||||
counts = {
|
||||
"total": len(merged),
|
||||
"filtered_total": len(filtered),
|
||||
"deduped": deduped,
|
||||
"sources": source_counts,
|
||||
}
|
||||
|
|
@ -588,9 +481,8 @@ def run_scrape(
|
|||
f"{source}:{source_counts[source]}" for source in SOURCE_ORDER
|
||||
)
|
||||
log.info(
|
||||
"Sale scrape complete: %d unique, %d strict-filtered (%s deduped:%d)",
|
||||
"Sale scrape complete: %d unique (%s deduped:%d)",
|
||||
len(merged),
|
||||
len(filtered),
|
||||
source_summary,
|
||||
deduped,
|
||||
)
|
||||
|
|
@ -603,7 +495,6 @@ def run_scrape(
|
|||
},
|
||||
"counts": counts,
|
||||
"path": str(output_path),
|
||||
"filtered_path": str(filtered_output_path),
|
||||
"errors": errors,
|
||||
"elapsed_seconds": round(time.time() - started_at, 3),
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue