all is well
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 7m0s
CI / Check (push) Failing after 7m9s

This commit is contained in:
Andras Schmelczer 2026-05-17 17:20:19 +01:00
parent eac1bd0d13
commit 2f149503bb
53 changed files with 1543 additions and 354 deletions

View file

@ -19,6 +19,7 @@ from homecouk import load_cookies as load_homecouk_cookies
from homecouk import make_client as make_homecouk_client
from homecouk import search_outcode as homecouk_search_outcode
from http_client import make_client
from listing_filters import matches_strict_buy_listing_filter
from rightmove import resolve_outcode_id
from rightmove import search_outcode as rightmove_search_outcode
from spatial import PostcodeSpatialIndex
@ -181,11 +182,11 @@ def _source_names(sources: str | Iterable[str] | None) -> list[str]:
requested = [str(source).strip().lower() for source in sources]
requested = [source for source in requested if source]
if "all" in requested:
return list(SOURCE_ORDER)
unknown = sorted(set(requested) - set(SOURCE_ORDER))
unknown = sorted(set(requested) - set(SOURCE_ORDER) - {"all"})
if unknown:
raise ValueError(f"Unknown source(s): {', '.join(unknown)}")
if "all" in requested:
return list(SOURCE_ORDER)
return [source for source in SOURCE_ORDER if source in requested]
@ -196,19 +197,28 @@ def _dedup_key(prop: dict) -> tuple:
def _merge_properties(source_results: dict[str, list[dict]]) -> tuple[list[dict], dict, int]:
merged: dict[str, dict] = {}
seen_keys: set[tuple] = set()
seen_ids: set[str] = set()
counts = {source: 0 for source in SOURCE_ORDER}
deduped = 0
for source in SOURCE_ORDER:
for prop in source_results.get(source, []):
prop_id = prop.get("id")
key = _dedup_key(prop)
if (prop_id is not None and prop_id in merged) or key in seen_keys:
deduped += 1
continue
storage_key = prop_id if prop_id is not None else f"{source}:{len(merged)}"
if prop_id is not None:
prop_id = str(prop_id)
if prop_id in seen_ids:
deduped += 1
continue
seen_ids.add(prop_id)
storage_key = prop_id
else:
key = _dedup_key(prop)
if key in seen_keys:
deduped += 1
continue
seen_keys.add(key)
storage_key = f"{source}:{len(merged)}"
merged[storage_key] = prop
seen_keys.add(key)
counts[source] += 1
return list(merged.values()), counts, deduped
@ -241,13 +251,22 @@ def _store_properties(
if remaining == 0:
return 0
eligible = [prop for prop in props if _property_is_londonish(prop)]
dropped = len(props) - len(eligible)
if dropped:
londonish = [prop for prop in props if _property_is_londonish(prop)]
dropped_outside_area = len(props) - len(londonish)
if dropped_outside_area:
log.debug(
"%s dropped %d properties outside the Greater London-ish postcode filter",
source,
dropped,
dropped_outside_area,
)
eligible = [prop for prop in londonish if matches_strict_buy_listing_filter(prop)]
dropped_non_matching = len(londonish) - len(eligible)
if dropped_non_matching:
log.debug(
"%s dropped %d properties outside the strict buy-listing filters",
source,
dropped_non_matching,
)
selected = eligible if remaining is None else eligible[:remaining]
@ -367,20 +386,16 @@ def _scrape_homecouk(
log.info("home.co.uk cap reached")
return
remaining = _source_remaining(
results, "homecouk", max_properties_per_source
)
if remaining == 0:
log.info("home.co.uk cap reached")
return
for attempt in range(2):
try:
# home.co.uk cannot express the full filter set at source.
# Fetch the outcode page set first; _store_properties applies
# the strict filter and source cap after transformation.
props = homecouk_search_outcode(
client,
outcode,
pc_index,
max_properties=remaining,
max_properties=None,
)
added = _store_properties(
results,
@ -442,19 +457,17 @@ def _scrape_zoopla(
log.info("Zoopla cap reached")
return
remaining = _source_remaining(results, "zoopla", max_properties_per_source)
if remaining == 0:
log.info("Zoopla cap reached")
return
for attempt in range(2):
try:
# Zoopla source-side filters are unverified here. Fetch the
# outcode page set first; _store_properties applies the
# strict filter and source cap after transformation.
props, _ = zoopla_search_outcode(
page,
outcode,
pc_index,
pc_coords,
max_properties=remaining,
max_properties=None,
)
added = _store_properties(
results,
@ -506,9 +519,6 @@ def run_scrape(
output_base = Path(output_dir) if output_dir is not None else DATA_DIR
output_base.mkdir(parents=True, exist_ok=True)
if "zoopla" in selected_sources and pc_coords is None:
pc_coords = build_postcode_coords()
errors: list[str] = []
results = {source: [] for source in SOURCE_ORDER}
started_at = time.time()
@ -539,7 +549,8 @@ def run_scrape(
)
if "zoopla" in selected_sources:
assert pc_coords is not None
if pc_coords is None:
pc_coords = build_postcode_coords()
_scrape_zoopla(
selected_outcodes,
pc_index,
@ -551,19 +562,36 @@ def run_scrape(
merged, source_counts, deduped = _merge_properties(results)
output_path = output_base / "online_listings_buy.parquet"
write_parquet(merged, output_path)
if merged:
write_parquet(merged, output_path)
else:
if output_path.exists():
output_path.unlink()
log.warning("No strict properties to write to %s", output_path)
filtered = [prop for prop in merged if matches_strict_buy_listing_filter(prop)]
filtered_output_path = output_base / "online_listings_buy_filtered.parquet"
if filtered:
write_parquet(filtered, filtered_output_path)
else:
if filtered_output_path.exists():
filtered_output_path.unlink()
log.warning("No strict-filtered properties to write to %s", filtered_output_path)
counts = {
"total": len(merged),
"filtered_total": len(filtered),
"deduped": deduped,
"sources": source_counts,
}
source_summary = " ".join(
f"{source}:{source_counts[source]}" for source in SOURCE_ORDER
)
log.info(
"Sale scrape complete: %d unique (rightmove:%d homecouk:%d zoopla:%d deduped:%d)",
"Sale scrape complete: %d unique, %d strict-filtered (%s deduped:%d)",
len(merged),
source_counts["rightmove"],
source_counts["homecouk"],
source_counts["zoopla"],
len(filtered),
source_summary,
deduped,
)
@ -575,6 +603,7 @@ def run_scrape(
},
"counts": counts,
"path": str(output_path),
"filtered_path": str(filtered_output_path),
"errors": errors,
"elapsed_seconds": round(time.time() - started_at, 3),
}