Fix scrape

This commit is contained in:
Andras Schmelczer 2026-03-26 07:54:39 +00:00
parent bbc2fcb86c
commit 3adbaf435d
2 changed files with 131 additions and 27 deletions

View file

@ -18,6 +18,17 @@ log = logging.getLogger("rightmove")
# Outcode ID cache (Rightmove typeahead → internal ID)
outcode_cache: dict[str, str] = {}
# Rightmove hard-caps pagination at index 1008 (42 pages × 24 results).
# Requesting index >= 1008 returns HTTP 400.
_MAX_INDEX = 1008
# Property type filters for splitting overcapped searches. Each sub-query
# gets its own 1008 cap, so we can recover listings beyond the unfiltered limit.
_PROPERTY_TYPES = [
"detached", "semi-detached", "terraced", "flat",
"bungalow", "park-home", "land",
]
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
"""Look up Rightmove's internal ID for an outcode via typeahead API."""
@ -40,16 +51,18 @@ def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
return None
def search_outcode(
def _paginate(
client: httpx.Client,
outcode_id: str,
outcode: str,
channel_cfg: dict,
pc_index: PostcodeSpatialIndex,
) -> list[dict]:
"""Paginate through search results for one outcode+channel. Returns transformed properties."""
extra_params: dict | None = None,
) -> tuple[list[dict], int]:
"""Paginate through search results. Returns (properties, result_count)."""
properties = []
index = 0
result_count = 0
while True:
params = {
@ -60,6 +73,8 @@ def search_outcode(
"channel": channel_cfg["channel"],
"transactionType": channel_cfg["transactionType"],
}
if extra_params:
params.update(extra_params)
data = fetch_with_retry(client, SEARCH_URL, params)
if not data:
@ -90,4 +105,52 @@ def search_outcode(
time.sleep(DELAY_BETWEEN_PAGES)
return properties
return properties, result_count
def search_outcode(
client: httpx.Client,
outcode_id: str,
outcode: str,
channel_cfg: dict,
pc_index: PostcodeSpatialIndex,
) -> list[dict]:
"""Paginate through search results for one outcode+channel. Returns transformed properties.
When the unfiltered result count exceeds 1008 (Rightmove's hard pagination cap),
re-queries per property type to recover listings beyond the cap.
"""
properties, result_count = _paginate(
client, outcode_id, outcode, channel_cfg, pc_index
)
if result_count <= _MAX_INDEX:
return properties
# Hit the 1008 cap — re-search per property type to get full coverage
ch = channel_cfg["channel"]
log.info(
"%s/%s: %d results exceed %d cap, splitting by property type",
outcode, ch, result_count, _MAX_INDEX,
)
all_by_id: dict[str, dict] = {p["id"]: p for p in properties}
for pt in _PROPERTY_TYPES:
pt_props, _ = _paginate(
client, outcode_id, outcode, channel_cfg, pc_index,
extra_params={"propertyTypes": pt},
)
new = 0
for p in pt_props:
if p["id"] not in all_by_id:
all_by_id[p["id"]] = p
new += 1
if new:
log.debug("%s/%s type=%s: +%d new properties", outcode, ch, pt, new)
log.info(
"%s/%s: type split recovered %d%d properties",
outcode, ch, len(properties), len(all_by_id),
)
return list(all_by_id.values())