Fix scrape
This commit is contained in:
parent
bbc2fcb86c
commit
3adbaf435d
2 changed files with 131 additions and 27 deletions
|
|
@ -18,6 +18,17 @@ log = logging.getLogger("rightmove")
|
|||
# Cache of outcode IDs (Rightmove typeahead → internal ID).
outcode_cache: dict[str, str] = {}

# Pagination ceiling: Rightmove serves at most 42 pages × 24 results, so only
# result indexes below 1008 are reachable. Asking for index >= 1008 yields
# an HTTP 400.
_MAX_INDEX = 1008

# Property-type filter values used to split a search that overflows the cap.
# The 1008 cap applies to each filtered sub-query separately, which lets the
# per-type queries surface listings the unfiltered search could not reach.
_PROPERTY_TYPES = [
    "detached",
    "semi-detached",
    "terraced",
    "flat",
    "bungalow",
    "park-home",
    "land",
]
|
||||
|
||||
|
||||
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
|
||||
"""Look up Rightmove's internal ID for an outcode via typeahead API."""
|
||||
|
|
@ -40,16 +51,18 @@ def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
|
|||
return None
|
||||
|
||||
|
||||
def search_outcode(
|
||||
def _paginate(
|
||||
client: httpx.Client,
|
||||
outcode_id: str,
|
||||
outcode: str,
|
||||
channel_cfg: dict,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
) -> list[dict]:
|
||||
"""Paginate through search results for one outcode+channel. Returns transformed properties."""
|
||||
extra_params: dict | None = None,
|
||||
) -> tuple[list[dict], int]:
|
||||
"""Paginate through search results. Returns (properties, result_count)."""
|
||||
properties = []
|
||||
index = 0
|
||||
result_count = 0
|
||||
|
||||
while True:
|
||||
params = {
|
||||
|
|
@ -60,6 +73,8 @@ def search_outcode(
|
|||
"channel": channel_cfg["channel"],
|
||||
"transactionType": channel_cfg["transactionType"],
|
||||
}
|
||||
if extra_params:
|
||||
params.update(extra_params)
|
||||
|
||||
data = fetch_with_retry(client, SEARCH_URL, params)
|
||||
if not data:
|
||||
|
|
@ -90,4 +105,52 @@ def search_outcode(
|
|||
|
||||
time.sleep(DELAY_BETWEEN_PAGES)
|
||||
|
||||
return properties
|
||||
return properties, result_count
|
||||
|
||||
|
||||
def search_outcode(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
) -> list[dict]:
    """Collect transformed properties for one outcode+channel.

    An unfiltered paginated search runs first. If the result count it
    reports is within ``_MAX_INDEX`` (Rightmove's hard pagination cap of
    1008), those properties are returned as-is.

    Otherwise the search is repeated once per entry in ``_PROPERTY_TYPES``
    (passed as the ``propertyTypes`` query parameter) and everything is
    merged into a single collection keyed by each property's ``"id"``, so a
    listing seen in more than one query is kept only once (first seen wins).

    Args:
        client: HTTP client handed through to ``_paginate``.
        outcode_id: Rightmove's internal ID for the outcode.
        outcode: The outcode itself; used here only in log messages.
        channel_cfg: Channel configuration; ``channel_cfg["channel"]`` is
            read here for logging, the rest is consumed by ``_paginate``.
        pc_index: Postcode spatial index handed through to ``_paginate``.

    Returns:
        The list of property dicts. In the split case the list is
        de-duplicated by ``"id"``.

    Raises:
        KeyError: In the split case, if a property dict has no ``"id"`` key
            or ``channel_cfg`` has no ``"channel"`` key.
    """
    properties, result_count = _paginate(
        client, outcode_id, outcode, channel_cfg, pc_index
    )

    # Everything was reachable through plain pagination; nothing to recover.
    if result_count <= _MAX_INDEX:
        return properties

    # Hit the 1008 cap — re-search per property type to get full coverage
    ch = channel_cfg["channel"]
    log.info(
        "%s/%s: %d results exceed %d cap, splitting by property type",
        outcode, ch, result_count, _MAX_INDEX,
    )

    # Seed with the unfiltered results so listings whose type matches none of
    # the filters are not lost.
    all_by_id: dict[str, dict] = {p["id"]: p for p in properties}

    for pt in _PROPERTY_TYPES:
        pt_props, pt_count = _paginate(
            client, outcode_id, outcode, channel_cfg, pc_index,
            extra_params={"propertyTypes": pt},
        )

        # A single property type can itself overflow the cap; the sub-query
        # is then truncated too, so make the gap visible instead of silent.
        if pt_count > _MAX_INDEX:
            log.warning(
                "%s/%s type=%s: %d results still exceed %d cap, "
                "some listings may be missing",
                outcode, ch, pt, pt_count, _MAX_INDEX,
            )

        before = len(all_by_id)
        for p in pt_props:
            # setdefault keeps the first-seen record for an already-known id.
            all_by_id.setdefault(p["id"], p)
        new = len(all_by_id) - before

        if new:
            log.debug("%s/%s type=%s: +%d new properties", outcode, ch, pt, new)

    log.info(
        "%s/%s: type split recovered %d → %d properties",
        outcode, ch, len(properties), len(all_by_id),
    )
    return list(all_by_id.values())
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue