156 lines
4.5 KiB
Python
156 lines
4.5 KiB
Python
import logging
|
||
import time
|
||
|
||
import httpx
|
||
|
||
from constants import (
|
||
PAGE_SIZE,
|
||
DELAY_BETWEEN_PAGES,
|
||
SEARCH_URL,
|
||
TYPEAHEAD_URL,
|
||
)
|
||
from http_client import fetch_with_retry
|
||
from spatial import PostcodeSpatialIndex
|
||
from transform import transform_property
|
||
|
||
# Logger used by every function in this module.
log = logging.getLogger("rightmove")

# Memo of outcode -> Rightmove internal ID, populated from typeahead lookups.
outcode_cache: dict[str, str] = {}

# Pagination ceiling imposed by Rightmove: 42 pages x 24 results = 1008.
# Any request with index >= 1008 is answered with HTTP 400.
_MAX_INDEX = 1008

# Property-type filters used to split a search that exceeds the ceiling.
# The 1008 limit applies to each filtered sub-query on its own, so querying
# per type lets us reach listings the unfiltered search cannot page to.
_PROPERTY_TYPES = [
    "detached",
    "semi-detached",
    "terraced",
    "flat",
    "bungalow",
    "park-home",
    "land",
]
|
||
|
||
|
||
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
    """Return Rightmove's internal ID for ``outcode``, or ``None`` if unknown.

    The module-level ``outcode_cache`` is consulted first. On a miss the
    typeahead endpoint is queried and the first match whose ``type`` is
    ``"OUTCODE"`` and whose ``displayName`` equals ``outcode`` exactly is
    used; its ``id`` is stringified, cached and returned.

    ``None`` is returned (and nothing is cached) when the fetch yields a
    falsy result or when no match qualifies.
    """
    # Cache hit: skip the network round-trip entirely.
    try:
        return outcode_cache[outcode]
    except KeyError:
        pass

    query = {"query": outcode, "limit": "10", "exclude": "STREET"}
    payload = fetch_with_retry(client, TYPEAHEAD_URL, query)
    if not payload:
        return None

    # Typeahead can return several kinds of location; keep only an exact
    # outcode hit.
    hit = next(
        (
            candidate
            for candidate in payload.get("matches", [])
            if candidate.get("type") == "OUTCODE"
            and candidate.get("displayName") == outcode
        ),
        None,
    )
    if hit is None:
        log.debug("Outcode %s not found in typeahead results", outcode)
        return None

    resolved = str(hit["id"])
    outcode_cache[outcode] = resolved
    return resolved
|
||
|
||
|
||
def _paginate(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
    extra_params: dict | None = None,
) -> tuple[list[dict], int]:
    """Page through one search and collect the transformed listings.

    ``SEARCH_URL`` is requested repeatedly, advancing ``index`` by
    ``PAGE_SIZE`` and sleeping ``DELAY_BETWEEN_PAGES`` between pages. Each raw
    property is passed through ``transform_property``; falsy results are
    dropped.

    Paging stops when any of the following happens:

    * a fetch returns a falsy value (a warning is logged),
    * a page contains no ``properties``,
    * the next index reaches the reported ``resultCount``,
    * the next index reaches ``_MAX_INDEX`` (the site's pagination cap).

    Args:
        client: HTTP client handed to ``fetch_with_retry``.
        outcode_id: Rightmove internal ID, sent as ``OUTCODE^<id>``.
        outcode: Outcode string, used for logging and passed to
            ``transform_property``.
        channel_cfg: Mapping that must provide ``sortType``, ``channel`` and
            ``transactionType``.
        pc_index: Spatial index passed to ``transform_property``.
        extra_params: Optional query parameters merged over the defaults.

    Returns:
        ``(properties, result_count)`` where ``result_count`` is the
        ``resultCount`` from the last page that was parsed (0 if none was).
    """
    properties: list[dict] = []
    index = 0
    result_count = 0

    while True:
        params = {
            "useLocationIdentifier": "true",
            "locationIdentifier": f"OUTCODE^{outcode_id}",
            "index": str(index),
            "sortType": channel_cfg["sortType"],
            "channel": channel_cfg["channel"],
            "transactionType": channel_cfg["transactionType"],
        }
        if extra_params:
            params.update(extra_params)

        data = fetch_with_retry(client, SEARCH_URL, params)
        if not data:
            log.warning(
                "Failed to fetch index %d for %s/%s",
                index,
                outcode,
                channel_cfg["channel"],
            )
            break

        raw_props = data.get("properties", [])
        if not raw_props:
            break

        for prop in raw_props:
            transformed = transform_property(prop, outcode, pc_index)
            if transformed:
                properties.append(transformed)

        # resultCount normally arrives as a string with thousands separators
        # (e.g. "1,234"); coerce through str() so a numeric value also parses,
        # and treat a missing/empty value as zero.
        raw_count = data.get("resultCount") or "0"
        result_count = int(str(raw_count).replace(",", ""))
        index += PAGE_SIZE

        # Stop at the end of the results, or at the pagination cap: asking
        # for index >= _MAX_INDEX is rejected with HTTP 400, so that request
        # would only waste retries and log a misleading failure.
        if index >= result_count or index >= _MAX_INDEX:
            break

        time.sleep(DELAY_BETWEEN_PAGES)

    return properties, result_count
|
||
|
||
|
||
def search_outcode(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
) -> list[dict]:
    """Collect transformed properties for one outcode + channel.

    Runs an unfiltered paginated search first. If its reported result count
    is within ``_MAX_INDEX`` (Rightmove's hard pagination cap) those
    properties are returned directly.

    Otherwise the search is repeated once per entry in ``_PROPERTY_TYPES``
    (sent as the ``propertyTypes`` parameter) and the results are merged with
    the unfiltered ones, de-duplicated on each property's ``"id"`` key; the
    first occurrence of an id wins. A warning is logged for any per-type
    query that itself exceeds the cap, since its listings beyond the cap
    cannot be recovered by this split.

    Args:
        client: HTTP client used for all requests.
        outcode_id: Rightmove internal ID for the outcode.
        outcode: Outcode string (used for logging and transformation).
        channel_cfg: Mapping providing at least ``channel``; forwarded to
            ``_paginate``.
        pc_index: Spatial index forwarded to ``_paginate``.

    Returns:
        List of transformed property dicts.
    """
    properties, result_count = _paginate(
        client, outcode_id, outcode, channel_cfg, pc_index
    )

    if result_count <= _MAX_INDEX:
        return properties

    # Hit the 1008 cap — re-search per property type to get fuller coverage.
    ch = channel_cfg["channel"]
    log.info(
        "%s/%s: %d results exceed %d cap, splitting by property type",
        outcode, ch, result_count, _MAX_INDEX,
    )

    all_by_id: dict[str, dict] = {p["id"]: p for p in properties}

    for pt in _PROPERTY_TYPES:
        pt_props, pt_count = _paginate(
            client, outcode_id, outcode, channel_cfg, pc_index,
            extra_params={"propertyTypes": pt},
        )

        # A single type can exceed the cap too; surface that instead of
        # silently returning a truncated set.
        if pt_count > _MAX_INDEX:
            log.warning(
                "%s/%s type=%s: %d results still exceed %d cap, "
                "some listings will be missed",
                outcode, ch, pt, pt_count, _MAX_INDEX,
            )

        before = len(all_by_id)
        for p in pt_props:
            # Keep the first-seen record for an id.
            all_by_id.setdefault(p["id"], p)
        new = len(all_by_id) - before
        if new:
            log.debug("%s/%s type=%s: +%d new properties", outcode, ch, pt, new)

    log.info(
        "%s/%s: type split recovered %d → %d properties",
        outcode, ch, len(properties), len(all_by_id),
    )
    return list(all_by_id.values())
|