Remove finder

This commit is contained in:
Andras Schmelczer 2026-04-04 22:59:28 +01:00
parent 55238f59aa
commit cd778dd088
26 changed files with 0 additions and 57826 deletions

View file

@ -1,156 +0,0 @@
import logging
import time
import httpx
from constants import (
PAGE_SIZE,
DELAY_BETWEEN_PAGES,
SEARCH_URL,
TYPEAHEAD_URL,
)
from http_client import fetch_with_retry
from spatial import PostcodeSpatialIndex
from transform import transform_property
log = logging.getLogger("rightmove")
# Outcode ID cache (Rightmove typeahead → internal ID)
outcode_cache: dict[str, str] = {}
# Rightmove hard-caps pagination at index 1008 (42 pages × 24 results).
# Requesting index >= 1008 returns HTTP 400.
_MAX_INDEX = 1008
# Property type filters for splitting overcapped searches. Each sub-query
# gets its own 1008 cap, so we can recover listings beyond the unfiltered limit.
_PROPERTY_TYPES = [
"detached", "semi-detached", "terraced", "flat",
"bungalow", "park-home", "land",
]
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
"""Look up Rightmove's internal ID for an outcode via typeahead API."""
if outcode in outcode_cache:
return outcode_cache[outcode]
data = fetch_with_retry(
client, TYPEAHEAD_URL, {"query": outcode, "limit": "10", "exclude": "STREET"}
)
if not data:
return None
for match in data.get("matches", []):
if match.get("type") == "OUTCODE" and match.get("displayName") == outcode:
rid = str(match["id"])
outcode_cache[outcode] = rid
return rid
log.debug("Outcode %s not found in typeahead results", outcode)
return None
def _paginate(
client: httpx.Client,
outcode_id: str,
outcode: str,
channel_cfg: dict,
pc_index: PostcodeSpatialIndex,
extra_params: dict | None = None,
) -> tuple[list[dict], int]:
"""Paginate through search results. Returns (properties, result_count)."""
properties = []
index = 0
result_count = 0
while True:
params = {
"useLocationIdentifier": "true",
"locationIdentifier": f"OUTCODE^{outcode_id}",
"index": str(index),
"sortType": channel_cfg["sortType"],
"channel": channel_cfg["channel"],
"transactionType": channel_cfg["transactionType"],
}
if extra_params:
params.update(extra_params)
data = fetch_with_retry(client, SEARCH_URL, params)
if not data:
log.warning(
"Failed to fetch index %d for %s/%s",
index,
outcode,
channel_cfg["channel"],
)
break
raw_props = data.get("properties", [])
if not raw_props:
break
for prop in raw_props:
transformed = transform_property(prop, outcode, pc_index)
if transformed:
properties.append(transformed)
# Check if there are more pages
result_count_str = data.get("resultCount", "0")
result_count = int(result_count_str.replace(",", ""))
index += PAGE_SIZE
if index >= result_count:
break
time.sleep(DELAY_BETWEEN_PAGES)
return properties, result_count
def search_outcode(
client: httpx.Client,
outcode_id: str,
outcode: str,
channel_cfg: dict,
pc_index: PostcodeSpatialIndex,
) -> list[dict]:
"""Paginate through search results for one outcode+channel. Returns transformed properties.
When the unfiltered result count exceeds 1008 (Rightmove's hard pagination cap),
re-queries per property type to recover listings beyond the cap.
"""
properties, result_count = _paginate(
client, outcode_id, outcode, channel_cfg, pc_index
)
if result_count <= _MAX_INDEX:
return properties
# Hit the 1008 cap — re-search per property type to get full coverage
ch = channel_cfg["channel"]
log.info(
"%s/%s: %d results exceed %d cap, splitting by property type",
outcode, ch, result_count, _MAX_INDEX,
)
all_by_id: dict[str, dict] = {p["id"]: p for p in properties}
for pt in _PROPERTY_TYPES:
pt_props, _ = _paginate(
client, outcode_id, outcode, channel_cfg, pc_index,
extra_params={"propertyTypes": pt},
)
new = 0
for p in pt_props:
if p["id"] not in all_by_id:
all_by_id[p["id"]] = p
new += 1
if new:
log.debug("%s/%s type=%s: +%d new properties", outcode, ch, pt, new)
log.info(
"%s/%s: type split recovered %d%d properties",
outcode, ch, len(properties), len(all_by_id),
)
return list(all_by_id.values())