Fix scrape
This commit is contained in:
parent
bbc2fcb86c
commit
3adbaf435d
2 changed files with 131 additions and 27 deletions
|
|
@ -18,6 +18,17 @@ log = logging.getLogger("rightmove")
|
|||
# Outcode ID cache (Rightmove typeahead → internal ID)
|
||||
outcode_cache: dict[str, str] = {}
|
||||
|
||||
# Rightmove refuses to paginate past result index 1008, which is 42 pages of
# 24 results each; asking for index >= 1008 yields HTTP 400.
_MAX_INDEX = 42 * 24

# Property-type filter values used to split a search that overflows the cap.
# The cap applies to every filtered sub-query separately, so querying per type
# reaches listings that the unfiltered search cannot return.
_PROPERTY_TYPES = [
    "detached",
    "semi-detached",
    "terraced",
    "flat",
    "bungalow",
    "park-home",
    "land",
]
|
||||
|
||||
|
||||
def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
|
||||
"""Look up Rightmove's internal ID for an outcode via typeahead API."""
|
||||
|
|
@ -40,16 +51,18 @@ def resolve_outcode_id(client: httpx.Client, outcode: str) -> str | None:
|
|||
return None
|
||||
|
||||
|
||||
def search_outcode(
|
||||
def _paginate(
|
||||
client: httpx.Client,
|
||||
outcode_id: str,
|
||||
outcode: str,
|
||||
channel_cfg: dict,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
) -> list[dict]:
|
||||
"""Paginate through search results for one outcode+channel. Returns transformed properties."""
|
||||
extra_params: dict | None = None,
|
||||
) -> tuple[list[dict], int]:
|
||||
"""Paginate through search results. Returns (properties, result_count)."""
|
||||
properties = []
|
||||
index = 0
|
||||
result_count = 0
|
||||
|
||||
while True:
|
||||
params = {
|
||||
|
|
@ -60,6 +73,8 @@ def search_outcode(
|
|||
"channel": channel_cfg["channel"],
|
||||
"transactionType": channel_cfg["transactionType"],
|
||||
}
|
||||
if extra_params:
|
||||
params.update(extra_params)
|
||||
|
||||
data = fetch_with_retry(client, SEARCH_URL, params)
|
||||
if not data:
|
||||
|
|
@ -90,4 +105,52 @@ def search_outcode(
|
|||
|
||||
time.sleep(DELAY_BETWEEN_PAGES)
|
||||
|
||||
return properties
|
||||
return properties, result_count
|
||||
|
||||
|
||||
def search_outcode(
    client: httpx.Client,
    outcode_id: str,
    outcode: str,
    channel_cfg: dict,
    pc_index: PostcodeSpatialIndex,
) -> list[dict]:
    """Collect transformed properties for one outcode+channel.

    Runs one unfiltered paginated search first. If the reported result count
    exceeds ``_MAX_INDEX`` (Rightmove's hard pagination cap), the search is
    repeated once per entry in ``_PROPERTY_TYPES`` and the results are merged,
    de-duplicated on each property's ``"id"`` key. The first occurrence of an
    id wins, so unfiltered results take precedence over per-type ones.

    A per-type sub-query can itself exceed the cap; that case cannot be
    recovered here, so it is reported with a warning instead of being
    truncated silently.

    Args:
        client: HTTP client passed through to ``_paginate``.
        outcode_id: Rightmove's internal identifier for the outcode.
        outcode: Outcode text, used for the search and for log messages.
        channel_cfg: Channel settings; ``channel_cfg["channel"]`` is read
            here for logging, the rest is consumed by ``_paginate``.
        pc_index: Postcode spatial index passed through to ``_paginate``.

    Returns:
        The merged list of transformed property dicts.
    """
    properties, result_count = _paginate(
        client, outcode_id, outcode, channel_cfg, pc_index
    )

    # Everything fits under the cap: the unfiltered pass already saw it all.
    if result_count <= _MAX_INDEX:
        return properties

    # Hit the 1008 cap — re-search per property type to get full coverage
    ch = channel_cfg["channel"]
    log.info(
        "%s/%s: %d results exceed %d cap, splitting by property type",
        outcode, ch, result_count, _MAX_INDEX,
    )

    all_by_id: dict[str, dict] = {p["id"]: p for p in properties}

    for pt in _PROPERTY_TYPES:
        pt_props, pt_count = _paginate(
            client, outcode_id, outcode, channel_cfg, pc_index,
            extra_params={"propertyTypes": pt},
        )

        # The cap applies to each sub-query too; surface any remaining loss.
        if pt_count > _MAX_INDEX:
            log.warning(
                "%s/%s type=%s: %d results still exceed %d cap, "
                "some listings will be missed",
                outcode, ch, pt, pt_count, _MAX_INDEX,
            )

        before = len(all_by_id)
        for p in pt_props:
            # Keep the entry already stored for this id, if any.
            all_by_id.setdefault(p["id"], p)
        new = len(all_by_id) - before
        if new:
            log.debug("%s/%s type=%s: +%d new properties", outcode, ch, pt, new)

    log.info(
        "%s/%s: type split recovered %d → %d properties",
        outcode, ch, len(properties), len(all_by_id),
    )
    return list(all_by_id.values())
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ class TurnstileError(Exception):
|
|||
|
||||
|
||||
# Maximum search result pages to scrape per outcode (25 listings/page)
|
||||
MAX_PAGES_PER_OUTCODE = 10
|
||||
MAX_PAGES_PER_OUTCODE = 40
|
||||
|
||||
# JavaScript to extract listings from the rendered DOM.
|
||||
# Uses data-testid attributes as primary selectors (stable across deployments),
|
||||
|
|
@ -98,6 +98,12 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
else if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||
|
||||
// Extract property type (e.g., "2 bed flat for sale" → "flat")
|
||||
let property_type = '';
|
||||
const ptMatch = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
|
||||
if (ptMatch) property_type = ptMatch[1].trim();
|
||||
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
|
||||
|
||||
results.push({
|
||||
id, url: href.replace(window.location.origin, ''),
|
||||
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
||||
|
|
@ -106,7 +112,7 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||
address, tenure,
|
||||
address, tenure, property_type,
|
||||
});
|
||||
}
|
||||
|
||||
|
|
@ -160,6 +166,12 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
else if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||
|
||||
// Extract property type
|
||||
let property_type = '';
|
||||
const ptMatch2 = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
|
||||
if (ptMatch2) property_type = ptMatch2[1].trim();
|
||||
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
|
||||
|
||||
results.push({
|
||||
id, url: href.replace(window.location.origin, ''),
|
||||
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
||||
|
|
@ -168,7 +180,7 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||
address, tenure,
|
||||
address, tenure, property_type,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
@ -557,6 +569,32 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]:
|
|||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Cached outcode → (postcode, lat, lng) lookups to avoid repeated O(n) scans
|
||||
# over 2.26M postcodes. Populated lazily on first lookup per outcode.
|
||||
_outcode_coords_cache: dict[str, tuple[str, float, float] | None] = {}
|
||||
|
||||
|
||||
def _resolve_outcode_coords(
|
||||
outcode: str, pc_coords: dict[str, tuple[float, float]]
|
||||
) -> tuple[str, float, float] | None:
|
||||
"""Find first postcode + coords for an outcode. Result is cached."""
|
||||
if outcode in _outcode_coords_cache:
|
||||
return _outcode_coords_cache[outcode]
|
||||
|
||||
prefix = outcode + " "
|
||||
for pcd, (lat, lng) in pc_coords.items():
|
||||
if pcd.startswith(prefix) or (
|
||||
len(outcode) >= 4
|
||||
and pcd.startswith(outcode)
|
||||
and len(pcd) > len(outcode)
|
||||
):
|
||||
_outcode_coords_cache[outcode] = (pcd, lat, lng)
|
||||
return (pcd, lat, lng)
|
||||
|
||||
_outcode_coords_cache[outcode] = None
|
||||
return None
|
||||
|
||||
|
||||
def _extract_postcode(text: str) -> str | None:
|
||||
"""Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'."""
|
||||
match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE)
|
||||
|
|
@ -585,11 +623,17 @@ def _map_property_type(raw_type: str | None) -> str:
|
|||
"""Map Zoopla property type text to canonical type."""
|
||||
if not raw_type:
|
||||
return "Other"
|
||||
# Exact match (handles Rightmove-style capitalised values)
|
||||
canonical = PROPERTY_TYPE_MAP.get(raw_type)
|
||||
if canonical:
|
||||
return canonical
|
||||
# Title-case match (handles regex-extracted lowercase like "town house" → "Town House")
|
||||
canonical = PROPERTY_TYPE_MAP.get(raw_type.title())
|
||||
if canonical:
|
||||
return canonical
|
||||
# Keyword fallback
|
||||
lower = raw_type.lower()
|
||||
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower:
|
||||
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower:
|
||||
return "Flats/Maisonettes"
|
||||
if "detached" in lower and "semi" not in lower:
|
||||
return "Detached"
|
||||
|
|
@ -622,6 +666,7 @@ def transform_property(
|
|||
channel: str,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
pc_coords: dict[str, tuple[float, float]],
|
||||
search_outcode: str | None = None,
|
||||
) -> dict | None:
|
||||
"""Transform a raw Zoopla listing dict into the standard output schema.
|
||||
|
||||
|
|
@ -643,22 +688,18 @@ def transform_property(
|
|||
lat, lng = coords
|
||||
|
||||
if lat is None:
|
||||
# Try outcode-level fallback
|
||||
outcode = _extract_outcode(address)
|
||||
if outcode:
|
||||
# ONSPD 7-char format: 4-char outcodes have no space before incode
|
||||
# (e.g., "BH191AB"), while shorter outcodes do (e.g., "E14 5AB").
|
||||
# Check both formats to handle all outcode lengths.
|
||||
prefix = outcode + " "
|
||||
for pcd, coords in pc_coords.items():
|
||||
if pcd.startswith(prefix) or (
|
||||
len(outcode) >= 4
|
||||
and pcd.startswith(outcode)
|
||||
and len(pcd) > len(outcode)
|
||||
):
|
||||
postcode = pcd
|
||||
lat, lng = coords
|
||||
break
|
||||
# Try outcode-level fallback from address text
|
||||
addr_outcode = _extract_outcode(address)
|
||||
if addr_outcode:
|
||||
result = _resolve_outcode_coords(addr_outcode, pc_coords)
|
||||
if result:
|
||||
postcode, lat, lng = result
|
||||
|
||||
# Final fallback: use the outcode we know we're searching
|
||||
if lat is None and search_outcode:
|
||||
result = _resolve_outcode_coords(search_outcode, pc_coords)
|
||||
if result:
|
||||
postcode, lat, lng = result
|
||||
|
||||
if lat is None or lng is None or not postcode:
|
||||
return None
|
||||
|
|
@ -706,8 +747,8 @@ def transform_property(
|
|||
"Postcode": postcode,
|
||||
"Address per Property Register": address,
|
||||
"Leasehold/Freehold": raw.get("tenure") or None,
|
||||
"Property type": "Other", # Not reliably extractable from Zoopla search cards
|
||||
"Property sub-type": "",
|
||||
"Property type": _map_property_type(raw.get("property_type")),
|
||||
"Property sub-type": raw.get("property_type") or "",
|
||||
"price": int(price),
|
||||
"price_frequency": frequency,
|
||||
"Price qualifier": "",
|
||||
|
|
@ -774,7 +815,7 @@ def search_outcode(
|
|||
properties = []
|
||||
dropped = 0
|
||||
for raw in raw_listings:
|
||||
transformed = transform_property(raw, channel, pc_index, pc_coords)
|
||||
transformed = transform_property(raw, channel, pc_index, pc_coords, search_outcode=outcode)
|
||||
if transformed:
|
||||
properties.append(transformed)
|
||||
zoopla_properties_scraped.labels(channel=channel_label).inc()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue