scraping and data

This commit is contained in:
Andras Schmelczer 2026-05-31 15:36:33 +01:00
parent d98819b569
commit 8688b7475e
43 changed files with 4920 additions and 531 deletions

View file

@ -205,6 +205,41 @@ def extract_full_postcode(text: str | None) -> str | None:
return normalize_postcode(match.group(1))
def extract_outcode(postcode: str | None) -> str | None:
"""Return the outward code (district) of a UK postcode, e.g. 'SW1A 1AA''SW1A'."""
if not postcode:
return None
normalized = normalize_postcode(postcode)
outcode = normalized.split(" ", 1)[0]
return outcode or None
def resolve_listing_postcode(
extracted_postcode: str | None, inferred_postcode: str
) -> tuple[str, str]:
"""Pick the authoritative postcode for a listing, returning (postcode, source).
The address-extracted postcode is more precise than the coordinate-nearest one,
but it is only trustworthy when it agrees with the location: a stale, mistyped or
well-formed-but-fabricated postcode (e.g. 'ZZ9 9ZZ') would otherwise silently
override the spatially-correct value. Since the spatial index only supports
nearest-lookup, accept the extracted postcode only when its outcode matches the
inferred (coordinate-nearest) postcode's outcode; otherwise fall back to the
inferred one, which is always a real, plausibly-correct postcode.
"""
if extracted_postcode and extract_outcode(extracted_postcode) == extract_outcode(
inferred_postcode
):
return extracted_postcode, "address"
if extracted_postcode:
log.debug(
"Rejecting extracted postcode %s (outcode mismatch with inferred %s)",
extracted_postcode,
inferred_postcode,
)
return inferred_postcode, "coordinates"
def clean_listing_address(address: str | None) -> str:
"""Remove postcode/outcode suffixes from listing display addresses.
@ -222,10 +257,48 @@ def clean_listing_address(address: str | None) -> str:
return cleaned.strip(" ,")
def build_register_address(
    raw_address: str | None, number_or_name: str | None = None
) -> str:
    """Build a Property Register-style address, prepending the house number/name.

    Listing display addresses are usually street-level ("South Street, Bromley")
    because the portals hide the exact unit. When a scraper can recover the
    property's own number or name (e.g. Zoopla detail pages expose
    ``propertyNumberOrName`` = "12" or "Martham Mill"), prepend it so the address
    carries the house identifier that the EPC/Price-Paid register addresses also
    use - turning a fuzzy street match into a near-exact one.

    Args:
        raw_address: The listing's display address; it is passed through
            ``clean_listing_address`` before use.
        number_or_name: The property's house number or name, if known.
            ``None``, empty and whitespace-only values are treated as absent.

    Returns:
        The cleaned address, prefixed with ``"<number_or_name>, "`` unless the
        cleaned address already starts with that identifier as a whole token.
        If the cleaned address is empty, the identifier alone is returned.
    """
    cleaned = clean_listing_address(raw_address)

    identifier = (number_or_name or "").strip()
    if not identifier:
        return cleaned

    # Avoid duplicating a number/name the display address already starts with.
    # The match must end on a token boundary: a bare prefix test would treat
    # "1" as already present in "10 Downing Street" (or "South" in
    # "South Street") and silently drop the house identifier.
    cleaned_lower = cleaned.lower()
    identifier_lower = identifier.lower()
    if cleaned_lower.startswith(identifier_lower):
        # Slice the lowered string so the offset stays valid even when
        # lower-casing changes the character count.
        boundary = len(identifier_lower)
        following = cleaned_lower[boundary : boundary + 1]
        # "" (end of string) and punctuation/space are boundaries; a letter or
        # digit means the address merely shares a prefix with the identifier.
        if not following.isalnum():
            return cleaned

    return f"{identifier}, {cleaned}" if cleaned else identifier
def transform_property(
prop: dict, outcode: str, pc_index: PostcodeSpatialIndex
prop: dict,
outcode: str,
pc_index: PostcodeSpatialIndex,
detail_postcode: str | None = None,
) -> dict | None:
"""Transform a raw Rightmove property dict into our output schema."""
"""Transform a raw Rightmove property dict into our output schema.
``detail_postcode`` is the property's TRUE full postcode recovered from its
detail page (see ``rightmove.parse_detail_postcode``); the search API itself
only exposes the outcode-level ``displayAddress``. When supplied and it
agrees with the coordinate-nearest postcode's outcode, it is preferred over
the coordinate guess and recorded with source ``"detail_address"``. A
detail postcode whose outcode disagrees with the location is discarded in
favour of the spatially-correct coordinate postcode, so a stale or wrong
detail value can never silently relocate a listing.
"""
loc = prop.get("location")
if not loc:
return None
@ -268,8 +341,25 @@ def transform_property(
return None
raw_address = prop.get("displayAddress", "") or ""
extracted_postcode = extract_full_postcode(raw_address)
postcode = extracted_postcode or inferred_postcode
postcode_source = "address" if extracted_postcode else "coordinates"
# Prefer the detail page's true full postcode when it agrees with the
# location; otherwise fall back to the (display-address-or-coordinate) logic.
detail_full = extract_full_postcode(detail_postcode)
if detail_full and extract_outcode(detail_full) == extract_outcode(
inferred_postcode
):
postcode, postcode_source = detail_full, "detail_address"
else:
if detail_full:
log.debug(
"Rejecting Rightmove detail postcode %s (outcode mismatch with "
"inferred %s)",
detail_full,
inferred_postcode,
)
postcode, postcode_source = resolve_listing_postcode(
extracted_postcode, inferred_postcode
)
property_url = prop.get("propertyUrl") or ""
if not isinstance(property_url, str):
@ -291,6 +381,9 @@ def transform_property(
"Inferred postcode": inferred_postcode,
"Listing raw address": raw_address,
"Address per Property Register": clean_listing_address(raw_address),
# Rightmove's displayAddress is street-level; no UPRN/house number.
"UPRN": None,
"Property number or name": None,
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
"Property type": map_property_type(sub_type),
"Property sub-type": normalize_sub_type(sub_type),