scraping and data
This commit is contained in:
parent
d98819b569
commit
8688b7475e
43 changed files with 4920 additions and 531 deletions
|
|
@ -205,6 +205,41 @@ def extract_full_postcode(text: str | None) -> str | None:
|
|||
return normalize_postcode(match.group(1))
|
||||
|
||||
|
||||
def extract_outcode(postcode: str | None) -> str | None:
|
||||
"""Return the outward code (district) of a UK postcode, e.g. 'SW1A 1AA' → 'SW1A'."""
|
||||
if not postcode:
|
||||
return None
|
||||
normalized = normalize_postcode(postcode)
|
||||
outcode = normalized.split(" ", 1)[0]
|
||||
return outcode or None
|
||||
|
||||
|
||||
def resolve_listing_postcode(
|
||||
extracted_postcode: str | None, inferred_postcode: str
|
||||
) -> tuple[str, str]:
|
||||
"""Pick the authoritative postcode for a listing, returning (postcode, source).
|
||||
|
||||
The address-extracted postcode is more precise than the coordinate-nearest one,
|
||||
but it is only trustworthy when it agrees with the location: a stale, mistyped or
|
||||
well-formed-but-fabricated postcode (e.g. 'ZZ9 9ZZ') would otherwise silently
|
||||
override the spatially-correct value. Since the spatial index only supports
|
||||
nearest-lookup, accept the extracted postcode only when its outcode matches the
|
||||
inferred (coordinate-nearest) postcode's outcode; otherwise fall back to the
|
||||
inferred one, which is always a real, plausibly-correct postcode.
|
||||
"""
|
||||
if extracted_postcode and extract_outcode(extracted_postcode) == extract_outcode(
|
||||
inferred_postcode
|
||||
):
|
||||
return extracted_postcode, "address"
|
||||
if extracted_postcode:
|
||||
log.debug(
|
||||
"Rejecting extracted postcode %s (outcode mismatch with inferred %s)",
|
||||
extracted_postcode,
|
||||
inferred_postcode,
|
||||
)
|
||||
return inferred_postcode, "coordinates"
|
||||
|
||||
|
||||
def clean_listing_address(address: str | None) -> str:
|
||||
"""Remove postcode/outcode suffixes from listing display addresses.
|
||||
|
||||
|
|
@ -222,10 +257,48 @@ def clean_listing_address(address: str | None) -> str:
|
|||
return cleaned.strip(" ,")
|
||||
|
||||
|
||||
def build_register_address(
    raw_address: str | None, number_or_name: str | None = None
) -> str:
    """Build a Property Register-style address, prepending the house number/name.

    Listing display addresses are usually street-level ("South Street, Bromley").
    When a scraper can recover the property's own number or name (e.g. "12" or
    "Martham Mill"), it is prepended so the address carries the house identifier.

    Args:
        raw_address: The listing's display address; it is cleaned with
            ``clean_listing_address`` before use.
        number_or_name: The property's house number or name, if known.
            Surrounding whitespace is ignored.

    Returns:
        ``"<number_or_name>, <cleaned address>"`` when an identifier is supplied
        and the cleaned address does not already begin with it; the cleaned
        address alone when no identifier is available or it is already present;
        the identifier alone when the cleaned address is empty.
    """
    cleaned = clean_listing_address(raw_address)
    identifier = (number_or_name or "").strip()
    if not identifier:
        return cleaned
    if not cleaned:
        return identifier

    # Avoid duplicating a number/name the display address already starts with.
    # The match must end on a token boundary: a bare prefix test would treat
    # "12" as already present in "120 South Street" and drop the house number.
    prefix_len = len(identifier)
    leading = cleaned[:prefix_len]
    following = cleaned[prefix_len : prefix_len + 1]
    if leading.lower() == identifier.lower() and not following.isalnum():
        return cleaned

    return f"{identifier}, {cleaned}"
|
||||
|
||||
|
||||
def transform_property(
|
||||
prop: dict, outcode: str, pc_index: PostcodeSpatialIndex
|
||||
prop: dict,
|
||||
outcode: str,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
detail_postcode: str | None = None,
|
||||
) -> dict | None:
|
||||
"""Transform a raw Rightmove property dict into our output schema."""
|
||||
"""Transform a raw Rightmove property dict into our output schema.
|
||||
|
||||
``detail_postcode`` is the property's TRUE full postcode recovered from its
|
||||
detail page (see ``rightmove.parse_detail_postcode``); the search API itself
|
||||
only exposes the outcode-level ``displayAddress``. When supplied and it
|
||||
agrees with the coordinate-nearest postcode's outcode, it is preferred over
|
||||
the coordinate guess and recorded with source ``"detail_address"``. A
|
||||
detail postcode whose outcode disagrees with the location is discarded in
|
||||
favour of the spatially-correct coordinate postcode, so a stale or wrong
|
||||
detail value can never silently relocate a listing.
|
||||
"""
|
||||
loc = prop.get("location")
|
||||
if not loc:
|
||||
return None
|
||||
|
|
@ -268,8 +341,25 @@ def transform_property(
|
|||
return None
|
||||
raw_address = prop.get("displayAddress", "") or ""
|
||||
extracted_postcode = extract_full_postcode(raw_address)
|
||||
postcode = extracted_postcode or inferred_postcode
|
||||
postcode_source = "address" if extracted_postcode else "coordinates"
|
||||
|
||||
# Prefer the detail page's true full postcode when it agrees with the
|
||||
# location; otherwise fall back to the (display-address-or-coordinate) logic.
|
||||
detail_full = extract_full_postcode(detail_postcode)
|
||||
if detail_full and extract_outcode(detail_full) == extract_outcode(
|
||||
inferred_postcode
|
||||
):
|
||||
postcode, postcode_source = detail_full, "detail_address"
|
||||
else:
|
||||
if detail_full:
|
||||
log.debug(
|
||||
"Rejecting Rightmove detail postcode %s (outcode mismatch with "
|
||||
"inferred %s)",
|
||||
detail_full,
|
||||
inferred_postcode,
|
||||
)
|
||||
postcode, postcode_source = resolve_listing_postcode(
|
||||
extracted_postcode, inferred_postcode
|
||||
)
|
||||
|
||||
property_url = prop.get("propertyUrl") or ""
|
||||
if not isinstance(property_url, str):
|
||||
|
|
@ -291,6 +381,9 @@ def transform_property(
|
|||
"Inferred postcode": inferred_postcode,
|
||||
"Listing raw address": raw_address,
|
||||
"Address per Property Register": clean_listing_address(raw_address),
|
||||
# Rightmove's displayAddress is street-level; no UPRN/house number.
|
||||
"UPRN": None,
|
||||
"Property number or name": None,
|
||||
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
|
||||
"Property type": map_property_type(sub_type),
|
||||
"Property sub-type": normalize_sub_type(sub_type),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue