This commit is contained in:
Andras Schmelczer 2026-05-28 21:48:35 +01:00
parent 39ef5c6646
commit c995f12f8b
78 changed files with 4830 additions and 1619 deletions

View file

@ -37,7 +37,13 @@ from constants import (
ZOOPLA_BASE,
)
from spatial import PostcodeSpatialIndex
from transform import normalize_sub_type, parse_int_value, validate_floor_area
from transform import (
clean_listing_address,
extract_full_postcode,
normalize_sub_type,
parse_int_value,
validate_floor_area,
)
log = logging.getLogger("zoopla")
@ -1031,19 +1037,6 @@ def _resolve_outcode_coords(
return None
def _extract_postcode(text: str) -> str | None:
"""Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'.
Normalizes to include a space before the 3-char incode."""
match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE)
if match:
raw = match.group(1).upper().strip()
# Ensure space before incode (last 3 chars): "SW1A1AA" → "SW1A 1AA"
if " " not in raw and len(raw) >= 5:
return raw[:-3] + " " + raw[-3:]
return raw
return None
def _extract_outcode(text: str) -> str | None:
"""Extract a UK outcode from address text like 'Whitechapel Road, London E1'."""
# Look for outcode at end of string or after last comma
@ -1123,10 +1116,12 @@ def transform_property(
from postcodes extracted from the address text."""
price = parse_int_value(raw.get("price")) or 0
address = raw.get("address", "")
address = raw.get("address", "") or ""
# Resolve postcode and coordinates from address
postcode = _extract_postcode(address)
extracted_postcode = extract_full_postcode(address)
postcode = extracted_postcode
postcode_source = "address" if extracted_postcode else None
lat = lng = None
if postcode:
@ -1141,12 +1136,14 @@ def transform_property(
result = _resolve_outcode_coords(addr_outcode, pc_coords)
if result:
postcode, lat, lng = result
postcode_source = "address_outcode"
# Final fallback: use the outcode we know we're searching
if lat is None and search_outcode:
result = _resolve_outcode_coords(search_outcode, pc_coords)
if result:
postcode, lat, lng = result
postcode_source = "search_outcode"
if lat is None or lng is None or not postcode:
return None
@ -1189,7 +1186,11 @@ def transform_property(
"lon": lng,
"lat": lat,
"Postcode": postcode,
"Address per Property Register": address,
"Postcode source": postcode_source or "unknown",
"Extracted postcode": extracted_postcode,
"Inferred postcode": postcode if postcode_source != "address" else None,
"Listing raw address": address,
"Address per Property Register": clean_listing_address(address),
"Leasehold/Freehold": raw.get("tenure") or None,
"Property type": _map_property_type(raw.get("property_type")),
"Property sub-type": normalize_sub_type(raw.get("property_type")),