This commit is contained in:
Andras Schmelczer 2026-05-28 21:48:35 +01:00
parent 39ef5c6646
commit c995f12f8b
78 changed files with 4830 additions and 1619 deletions

View file

@ -14,6 +14,18 @@ log = logging.getLogger("rightmove")
# UK mansions.
MIN_FLOOR_AREA_SQM = 5.0
MAX_FLOOR_AREA_SQM = 2000.0
# Full UK postcode anywhere in a string, case-insensitive, with the space
# between the two halves optional. Group 1 captures the postcode as written.
FULL_POSTCODE_RE = re.compile(
    r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b", flags=re.IGNORECASE
)

# Full postcode sitting at the very end of a string, together with an optional
# preceding comma and any surrounding whitespace.
TRAILING_FULL_POSTCODE_RE = re.compile(
    r"(?:,?\s*)\b[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}\b\s*$", flags=re.IGNORECASE
)

# Outcode-shaped token (e.g. "BR1", "SW1A") at the very end of a string,
# together with an optional preceding comma and any surrounding whitespace.
TRAILING_OUTCODE_RE = re.compile(
    r"(?:,?\s*)\b[A-Z]{1,2}\d[A-Z\d]?\b\s*$", flags=re.IGNORECASE
)
def validate_floor_area(sqm: float | None) -> float | None:
@ -184,6 +196,32 @@ def normalize_postcode(postcode: str) -> str:
return compact[:-3] + " " + compact[-3:]
def extract_full_postcode(text: str | None) -> str | None:
    """Return the first full postcode found in *text*, or ``None``.

    The text is searched with ``FULL_POSTCODE_RE``; the captured postcode is
    passed through ``normalize_postcode`` before being returned. Empty or
    missing text, and text without a full postcode, both yield ``None``.
    """
    found = FULL_POSTCODE_RE.search(text) if text else None
    return normalize_postcode(found.group(1)) if found else None
def clean_listing_address(address: str | None) -> str:
    """Remove postcode/outcode suffixes from listing display addresses.

    Listing sites often include "..., BR1" or "..., SW1A 1AA" in their public
    address. Those tokens add fake address numbers to the fuzzy matcher, so keep
    the raw address separately and use this cleaned value for matching.

    Args:
        address: Display address as published by the listing site; may be
            ``None`` or empty.

    Returns:
        The address without a trailing full postcode and/or outcode, with
        whitespace collapsed to single spaces, commas normalised to ", " and
        no leading/trailing spaces or commas. Returns "" for falsy input.
    """
    if not address:
        return ""
    cleaned = str(address).strip()
    # The suffix patterns are anchored at end-of-string, so dangling
    # separators ("..., BR1," or "..., SW1A 1AA , ") would hide the postcode
    # from them. Drop trailing commas/whitespace before matching.
    cleaned = re.sub(r"[\s,]+$", "", cleaned)
    cleaned = TRAILING_FULL_POSTCODE_RE.sub("", cleaned)
    # NOTE: this pass also runs after a full postcode has been removed, so an
    # outcode-shaped final token left behind (e.g. "A1") is stripped as well.
    cleaned = TRAILING_OUTCODE_RE.sub("", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned)
    cleaned = re.sub(r"\s*,\s*", ", ", cleaned)
    return cleaned.strip(" ,")
def transform_property(
prop: dict, outcode: str, pc_index: PostcodeSpatialIndex
) -> dict | None:
@ -224,10 +262,14 @@ def transform_property(
if kf.get("description")
]
postcode = pc_index.nearest(lat, lng)
if not postcode:
inferred_postcode = pc_index.nearest(lat, lng)
if not inferred_postcode:
log.debug("No England postcode for property at %.4f, %.4f — skipping", lat, lng)
return None
raw_address = prop.get("displayAddress", "") or ""
extracted_postcode = extract_full_postcode(raw_address)
postcode = extracted_postcode or inferred_postcode
postcode_source = "address" if extracted_postcode else "coordinates"
property_url = prop.get("propertyUrl") or ""
if not isinstance(property_url, str):
@ -244,7 +286,11 @@ def transform_property(
"lon": lng,
"lat": lat,
"Postcode": postcode,
"Address per Property Register": prop.get("displayAddress", ""),
"Postcode source": postcode_source,
"Extracted postcode": extracted_postcode,
"Inferred postcode": inferred_postcode,
"Listing raw address": raw_address,
"Address per Property Register": clean_listing_address(raw_address),
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
"Property type": map_property_type(sub_type),
"Property sub-type": normalize_sub_type(sub_type),