This commit is contained in:
Andras Schmelczer 2026-05-28 21:48:35 +01:00
parent 39ef5c6646
commit c995f12f8b
78 changed files with 4830 additions and 1619 deletions

View file

@ -29,6 +29,8 @@ from constants import (
)
from spatial import PostcodeSpatialIndex
from transform import (
clean_listing_address,
extract_full_postcode,
fix_coords,
map_property_type,
normalize_sub_type,
@ -177,9 +179,13 @@ def transform_property(
if not (49 <= lat <= 56 and -7 <= lng <= 2):
return None
postcode = pc_index.nearest(lat, lng)
if not postcode:
inferred_postcode = pc_index.nearest(lat, lng)
if not inferred_postcode:
return None
raw_address = raw.get("address", "") or ""
extracted_postcode = extract_full_postcode(raw_address)
postcode = extracted_postcode or inferred_postcode
postcode_source = "address" if extracted_postcode else "coordinates"
raw_beds = raw.get("bedrooms") or 0
raw_baths = raw.get("bathrooms") or 0
@ -212,7 +218,11 @@ def transform_property(
"lon": lng,
"lat": lat,
"Postcode": postcode,
"Address per Property Register": raw.get("address", ""),
"Postcode source": postcode_source,
"Extracted postcode": extracted_postcode,
"Inferred postcode": inferred_postcode,
"Listing raw address": raw_address,
"Address per Property Register": clean_listing_address(raw_address),
"Leasehold/Freehold": _extract_tenure(features),
"Property type": map_property_type(sub_type),
"Property sub-type": normalize_sub_type(sub_type),

View file

@ -105,6 +105,24 @@ def write_parquet(properties: list[dict], path: Path) -> None:
"lon": [p["lon"] for p in properties],
"lat": [p["lat"] for p in properties],
"Postcode": [normalize_postcode(p["Postcode"]) for p in properties],
"Postcode source": [p.get("Postcode source", "") for p in properties],
"Extracted postcode": [
normalize_postcode(p["Extracted postcode"])
if p.get("Extracted postcode")
else None
for p in properties
],
"Inferred postcode": [
normalize_postcode(p["Inferred postcode"])
if p.get("Inferred postcode")
else None
for p in properties
],
"Listing raw address": [
p.get("Listing raw address")
or p.get("Address per Property Register", "")
for p in properties
],
"Address per Property Register": [
p["Address per Property Register"] for p in properties
],
@ -126,6 +144,10 @@ def write_parquet(properties: list[dict], path: Path) -> None:
"lon": pl.Float64,
"lat": pl.Float64,
"Postcode": pl.Utf8,
"Postcode source": pl.Utf8,
"Extracted postcode": pl.Utf8,
"Inferred postcode": pl.Utf8,
"Listing raw address": pl.Utf8,
"Address per Property Register": pl.Utf8,
"Leasehold/Freehold": pl.Utf8,
"Property type": pl.Utf8,

48
finder/test_transform.py Normal file
View file

@ -0,0 +1,48 @@
from transform import (
clean_listing_address,
extract_full_postcode,
transform_property,
)
class StubPostcodeIndex:
    """Test double for the postcode index that always answers "SW1A 9ZZ"."""

    def nearest(self, lat: float, lng: float) -> str:
        """Return the canned postcode, whatever coordinates are passed."""
        del lat, lng  # the stub deliberately ignores its inputs
        return "SW1A 9ZZ"
def test_extract_full_postcode_normalizes_spacing() -> None:
    """A postcode is returned with one space; text without one yields None."""
    spaced_results = {
        "10 Downing Street SW1A2AA": "SW1A 2AA",
        "10 Downing Street, SW1A 2AA": "SW1A 2AA",
    }
    for address, expected_postcode in spaced_results.items():
        assert extract_full_postcode(address) == expected_postcode

    assert extract_full_postcode("Downing Street, Westminster") is None
def test_clean_listing_address_removes_postcode_and_outcode_suffixes() -> None:
    """Trailing postcodes/outcodes are dropped; plain addresses are untouched."""
    cases = [
        # (listing display address, expected cleaned address)
        ("10 Downing Street, SW1A 2AA", "10 Downing Street"),
        ("Hawthorne Road, Bromley, Kent, BR1", "Hawthorne Road, Bromley, Kent"),
        ("Kings Avenue, Bromley", "Kings Avenue, Bromley"),
    ]
    for display_address, expected in cases:
        assert clean_listing_address(display_address) == expected
def test_rightmove_transform_prefers_postcode_from_display_address() -> None:
    """The postcode written in the display address beats the coordinate lookup.

    The stub index always answers "SW1A 9ZZ", so any other value in the
    "Postcode" column must have come from the address text.
    """
    listing = {
        "id": "123",
        "location": {"latitude": 51.5, "longitude": -0.1},
        "price": {"amount": 750000, "displayPrices": []},
        "propertySubType": "Terraced",
        "bedrooms": 3,
        "bathrooms": 1,
        "keyFeatures": [],
        "propertyUrl": "/properties/123",
        "displayAddress": "Flat 2, 10 Downing Street, SW1A 2AA",
    }
    expected_columns = {
        "Postcode": "SW1A 2AA",
        "Postcode source": "address",
        "Extracted postcode": "SW1A 2AA",
        "Inferred postcode": "SW1A 9ZZ",
        "Listing raw address": "Flat 2, 10 Downing Street, SW1A 2AA",
        "Address per Property Register": "Flat 2, 10 Downing Street",
    }

    row = transform_property(listing, "SW1A", StubPostcodeIndex())

    assert row is not None
    for column, expected_value in expected_columns.items():
        assert row[column] == expected_value

View file

@ -14,6 +14,18 @@ log = logging.getLogger("rightmove")
# UK mansions.
MIN_FLOOR_AREA_SQM = 5.0
MAX_FLOOR_AREA_SQM = 2000.0
# A full UK postcode (outcode + incode, separator whitespace optional)
# anywhere in a string; group 1 holds the postcode exactly as written.
FULL_POSTCODE_RE = re.compile(
    r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b",
    flags=re.I,
)

# A full postcode at the very end of a string, together with the comma and
# whitespace that precede it.
TRAILING_FULL_POSTCODE_RE = re.compile(
    r"(?:,?\s*)\b[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}\b\s*$",
    flags=re.I,
)

# A bare outcode (e.g. "BR1") at the very end of a string, together with the
# comma and whitespace that precede it.
TRAILING_OUTCODE_RE = re.compile(
    r"(?:,?\s*)\b[A-Z]{1,2}\d[A-Z\d]?\b\s*$",
    flags=re.I,
)
def validate_floor_area(sqm: float | None) -> float | None:
@ -184,6 +196,32 @@ def normalize_postcode(postcode: str) -> str:
return compact[:-3] + " " + compact[-3:]
def extract_full_postcode(text: str | None) -> str | None:
    """Return the first full UK postcode found in *text*, normalized.

    The match comes from ``FULL_POSTCODE_RE`` and is passed through
    ``normalize_postcode`` before being returned. ``None`` is returned when
    *text* is empty/``None`` or contains no full postcode.
    """
    found = FULL_POSTCODE_RE.search(text) if text else None
    return normalize_postcode(found.group(1)) if found else None
def clean_listing_address(address: str | None) -> str:
    """Strip a trailing postcode or outcode from a listing display address.

    Listing sites often append tokens such as "..., BR1" or "..., SW1A 1AA"
    to the public address. The fuzzy matcher reads those as fake address
    numbers, so the raw address is kept separately and this cleaned value is
    the one used for matching.

    Returns an empty string for an empty or ``None`` address.
    """
    if not address:
        return ""

    text = str(address).strip()
    # Remove a full postcode first, then any bare outcode left at the end.
    for trailing_pattern in (TRAILING_FULL_POSTCODE_RE, TRAILING_OUTCODE_RE):
        text = trailing_pattern.sub("", text)

    # Collapse whitespace runs, then put exactly ", " at every comma.
    single_spaced = re.sub(r"\s+", " ", text)
    comma_spaced = re.sub(r"\s*,\s*", ", ", single_spaced)
    return comma_spaced.strip(" ,")
def transform_property(
prop: dict, outcode: str, pc_index: PostcodeSpatialIndex
) -> dict | None:
@ -224,10 +262,14 @@ def transform_property(
if kf.get("description")
]
postcode = pc_index.nearest(lat, lng)
if not postcode:
inferred_postcode = pc_index.nearest(lat, lng)
if not inferred_postcode:
log.debug("No England postcode for property at %.4f, %.4f — skipping", lat, lng)
return None
raw_address = prop.get("displayAddress", "") or ""
extracted_postcode = extract_full_postcode(raw_address)
postcode = extracted_postcode or inferred_postcode
postcode_source = "address" if extracted_postcode else "coordinates"
property_url = prop.get("propertyUrl") or ""
if not isinstance(property_url, str):
@ -244,7 +286,11 @@ def transform_property(
"lon": lng,
"lat": lat,
"Postcode": postcode,
"Address per Property Register": prop.get("displayAddress", ""),
"Postcode source": postcode_source,
"Extracted postcode": extracted_postcode,
"Inferred postcode": inferred_postcode,
"Listing raw address": raw_address,
"Address per Property Register": clean_listing_address(raw_address),
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
"Property type": map_property_type(sub_type),
"Property sub-type": normalize_sub_type(sub_type),

View file

@ -37,7 +37,13 @@ from constants import (
ZOOPLA_BASE,
)
from spatial import PostcodeSpatialIndex
from transform import normalize_sub_type, parse_int_value, validate_floor_area
from transform import (
clean_listing_address,
extract_full_postcode,
normalize_sub_type,
parse_int_value,
validate_floor_area,
)
log = logging.getLogger("zoopla")
@ -1031,19 +1037,6 @@ def _resolve_outcode_coords(
return None
def _extract_postcode(text: str) -> str | None:
"""Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'.
Normalizes to include a space before the 3-char incode."""
match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE)
if match:
raw = match.group(1).upper().strip()
# Ensure space before incode (last 3 chars): "SW1A1AA" → "SW1A 1AA"
if " " not in raw and len(raw) >= 5:
return raw[:-3] + " " + raw[-3:]
return raw
return None
def _extract_outcode(text: str) -> str | None:
"""Extract a UK outcode from address text like 'Whitechapel Road, London E1'."""
# Look for outcode at end of string or after last comma
@ -1123,10 +1116,12 @@ def transform_property(
from postcodes extracted from the address text."""
price = parse_int_value(raw.get("price")) or 0
address = raw.get("address", "")
address = raw.get("address", "") or ""
# Resolve postcode and coordinates from address
postcode = _extract_postcode(address)
extracted_postcode = extract_full_postcode(address)
postcode = extracted_postcode
postcode_source = "address" if extracted_postcode else None
lat = lng = None
if postcode:
@ -1141,12 +1136,14 @@ def transform_property(
result = _resolve_outcode_coords(addr_outcode, pc_coords)
if result:
postcode, lat, lng = result
postcode_source = "address_outcode"
# Final fallback: use the outcode we know we're searching
if lat is None and search_outcode:
result = _resolve_outcode_coords(search_outcode, pc_coords)
if result:
postcode, lat, lng = result
postcode_source = "search_outcode"
if lat is None or lng is None or not postcode:
return None
@ -1189,7 +1186,11 @@ def transform_property(
"lon": lng,
"lat": lat,
"Postcode": postcode,
"Address per Property Register": address,
"Postcode source": postcode_source or "unknown",
"Extracted postcode": extracted_postcode,
"Inferred postcode": postcode if postcode_source != "address" else None,
"Listing raw address": address,
"Address per Property Register": clean_listing_address(address),
"Leasehold/Freehold": raw.get("tenure") or None,
"Property type": _map_property_type(raw.get("property_type")),
"Property sub-type": normalize_sub_type(raw.get("property_type")),