vibes
This commit is contained in:
parent
39ef5c6646
commit
c995f12f8b
78 changed files with 4830 additions and 1619 deletions
|
|
@ -29,6 +29,8 @@ from constants import (
|
|||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import (
|
||||
clean_listing_address,
|
||||
extract_full_postcode,
|
||||
fix_coords,
|
||||
map_property_type,
|
||||
normalize_sub_type,
|
||||
|
|
@ -177,9 +179,13 @@ def transform_property(
|
|||
if not (49 <= lat <= 56 and -7 <= lng <= 2):
|
||||
return None
|
||||
|
||||
postcode = pc_index.nearest(lat, lng)
|
||||
if not postcode:
|
||||
inferred_postcode = pc_index.nearest(lat, lng)
|
||||
if not inferred_postcode:
|
||||
return None
|
||||
raw_address = raw.get("address", "") or ""
|
||||
extracted_postcode = extract_full_postcode(raw_address)
|
||||
postcode = extracted_postcode or inferred_postcode
|
||||
postcode_source = "address" if extracted_postcode else "coordinates"
|
||||
|
||||
raw_beds = raw.get("bedrooms") or 0
|
||||
raw_baths = raw.get("bathrooms") or 0
|
||||
|
|
@ -212,7 +218,11 @@ def transform_property(
|
|||
"lon": lng,
|
||||
"lat": lat,
|
||||
"Postcode": postcode,
|
||||
"Address per Property Register": raw.get("address", ""),
|
||||
"Postcode source": postcode_source,
|
||||
"Extracted postcode": extracted_postcode,
|
||||
"Inferred postcode": inferred_postcode,
|
||||
"Listing raw address": raw_address,
|
||||
"Address per Property Register": clean_listing_address(raw_address),
|
||||
"Leasehold/Freehold": _extract_tenure(features),
|
||||
"Property type": map_property_type(sub_type),
|
||||
"Property sub-type": normalize_sub_type(sub_type),
|
||||
|
|
|
|||
|
|
@ -105,6 +105,24 @@ def write_parquet(properties: list[dict], path: Path) -> None:
|
|||
"lon": [p["lon"] for p in properties],
|
||||
"lat": [p["lat"] for p in properties],
|
||||
"Postcode": [normalize_postcode(p["Postcode"]) for p in properties],
|
||||
"Postcode source": [p.get("Postcode source", "") for p in properties],
|
||||
"Extracted postcode": [
|
||||
normalize_postcode(p["Extracted postcode"])
|
||||
if p.get("Extracted postcode")
|
||||
else None
|
||||
for p in properties
|
||||
],
|
||||
"Inferred postcode": [
|
||||
normalize_postcode(p["Inferred postcode"])
|
||||
if p.get("Inferred postcode")
|
||||
else None
|
||||
for p in properties
|
||||
],
|
||||
"Listing raw address": [
|
||||
p.get("Listing raw address")
|
||||
or p.get("Address per Property Register", "")
|
||||
for p in properties
|
||||
],
|
||||
"Address per Property Register": [
|
||||
p["Address per Property Register"] for p in properties
|
||||
],
|
||||
|
|
@ -126,6 +144,10 @@ def write_parquet(properties: list[dict], path: Path) -> None:
|
|||
"lon": pl.Float64,
|
||||
"lat": pl.Float64,
|
||||
"Postcode": pl.Utf8,
|
||||
"Postcode source": pl.Utf8,
|
||||
"Extracted postcode": pl.Utf8,
|
||||
"Inferred postcode": pl.Utf8,
|
||||
"Listing raw address": pl.Utf8,
|
||||
"Address per Property Register": pl.Utf8,
|
||||
"Leasehold/Freehold": pl.Utf8,
|
||||
"Property type": pl.Utf8,
|
||||
|
|
|
|||
48
finder/test_transform.py
Normal file
48
finder/test_transform.py
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
from transform import (
|
||||
clean_listing_address,
|
||||
extract_full_postcode,
|
||||
transform_property,
|
||||
)
|
||||
|
||||
|
||||
class StubPostcodeIndex:
    """Test double for the postcode spatial index.

    Every lookup answers with the same fixed postcode, so tests can tell a
    coordinate-inferred postcode apart from one extracted from address text.
    """

    _FIXED_POSTCODE = "SW1A 9ZZ"

    def nearest(self, lat: float, lng: float) -> str:
        """Return the canned postcode; *lat* and *lng* are ignored."""
        return self._FIXED_POSTCODE
|
||||
|
||||
|
||||
def test_extract_full_postcode_normalizes_spacing() -> None:
    """A full postcode is returned single-spaced; text without one gives None."""
    # Same postcode written with and without the internal space.
    for address in ("10 Downing Street SW1A2AA", "10 Downing Street, SW1A 2AA"):
        assert extract_full_postcode(address) == "SW1A 2AA"
    assert extract_full_postcode("Downing Street, Westminster") is None
|
||||
|
||||
|
||||
def test_clean_listing_address_removes_postcode_and_outcode_suffixes() -> None:
    """Trailing postcodes/outcodes are stripped; other addresses are untouched."""
    cases = [
        # (listing address, expected cleaned address)
        ("10 Downing Street, SW1A 2AA", "10 Downing Street"),
        ("Hawthorne Road, Bromley, Kent, BR1", "Hawthorne Road, Bromley, Kent"),
        ("Kings Avenue, Bromley", "Kings Avenue, Bromley"),
    ]
    for listed, expected in cases:
        assert clean_listing_address(listed) == expected
|
||||
|
||||
|
||||
def test_rightmove_transform_prefers_postcode_from_display_address() -> None:
    """A postcode in ``displayAddress`` wins over the coordinate-inferred one."""
    display_address = "Flat 2, 10 Downing Street, SW1A 2AA"
    listing = {
        "id": "123",
        "location": {"latitude": 51.5, "longitude": -0.1},
        "price": {"amount": 750000, "displayPrices": []},
        "propertySubType": "Terraced",
        "bedrooms": 3,
        "bathrooms": 1,
        "keyFeatures": [],
        "propertyUrl": "/properties/123",
        "displayAddress": display_address,
    }

    result = transform_property(listing, "SW1A", StubPostcodeIndex())

    assert result is not None
    # The stub index always infers "SW1A 9ZZ", so "SW1A 2AA" can only have
    # come from the address text.
    expected_fields = {
        "Postcode": "SW1A 2AA",
        "Postcode source": "address",
        "Extracted postcode": "SW1A 2AA",
        "Inferred postcode": "SW1A 9ZZ",
        "Listing raw address": display_address,
        "Address per Property Register": "Flat 2, 10 Downing Street",
    }
    for column, expected in expected_fields.items():
        assert result[column] == expected
|
||||
|
|
@ -14,6 +14,18 @@ log = logging.getLogger("rightmove")
|
|||
# UK mansions.
|
||||
MIN_FLOOR_AREA_SQM = 5.0
MAX_FLOOR_AREA_SQM = 2000.0

# Full postcode anywhere in the text: outcode (1-2 letters, a digit, optional
# letter/digit), optional whitespace, then incode (digit + 2 letters).
# Group 1 captures the postcode exactly as written.
FULL_POSTCODE_RE = re.compile(
    r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b",
    flags=re.IGNORECASE,
)

# Full postcode at the very end of the text, together with an optional
# preceding comma and surrounding whitespace.
TRAILING_FULL_POSTCODE_RE = re.compile(
    r"(?:,?\s*)\b[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}\b\s*$",
    flags=re.IGNORECASE,
)

# Bare outcode (e.g. "BR1") at the very end of the text, together with an
# optional preceding comma and surrounding whitespace.
TRAILING_OUTCODE_RE = re.compile(
    r"(?:,?\s*)\b[A-Z]{1,2}\d[A-Z\d]?\b\s*$",
    flags=re.IGNORECASE,
)
|
||||
|
||||
|
||||
def validate_floor_area(sqm: float | None) -> float | None:
|
||||
|
|
@ -184,6 +196,32 @@ def normalize_postcode(postcode: str) -> str:
|
|||
return compact[:-3] + " " + compact[-3:]
|
||||
|
||||
|
||||
def extract_full_postcode(text: str | None) -> str | None:
    """Return the first full UK postcode found in *text*, or ``None``.

    The first ``FULL_POSTCODE_RE`` match is passed through
    ``normalize_postcode`` before being returned. ``None`` or empty text,
    and text with no full postcode, give ``None``.
    """
    found = FULL_POSTCODE_RE.search(text) if text else None
    return normalize_postcode(found.group(1)) if found else None
|
||||
|
||||
|
||||
def clean_listing_address(address: str | None) -> str:
    """Strip a trailing postcode and/or outcode from a listing display address.

    Listing sites often append "..., BR1" or "..., SW1A 1AA" to the public
    address. Those tokens look like extra address numbers to the fuzzy
    matcher, so callers keep the raw address separately and match on the
    value returned here.

    Returns the cleaned address with whitespace collapsed and commas spaced
    as ", "; ``None`` or empty input gives "".
    """
    if not address:
        return ""

    text = str(address).strip()
    # Full postcode first, then any bare outcode left at the end.
    for trailing_pattern in (TRAILING_FULL_POSTCODE_RE, TRAILING_OUTCODE_RE):
        text = trailing_pattern.sub("", text)

    # Tidy up: single spaces, and ", " around every comma.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s*,\s*", ", ", text)
    return text.strip(" ,")
|
||||
|
||||
|
||||
def transform_property(
|
||||
prop: dict, outcode: str, pc_index: PostcodeSpatialIndex
|
||||
) -> dict | None:
|
||||
|
|
@ -224,10 +262,14 @@ def transform_property(
|
|||
if kf.get("description")
|
||||
]
|
||||
|
||||
postcode = pc_index.nearest(lat, lng)
|
||||
if not postcode:
|
||||
inferred_postcode = pc_index.nearest(lat, lng)
|
||||
if not inferred_postcode:
|
||||
log.debug("No England postcode for property at %.4f, %.4f — skipping", lat, lng)
|
||||
return None
|
||||
raw_address = prop.get("displayAddress", "") or ""
|
||||
extracted_postcode = extract_full_postcode(raw_address)
|
||||
postcode = extracted_postcode or inferred_postcode
|
||||
postcode_source = "address" if extracted_postcode else "coordinates"
|
||||
|
||||
property_url = prop.get("propertyUrl") or ""
|
||||
if not isinstance(property_url, str):
|
||||
|
|
@ -244,7 +286,11 @@ def transform_property(
|
|||
"lon": lng,
|
||||
"lat": lat,
|
||||
"Postcode": postcode,
|
||||
"Address per Property Register": prop.get("displayAddress", ""),
|
||||
"Postcode source": postcode_source,
|
||||
"Extracted postcode": extracted_postcode,
|
||||
"Inferred postcode": inferred_postcode,
|
||||
"Listing raw address": raw_address,
|
||||
"Address per Property Register": clean_listing_address(raw_address),
|
||||
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
|
||||
"Property type": map_property_type(sub_type),
|
||||
"Property sub-type": normalize_sub_type(sub_type),
|
||||
|
|
|
|||
|
|
@ -37,7 +37,13 @@ from constants import (
|
|||
ZOOPLA_BASE,
|
||||
)
|
||||
from spatial import PostcodeSpatialIndex
|
||||
from transform import normalize_sub_type, parse_int_value, validate_floor_area
|
||||
from transform import (
|
||||
clean_listing_address,
|
||||
extract_full_postcode,
|
||||
normalize_sub_type,
|
||||
parse_int_value,
|
||||
validate_floor_area,
|
||||
)
|
||||
|
||||
log = logging.getLogger("zoopla")
|
||||
|
||||
|
|
@ -1031,19 +1037,6 @@ def _resolve_outcode_coords(
|
|||
return None
|
||||
|
||||
|
||||
def _extract_postcode(text: str) -> str | None:
    """Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'.

    Args:
        text: Free-form address text. ``None`` or an empty string is tolerated.

    Returns:
        The first full postcode found, upper-cased and with exactly one space
        before the 3-character incode (e.g. "SW1A1AA" -> "SW1A 1AA"), or
        ``None`` when the text contains no full postcode.
    """
    if not text:
        return None
    # Word boundaries stop the pattern matching inside a longer alphanumeric
    # token. Outcode and incode are captured separately so whatever
    # whitespace sat between them (none, several spaces, a tab) is replaced
    # by a single space.
    match = re.search(
        r"\b([A-Z]{1,2}\d[A-Z0-9]?)\s*(\d[A-Z]{2})\b", text, re.IGNORECASE
    )
    if match is None:
        return None
    outcode, incode = match.groups()
    return f"{outcode.upper()} {incode.upper()}"
|
||||
|
||||
|
||||
def _extract_outcode(text: str) -> str | None:
|
||||
"""Extract a UK outcode from address text like 'Whitechapel Road, London E1'."""
|
||||
# Look for outcode at end of string or after last comma
|
||||
|
|
@ -1123,10 +1116,12 @@ def transform_property(
|
|||
from postcodes extracted from the address text."""
|
||||
price = parse_int_value(raw.get("price")) or 0
|
||||
|
||||
address = raw.get("address", "")
|
||||
address = raw.get("address", "") or ""
|
||||
|
||||
# Resolve postcode and coordinates from address
|
||||
postcode = _extract_postcode(address)
|
||||
extracted_postcode = extract_full_postcode(address)
|
||||
postcode = extracted_postcode
|
||||
postcode_source = "address" if extracted_postcode else None
|
||||
lat = lng = None
|
||||
|
||||
if postcode:
|
||||
|
|
@ -1141,12 +1136,14 @@ def transform_property(
|
|||
result = _resolve_outcode_coords(addr_outcode, pc_coords)
|
||||
if result:
|
||||
postcode, lat, lng = result
|
||||
postcode_source = "address_outcode"
|
||||
|
||||
# Final fallback: use the outcode we know we're searching
|
||||
if lat is None and search_outcode:
|
||||
result = _resolve_outcode_coords(search_outcode, pc_coords)
|
||||
if result:
|
||||
postcode, lat, lng = result
|
||||
postcode_source = "search_outcode"
|
||||
|
||||
if lat is None or lng is None or not postcode:
|
||||
return None
|
||||
|
|
@ -1189,7 +1186,11 @@ def transform_property(
|
|||
"lon": lng,
|
||||
"lat": lat,
|
||||
"Postcode": postcode,
|
||||
"Address per Property Register": address,
|
||||
"Postcode source": postcode_source or "unknown",
|
||||
"Extracted postcode": extracted_postcode,
|
||||
"Inferred postcode": postcode if postcode_source != "address" else None,
|
||||
"Listing raw address": address,
|
||||
"Address per Property Register": clean_listing_address(address),
|
||||
"Leasehold/Freehold": raw.get("tenure") or None,
|
||||
"Property type": _map_property_type(raw.get("property_type")),
|
||||
"Property sub-type": normalize_sub_type(raw.get("property_type")),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue