vibes
This commit is contained in:
parent
39ef5c6646
commit
c995f12f8b
78 changed files with 4830 additions and 1619 deletions
|
|
@ -14,6 +14,18 @@ log = logging.getLogger("rightmove")
|
|||
# UK mansions.
|
||||
# Lower and upper floor-area limits, in square metres (per the _SQM suffix).
MIN_FLOOR_AREA_SQM = 5.0
MAX_FLOOR_AREA_SQM = 2000.0

# A full UK postcode (outward code + inward code, optional whitespace between
# them) anywhere in a string. Group 1 captures the postcode text as written.
FULL_POSTCODE_RE = re.compile(
    r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b",
    re.IGNORECASE,
)

# A full postcode sitting at the very end of a string, together with the
# optional comma / whitespace that separates it from the rest of the address.
TRAILING_FULL_POSTCODE_RE = re.compile(
    r"(?:,?\s*)\b[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}\b\s*$",
    re.IGNORECASE,
)

# An outcode only (e.g. "BR1", "SW1A") at the very end of a string, together
# with the optional comma / whitespace that precedes it.
TRAILING_OUTCODE_RE = re.compile(
    r"(?:,?\s*)\b[A-Z]{1,2}\d[A-Z\d]?\b\s*$",
    re.IGNORECASE,
)
|
||||
|
||||
|
||||
def validate_floor_area(sqm: float | None) -> float | None:
|
||||
|
|
@ -184,6 +196,32 @@ def normalize_postcode(postcode: str) -> str:
|
|||
return compact[:-3] + " " + compact[-3:]
|
||||
|
||||
|
||||
def extract_full_postcode(text: str | None) -> str | None:
    """Return the full postcode embedded in *text*, normalized, or ``None``.

    When several substrings look like a full postcode, the last one is used:
    the postcode sits at the end of a UK address, whereas earlier look-alikes
    are typically unit numbers followed by an ordinal (for example
    "Unit E1 2nd Floor" contains the postcode-shaped token "E1 2nd").

    Args:
        text: Free-form address text; ``None`` or an empty string is allowed.

    Returns:
        The matched postcode passed through ``normalize_postcode``, or
        ``None`` when *text* is empty or contains no full postcode.
    """
    if not text:
        return None
    # The pattern has exactly one capture group, so findall() yields the
    # captured postcode strings in left-to-right order.
    candidates = FULL_POSTCODE_RE.findall(text)
    if not candidates:
        return None
    return normalize_postcode(candidates[-1])
|
||||
|
||||
|
||||
def clean_listing_address(address: str | None) -> str:
    """Strip a trailing postcode or outcode from a listing display address.

    Public listing addresses frequently end in "..., BR1" or "..., SW1A 1AA".
    Those tokens look like extra address numbers to the fuzzy matcher, so the
    raw address is kept elsewhere and this cleaned form is used for matching.

    Args:
        address: The display address; ``None`` or an empty value is allowed.

    Returns:
        The address without a trailing full postcode / outcode, with runs of
        whitespace collapsed, commas rendered as ", ", and no leading or
        trailing spaces or commas. Returns "" for a falsy *address*.
    """
    if not address:
        return ""

    text = str(address).strip()

    # Drop a trailing full postcode, then a trailing outcode.
    for trailing_re in (TRAILING_FULL_POSTCODE_RE, TRAILING_OUTCODE_RE):
        text = trailing_re.sub("", text)

    # Normalize spacing: single spaces everywhere, ", " after each comma.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s*,\s*", ", ", text)

    return text.strip(" ,")
|
||||
|
||||
|
||||
def transform_property(
|
||||
prop: dict, outcode: str, pc_index: PostcodeSpatialIndex
|
||||
) -> dict | None:
|
||||
|
|
@ -224,10 +262,14 @@ def transform_property(
|
|||
if kf.get("description")
|
||||
]
|
||||
|
||||
postcode = pc_index.nearest(lat, lng)
|
||||
if not postcode:
|
||||
inferred_postcode = pc_index.nearest(lat, lng)
|
||||
if not inferred_postcode:
|
||||
log.debug("No England postcode for property at %.4f, %.4f — skipping", lat, lng)
|
||||
return None
|
||||
raw_address = prop.get("displayAddress", "") or ""
|
||||
extracted_postcode = extract_full_postcode(raw_address)
|
||||
postcode = extracted_postcode or inferred_postcode
|
||||
postcode_source = "address" if extracted_postcode else "coordinates"
|
||||
|
||||
property_url = prop.get("propertyUrl") or ""
|
||||
if not isinstance(property_url, str):
|
||||
|
|
@ -244,7 +286,11 @@ def transform_property(
|
|||
"lon": lng,
|
||||
"lat": lat,
|
||||
"Postcode": postcode,
|
||||
"Address per Property Register": prop.get("displayAddress", ""),
|
||||
"Postcode source": postcode_source,
|
||||
"Extracted postcode": extracted_postcode,
|
||||
"Inferred postcode": inferred_postcode,
|
||||
"Listing raw address": raw_address,
|
||||
"Address per Property Register": clean_listing_address(raw_address),
|
||||
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
|
||||
"Property type": map_property_type(sub_type),
|
||||
"Property sub-type": normalize_sub_type(sub_type),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue