Improve scraping

This commit is contained in:
Andras Schmelczer 2026-03-29 11:41:35 +01:00
parent 89a85e9a0c
commit c14d28f430
7 changed files with 91 additions and 25 deletions

View file

@ -7,21 +7,24 @@ from spatial import PostcodeSpatialIndex
log = logging.getLogger("rightmove")
# Maximum plausible floor area for a residential property listing (sqm).
# ~21,500 sq ft — covers even the largest UK mansions.
# Floor area bounds (sqm). Values outside this range are almost certainly
# data errors: sub-5 sqm catches garbled extractions (e.g., 0.1 sqm for a
# detached house), and >2000 sqm (~21,500 sq ft) exceeds even the largest
# UK mansions.
MIN_FLOOR_AREA_SQM = 5.0
MAX_FLOOR_AREA_SQM = 2000.0
def validate_floor_area(sqm: float | None) -> float | None:
"""Validate a floor area value. Returns None for nonsensical values.
Rejects zero/negative values and anything above MAX_FLOOR_AREA_SQM,
Rejects values below MIN_FLOOR_AREA_SQM and above MAX_FLOOR_AREA_SQM,
which catches parsing errors where prices or other large numbers are
mistakenly extracted as floor area from free-text descriptions or DOM text.
"""
if sqm is None:
return None
if sqm <= 0 or sqm > MAX_FLOOR_AREA_SQM:
if sqm < MIN_FLOOR_AREA_SQM or sqm > MAX_FLOOR_AREA_SQM:
return None
return sqm
@ -42,6 +45,25 @@ def parse_display_size(display_size: str | None) -> float | None:
return None
def normalize_sub_type(sub_type: str | None) -> str:
"""Normalize property sub-type for consistent storage.
Fixes delimiter inconsistencies (underscores/hyphens spaces) from
home.co.uk and truncates Zoopla description fragments that were
accidentally captured as sub-types.
"""
if not sub_type:
return "Unknown"
cleaned = sub_type.replace("_", " ").strip()
# Description fragments captured as sub-types are much longer than any
# real property type name (longest canonical is ~25 chars)
if len(cleaned) > 40:
return "Unknown"
# Collapse multiple spaces
cleaned = re.sub(r"\s+", " ", cleaned)
return cleaned.title()
def map_property_type(sub_type: str | None) -> str:
"""Map propertySubType to canonical type."""
if not sub_type:
@ -51,6 +73,15 @@ def map_property_type(sub_type: str | None) -> str:
return canonical
# Try title-case variant (e.g., "country house" → "Country House")
canonical = PROPERTY_TYPE_MAP.get(sub_type.title())
if canonical:
return canonical
# Try lowercase variant (e.g., "Townhouse" → "townhouse")
canonical = PROPERTY_TYPE_MAP.get(sub_type.lower())
if canonical:
return canonical
# Normalize delimiters (underscores/hyphens → spaces) and try again
normalized = re.sub(r"[-_]+", " ", sub_type).strip().title()
canonical = PROPERTY_TYPE_MAP.get(normalized)
if canonical:
return canonical
# Keyword fallback for compound types not in the map
@ -103,12 +134,13 @@ def fix_coords(lat: float, lng: float) -> tuple[float, float]:
def normalize_postcode(postcode: str) -> str:
    """Ensure UK postcode has exactly one space before the 3-char incode.

    E.g., 'SW1A1AA' → 'SW1A 1AA', 'N4  2HA' → 'N4 2HA', 'E1 4AB' unchanged.
    The result is always upper-cased.

    Args:
        postcode: Postcode text with any amount of whitespace, in any case.

    Returns:
        The upper-cased postcode with a single space inserted before the
        last three characters. Inputs with fewer than five non-whitespace
        characters are returned compacted (no space inserted).
    """
    # Strip all whitespace then re-insert the single canonical space
    compact = re.sub(r"\s+", "", postcode).upper()
    # Too short to hold an outcode plus a 3-char incode; leave unsplit.
    if len(compact) < 5:
        return compact
    return compact[:-3] + " " + compact[-3:]
def normalize_price(amount: int, frequency: str) -> int:
@ -187,7 +219,7 @@ def transform_property(
"Address per Property Register": prop.get("displayAddress", ""),
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
"Property type": map_property_type(sub_type),
"Property sub-type": sub_type or "Unknown",
"Property sub-type": normalize_sub_type(sub_type),
"price": price,
"price_frequency": frequency,
"Price qualifier": price_qualifier,