Improve scraping
This commit is contained in:
parent
89a85e9a0c
commit
c14d28f430
7 changed files with 91 additions and 25 deletions
|
|
@ -7,21 +7,24 @@ from spatial import PostcodeSpatialIndex
|
|||
log = logging.getLogger("rightmove")
|
||||
|
||||
|
||||
# Maximum plausible floor area for a residential property listing (sqm).
|
||||
# ~21,500 sq ft — covers even the largest UK mansions.
|
||||
# Floor area bounds (sqm). Values outside this range are almost certainly
|
||||
# data errors: sub-5 sqm catches garbled extractions (e.g., 0.1 sqm for a
|
||||
# detached house), and >2000 sqm (~21,500 sq ft) exceeds even the largest
|
||||
# UK mansions.
|
||||
MIN_FLOOR_AREA_SQM = 5.0
|
||||
MAX_FLOOR_AREA_SQM = 2000.0
|
||||
|
||||
|
||||
def validate_floor_area(sqm: float | None) -> float | None:
|
||||
"""Validate a floor area value. Returns None for nonsensical values.
|
||||
|
||||
Rejects zero/negative values and anything above MAX_FLOOR_AREA_SQM,
|
||||
Rejects values below MIN_FLOOR_AREA_SQM and above MAX_FLOOR_AREA_SQM,
|
||||
which catches parsing errors where prices or other large numbers are
|
||||
mistakenly extracted as floor area from free-text descriptions or DOM text.
|
||||
"""
|
||||
if sqm is None:
|
||||
return None
|
||||
if sqm <= 0 or sqm > MAX_FLOOR_AREA_SQM:
|
||||
if sqm < MIN_FLOOR_AREA_SQM or sqm > MAX_FLOOR_AREA_SQM:
|
||||
return None
|
||||
return sqm
|
||||
|
||||
|
|
@ -42,6 +45,25 @@ def parse_display_size(display_size: str | None) -> float | None:
|
|||
return None
|
||||
|
||||
|
||||
def normalize_sub_type(sub_type: str | None) -> str:
|
||||
"""Normalize property sub-type for consistent storage.
|
||||
|
||||
Fixes delimiter inconsistencies (underscores/hyphens → spaces) from
|
||||
home.co.uk and truncates Zoopla description fragments that were
|
||||
accidentally captured as sub-types.
|
||||
"""
|
||||
if not sub_type:
|
||||
return "Unknown"
|
||||
cleaned = sub_type.replace("_", " ").strip()
|
||||
# Description fragments captured as sub-types are much longer than any
|
||||
# real property type name (longest canonical is ~25 chars)
|
||||
if len(cleaned) > 40:
|
||||
return "Unknown"
|
||||
# Collapse multiple spaces
|
||||
cleaned = re.sub(r"\s+", " ", cleaned)
|
||||
return cleaned.title()
|
||||
|
||||
|
||||
def map_property_type(sub_type: str | None) -> str:
|
||||
"""Map propertySubType to canonical type."""
|
||||
if not sub_type:
|
||||
|
|
@ -51,6 +73,15 @@ def map_property_type(sub_type: str | None) -> str:
|
|||
return canonical
|
||||
# Try title-case variant (e.g., "country house" → "Country House")
|
||||
canonical = PROPERTY_TYPE_MAP.get(sub_type.title())
|
||||
if canonical:
|
||||
return canonical
|
||||
# Try lowercase variant (e.g., "Townhouse" → "townhouse")
|
||||
canonical = PROPERTY_TYPE_MAP.get(sub_type.lower())
|
||||
if canonical:
|
||||
return canonical
|
||||
# Normalize delimiters (underscores/hyphens → spaces) and try again
|
||||
normalized = re.sub(r"[-_]+", " ", sub_type).strip().title()
|
||||
canonical = PROPERTY_TYPE_MAP.get(normalized)
|
||||
if canonical:
|
||||
return canonical
|
||||
# Keyword fallback for compound types not in the map
|
||||
|
|
@ -103,12 +134,13 @@ def fix_coords(lat: float, lng: float) -> tuple[float, float]:
|
|||
|
||||
|
||||
def normalize_postcode(postcode: str) -> str:
    """Ensure UK postcode has exactly one space before the 3-char incode.

    E.g., 'SW1A1AA' → 'SW1A 1AA', 'N4  2HA' → 'N4 2HA', 'E1 4AB' unchanged.

    Args:
        postcode: Raw postcode string; any whitespace and letter case.

    Returns:
        The upper-cased postcode with a single space before its last three
        characters. Inputs shorter than 5 non-whitespace characters are
        returned upper-cased with all whitespace removed.
    """
    # Strip all whitespace then re-insert the single canonical space
    compact = "".join(postcode.split()).upper()
    if len(compact) < 5:
        return compact
    return compact[:-3] + " " + compact[-3:]
|
||||
|
||||
|
||||
def normalize_price(amount: int, frequency: str) -> int:
|
||||
|
|
@ -187,7 +219,7 @@ def transform_property(
|
|||
"Address per Property Register": prop.get("displayAddress", ""),
|
||||
"Leasehold/Freehold": extract_tenure(prop.get("tenure")),
|
||||
"Property type": map_property_type(sub_type),
|
||||
"Property sub-type": sub_type or "Unknown",
|
||||
"Property sub-type": normalize_sub_type(sub_type),
|
||||
"price": price,
|
||||
"price_frequency": frequency,
|
||||
"Price qualifier": price_qualifier,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue