Improve scraping

2026-03-29 11:41:35 +01:00 · 2026-03-29 11:41:35 +01:00 · c14d28f430
commit c14d28f430
parent 89a85e9a0c
7 changed files with 91 additions and 25 deletions
--- a/finder/transform.py
+++ b/finder/transform.py
@ -7,21 +7,24 @@ from spatial import PostcodeSpatialIndex
 log = logging.getLogger("rightmove")


-# Maximum plausible floor area for a residential property listing (sqm).
-# ~21,500 sq ft — covers even the largest UK mansions.
+# Floor area bounds (sqm). Values outside this range are almost certainly
+# data errors: sub-5 sqm catches garbled extractions (e.g., 0.1 sqm for a
+# detached house), and >2000 sqm (~21,500 sq ft) exceeds even the largest
+# UK mansions.
+MIN_FLOOR_AREA_SQM = 5.0
 MAX_FLOOR_AREA_SQM = 2000.0


 def validate_floor_area(sqm: float | None) -> float | None:
    """Validate a floor area value. Returns None for nonsensical values.

-    Rejects zero/negative values and anything above MAX_FLOOR_AREA_SQM,
+    Rejects values below MIN_FLOOR_AREA_SQM or above MAX_FLOOR_AREA_SQM,
    which catches parsing errors where prices or other large numbers are
    mistakenly extracted as floor area from free-text descriptions or DOM text.
    """
    if sqm is None:
        return None
-    if sqm <= 0 or sqm > MAX_FLOOR_AREA_SQM:
+    if sqm < MIN_FLOOR_AREA_SQM or sqm > MAX_FLOOR_AREA_SQM:
        return None
    return sqm

@ -42,6 +45,25 @@ def parse_display_size(display_size: str | None) -> float | None:
    return None


+def normalize_sub_type(sub_type: str | None) -> str:
+    """Normalize property sub-type for consistent storage.
+
+    Fixes delimiter inconsistencies from home.co.uk (underscores → spaces;
+    hyphens are left as-is) and replaces Zoopla description fragments that
+    were accidentally captured as sub-types with "Unknown" (over 40 chars).
+    """
+    if not sub_type:
+        return "Unknown"
+    cleaned = sub_type.replace("_", " ").strip()
+    # Description fragments captured as sub-types are much longer than any
+    # real property type name (longest canonical is ~25 chars)
+    if len(cleaned) > 40:
+        return "Unknown"
+    # Collapse multiple spaces
+    cleaned = re.sub(r"\s+", " ", cleaned)
+    return cleaned.title()
+
+
 def map_property_type(sub_type: str | None) -> str:
    """Map propertySubType to canonical type."""
    if not sub_type:
@ -51,6 +73,15 @@ def map_property_type(sub_type: str | None) -> str:
        return canonical
    # Try title-case variant (e.g., "country house" → "Country House")
    canonical = PROPERTY_TYPE_MAP.get(sub_type.title())
+    if canonical:
+        return canonical
+    # Try lowercase variant (e.g., "Townhouse" → "townhouse")
+    canonical = PROPERTY_TYPE_MAP.get(sub_type.lower())
+    if canonical:
+        return canonical
+    # Normalize delimiters (underscores/hyphens → spaces) and try again
+    normalized = re.sub(r"[-_]+", " ", sub_type).strip().title()
+    canonical = PROPERTY_TYPE_MAP.get(normalized)
    if canonical:
        return canonical
    # Keyword fallback for compound types not in the map
@ -103,12 +134,13 @@ def fix_coords(lat: float, lng: float) -> tuple[float, float]:


 def normalize_postcode(postcode: str) -> str:
-    """Ensure UK postcode has a space before the 3-char incode.
-    E.g., 'SW1A1AA' → 'SW1A 1AA', 'E1 4AB' unchanged."""
-    postcode = postcode.strip().upper()
-    if " " in postcode or len(postcode) < 5:
-        return postcode
-    return postcode[:-3] + " " + postcode[-3:]
+    """Ensure UK postcode has exactly one space before the 3-char incode.
+    E.g., 'SW1A1AA' → 'SW1A 1AA', 'N4  2HA' → 'N4 2HA', 'E1 4AB' unchanged."""
+    # Strip all whitespace, then re-insert the single canonical space (inputs < 5 chars get none)
+    compact = re.sub(r"\s+", "", postcode).upper()
+    if len(compact) < 5:
+        return compact
+    return compact[:-3] + " " + compact[-3:]


 def normalize_price(amount: int, frequency: str) -> int:
@ -187,7 +219,7 @@ def transform_property(
        "Address per Property Register": prop.get("displayAddress", ""),
        "Leasehold/Freehold": extract_tenure(prop.get("tenure")),
        "Property type": map_property_type(sub_type),
-        "Property sub-type": sub_type or "Unknown",
+        "Property sub-type": normalize_sub_type(sub_type),
        "price": price,
        "price_frequency": frequency,
        "Price qualifier": price_qualifier,