Fix scrape
This commit is contained in:
parent
bbc2fcb86c
commit
3adbaf435d
2 changed files with 131 additions and 27 deletions
|
|
@ -39,7 +39,7 @@ class TurnstileError(Exception):
|
|||
|
||||
|
||||
# Maximum search result pages to scrape per outcode (25 listings/page)
|
||||
MAX_PAGES_PER_OUTCODE = 10
|
||||
MAX_PAGES_PER_OUTCODE = 40
|
||||
|
||||
# JavaScript to extract listings from the rendered DOM.
|
||||
# Uses data-testid attributes as primary selectors (stable across deployments),
|
||||
|
|
@ -98,6 +98,12 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
else if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||
|
||||
// Extract property type (e.g., "2 bed flat for sale" → "flat")
|
||||
let property_type = '';
|
||||
const ptMatch = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
|
||||
if (ptMatch) property_type = ptMatch[1].trim();
|
||||
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
|
||||
|
||||
results.push({
|
||||
id, url: href.replace(window.location.origin, ''),
|
||||
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
||||
|
|
@ -106,7 +112,7 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||
address, tenure,
|
||||
address, tenure, property_type,
|
||||
});
|
||||
}
|
||||
|
||||
|
|
@ -160,6 +166,12 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
if (/leasehold/i.test(text)) tenure = 'Leasehold';
|
||||
else if (/freehold/i.test(text)) tenure = 'Freehold';
|
||||
|
||||
// Extract property type
|
||||
let property_type = '';
|
||||
const ptMatch2 = text.match(/\d+\s*(?:beds?|bedrooms?)\s+([\w\s-]+?)\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i);
|
||||
if (ptMatch2) property_type = ptMatch2[1].trim();
|
||||
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
|
||||
|
||||
results.push({
|
||||
id, url: href.replace(window.location.origin, ''),
|
||||
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
||||
|
|
@ -168,7 +180,7 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
baths: bathsMatch && parseInt(bathsMatch[1]) <= 20 ? parseInt(bathsMatch[1]) : null,
|
||||
receptions: recMatch && parseInt(recMatch[1]) <= 20 ? parseInt(recMatch[1]) : null,
|
||||
floor_area_sqft: areaMatch ? parseInt(areaMatch[1].replace(/,/g, '')) : null,
|
||||
address, tenure,
|
||||
address, tenure, property_type,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
@ -557,6 +569,32 @@ def _paginate(page, total_results: int, channel: str) -> list[dict]:
|
|||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Cached outcode → (postcode, lat, lng) lookups to avoid repeated O(n) scans
|
||||
# over 2.26M postcodes. Populated lazily on first lookup per outcode.
|
||||
_outcode_coords_cache: dict[str, tuple[str, float, float] | None] = {}
|
||||
|
||||
|
||||
def _resolve_outcode_coords(
|
||||
outcode: str, pc_coords: dict[str, tuple[float, float]]
|
||||
) -> tuple[str, float, float] | None:
|
||||
"""Find first postcode + coords for an outcode. Result is cached."""
|
||||
if outcode in _outcode_coords_cache:
|
||||
return _outcode_coords_cache[outcode]
|
||||
|
||||
prefix = outcode + " "
|
||||
for pcd, (lat, lng) in pc_coords.items():
|
||||
if pcd.startswith(prefix) or (
|
||||
len(outcode) >= 4
|
||||
and pcd.startswith(outcode)
|
||||
and len(pcd) > len(outcode)
|
||||
):
|
||||
_outcode_coords_cache[outcode] = (pcd, lat, lng)
|
||||
return (pcd, lat, lng)
|
||||
|
||||
_outcode_coords_cache[outcode] = None
|
||||
return None
|
||||
|
||||
|
||||
def _extract_postcode(text: str) -> str | None:
|
||||
"""Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'."""
|
||||
match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE)
|
||||
|
|
@ -585,11 +623,17 @@ def _map_property_type(raw_type: str | None) -> str:
|
|||
"""Map Zoopla property type text to canonical type."""
|
||||
if not raw_type:
|
||||
return "Other"
|
||||
# Exact match (handles Rightmove-style capitalised values)
|
||||
canonical = PROPERTY_TYPE_MAP.get(raw_type)
|
||||
if canonical:
|
||||
return canonical
|
||||
# Title-case match (handles regex-extracted lowercase like "town house" → "Town House")
|
||||
canonical = PROPERTY_TYPE_MAP.get(raw_type.title())
|
||||
if canonical:
|
||||
return canonical
|
||||
# Keyword fallback
|
||||
lower = raw_type.lower()
|
||||
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower:
|
||||
if "flat" in lower or "apartment" in lower or "maisonette" in lower or "studio" in lower or "penthouse" in lower:
|
||||
return "Flats/Maisonettes"
|
||||
if "detached" in lower and "semi" not in lower:
|
||||
return "Detached"
|
||||
|
|
@ -622,6 +666,7 @@ def transform_property(
|
|||
channel: str,
|
||||
pc_index: PostcodeSpatialIndex,
|
||||
pc_coords: dict[str, tuple[float, float]],
|
||||
search_outcode: str | None = None,
|
||||
) -> dict | None:
|
||||
"""Transform a raw Zoopla listing dict into the standard output schema.
|
||||
|
||||
|
|
@ -643,22 +688,18 @@ def transform_property(
|
|||
lat, lng = coords
|
||||
|
||||
if lat is None:
|
||||
# Try outcode-level fallback
|
||||
outcode = _extract_outcode(address)
|
||||
if outcode:
|
||||
# ONSPD 7-char format: 4-char outcodes have no space before incode
|
||||
# (e.g., "BH191AB"), while shorter outcodes do (e.g., "E14 5AB").
|
||||
# Check both formats to handle all outcode lengths.
|
||||
prefix = outcode + " "
|
||||
for pcd, coords in pc_coords.items():
|
||||
if pcd.startswith(prefix) or (
|
||||
len(outcode) >= 4
|
||||
and pcd.startswith(outcode)
|
||||
and len(pcd) > len(outcode)
|
||||
):
|
||||
postcode = pcd
|
||||
lat, lng = coords
|
||||
break
|
||||
# Try outcode-level fallback from address text
|
||||
addr_outcode = _extract_outcode(address)
|
||||
if addr_outcode:
|
||||
result = _resolve_outcode_coords(addr_outcode, pc_coords)
|
||||
if result:
|
||||
postcode, lat, lng = result
|
||||
|
||||
# Final fallback: use the outcode we know we're searching
|
||||
if lat is None and search_outcode:
|
||||
result = _resolve_outcode_coords(search_outcode, pc_coords)
|
||||
if result:
|
||||
postcode, lat, lng = result
|
||||
|
||||
if lat is None or lng is None or not postcode:
|
||||
return None
|
||||
|
|
@ -706,8 +747,8 @@ def transform_property(
|
|||
"Postcode": postcode,
|
||||
"Address per Property Register": address,
|
||||
"Leasehold/Freehold": raw.get("tenure") or None,
|
||||
"Property type": "Other", # Not reliably extractable from Zoopla search cards
|
||||
"Property sub-type": "",
|
||||
"Property type": _map_property_type(raw.get("property_type")),
|
||||
"Property sub-type": raw.get("property_type") or "",
|
||||
"price": int(price),
|
||||
"price_frequency": frequency,
|
||||
"Price qualifier": "",
|
||||
|
|
@ -774,7 +815,7 @@ def search_outcode(
|
|||
properties = []
|
||||
dropped = 0
|
||||
for raw in raw_listings:
|
||||
transformed = transform_property(raw, channel, pc_index, pc_coords)
|
||||
transformed = transform_property(raw, channel, pc_index, pc_coords, search_outcode=outcode)
|
||||
if transformed:
|
||||
properties.append(transformed)
|
||||
zoopla_properties_scraped.labels(channel=channel_label).inc()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue