Small fixes

This commit is contained in:
Andras Schmelczer 2026-03-28 09:29:56 +00:00
parent d93beb9201
commit 7591e5fc05
12 changed files with 198 additions and 14 deletions

View file

@ -104,6 +104,22 @@ _EXTRACT_LISTINGS_JS = r"""() => {
if (ptMatch) property_type = ptMatch[1].trim();
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
// Keyword fallback when regex doesn't match current DOM format
if (!property_type) {
const lower = text.toLowerCase();
if (/\bstudio\b/.test(lower)) property_type = 'Studio';
else if (/\bpenthouse\b/.test(lower)) property_type = 'Penthouse';
else if (/\bmaisonette\b/.test(lower)) property_type = 'Maisonette';
else if (/\bapartment\b/.test(lower)) property_type = 'Apartment';
else if (/\bflat\b/.test(lower)) property_type = 'Flat';
else if (/\bsemi[- ]?detached\b/.test(lower)) property_type = 'Semi-Detached';
else if (/\bdetached\b/.test(lower)) property_type = 'Detached';
else if (/\bterraced?\b/.test(lower)) property_type = 'Terraced';
else if (/\bbungalow\b/.test(lower)) property_type = 'Bungalow';
else if (/\bcottage\b/.test(lower)) property_type = 'Cottage';
else if (/\bhouse\b/.test(lower)) property_type = 'House';
}
results.push({
id, url: href.replace(window.location.origin, ''),
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
@ -172,6 +188,22 @@ _EXTRACT_LISTINGS_JS = r"""() => {
if (ptMatch2) property_type = ptMatch2[1].trim();
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
// Keyword fallback when regex doesn't match current DOM format
if (!property_type) {
const lower = text.toLowerCase();
if (/\bstudio\b/.test(lower)) property_type = 'Studio';
else if (/\bpenthouse\b/.test(lower)) property_type = 'Penthouse';
else if (/\bmaisonette\b/.test(lower)) property_type = 'Maisonette';
else if (/\bapartment\b/.test(lower)) property_type = 'Apartment';
else if (/\bflat\b/.test(lower)) property_type = 'Flat';
else if (/\bsemi[- ]?detached\b/.test(lower)) property_type = 'Semi-Detached';
else if (/\bdetached\b/.test(lower)) property_type = 'Detached';
else if (/\bterraced?\b/.test(lower)) property_type = 'Terraced';
else if (/\bbungalow\b/.test(lower)) property_type = 'Bungalow';
else if (/\bcottage\b/.test(lower)) property_type = 'Cottage';
else if (/\bhouse\b/.test(lower)) property_type = 'House';
}
results.push({
id, url: href.replace(window.location.origin, ''),
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
@ -596,10 +628,15 @@ def _resolve_outcode_coords(
def _extract_postcode(text: str) -> str | None:
"""Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'."""
"""Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'.
Normalizes to include a space before the 3-char incode."""
match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE)
if match:
return match.group(1).upper().strip()
raw = match.group(1).upper().strip()
# Ensure space before incode (last 3 chars): "SW1A1AA" → "SW1A 1AA"
if " " not in raw and len(raw) >= 5:
return raw[:-3] + " " + raw[-3:]
return raw
return None
@ -651,13 +688,20 @@ def _detect_rent_frequency(price_text: str) -> str:
Zoopla price elements contain text like '£1,500 pcm', '£350 pw',
'£18,000 pa'. Defaults to 'monthly' if no frequency indicator found.
Checks monthly indicators (pcm) BEFORE weekly (pw) because Zoopla cards
often display both monthly and weekly prices in the same text. When the
JS extraction falls back to full card text, checking pcm first ensures
the captured monthly price gets the correct frequency label.
"""
lower = price_text.lower()
if "pcm" in lower or "per month" in lower or "per calendar month" in lower:
return "monthly"
if "pw" in lower or "per week" in lower or "/w" in lower:
return "weekly"
if "pa" in lower or "per annum" in lower or "/y" in lower or "per year" in lower:
return "yearly"
# pcm, per month, /m, or no indicator — default monthly
# No indicator — default monthly (Zoopla standard)
return "monthly"