Small fixes
This commit is contained in:
parent
d93beb9201
commit
7591e5fc05
12 changed files with 198 additions and 14 deletions
|
|
@ -104,6 +104,22 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
if (ptMatch) property_type = ptMatch[1].trim();
|
||||
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
|
||||
|
||||
// Keyword fallback when regex doesn't match current DOM format
|
||||
if (!property_type) {
|
||||
const lower = text.toLowerCase();
|
||||
if (/\bstudio\b/.test(lower)) property_type = 'Studio';
|
||||
else if (/\bpenthouse\b/.test(lower)) property_type = 'Penthouse';
|
||||
else if (/\bmaisonette\b/.test(lower)) property_type = 'Maisonette';
|
||||
else if (/\bapartment\b/.test(lower)) property_type = 'Apartment';
|
||||
else if (/\bflat\b/.test(lower)) property_type = 'Flat';
|
||||
else if (/\bsemi[- ]?detached\b/.test(lower)) property_type = 'Semi-Detached';
|
||||
else if (/\bdetached\b/.test(lower)) property_type = 'Detached';
|
||||
else if (/\bterraced?\b/.test(lower)) property_type = 'Terraced';
|
||||
else if (/\bbungalow\b/.test(lower)) property_type = 'Bungalow';
|
||||
else if (/\bcottage\b/.test(lower)) property_type = 'Cottage';
|
||||
else if (/\bhouse\b/.test(lower)) property_type = 'House';
|
||||
}
|
||||
|
||||
results.push({
|
||||
id, url: href.replace(window.location.origin, ''),
|
||||
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
||||
|
|
@ -172,6 +188,22 @@ _EXTRACT_LISTINGS_JS = r"""() => {
|
|||
if (ptMatch2) property_type = ptMatch2[1].trim();
|
||||
else if (/\bstudio\s*(?:flat|apartment)?\s+(?:for\s+sale|to\s+(?:rent|let)|for\s+rent)/i.test(text)) property_type = 'Studio';
|
||||
|
||||
// Keyword fallback when regex doesn't match current DOM format
|
||||
if (!property_type) {
|
||||
const lower = text.toLowerCase();
|
||||
if (/\bstudio\b/.test(lower)) property_type = 'Studio';
|
||||
else if (/\bpenthouse\b/.test(lower)) property_type = 'Penthouse';
|
||||
else if (/\bmaisonette\b/.test(lower)) property_type = 'Maisonette';
|
||||
else if (/\bapartment\b/.test(lower)) property_type = 'Apartment';
|
||||
else if (/\bflat\b/.test(lower)) property_type = 'Flat';
|
||||
else if (/\bsemi[- ]?detached\b/.test(lower)) property_type = 'Semi-Detached';
|
||||
else if (/\bdetached\b/.test(lower)) property_type = 'Detached';
|
||||
else if (/\bterraced?\b/.test(lower)) property_type = 'Terraced';
|
||||
else if (/\bbungalow\b/.test(lower)) property_type = 'Bungalow';
|
||||
else if (/\bcottage\b/.test(lower)) property_type = 'Cottage';
|
||||
else if (/\bhouse\b/.test(lower)) property_type = 'House';
|
||||
}
|
||||
|
||||
results.push({
|
||||
id, url: href.replace(window.location.origin, ''),
|
||||
price: priceMatch ? parseInt(priceMatch[1].replace(/,/g, '')) : null,
|
||||
|
|
@ -596,10 +628,15 @@ def _resolve_outcode_coords(
|
|||
|
||||
|
||||
def _extract_postcode(text: str) -> str | None:
|
||||
"""Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'."""
|
||||
"""Extract a full UK postcode from text like 'Dollar Bay Place, Canary Wharf E14 9SS'.
|
||||
Normalizes to include a space before the 3-char incode."""
|
||||
match = re.search(r"([A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2})", text, re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(1).upper().strip()
|
||||
raw = match.group(1).upper().strip()
|
||||
# Ensure space before incode (last 3 chars): "SW1A1AA" → "SW1A 1AA"
|
||||
if " " not in raw and len(raw) >= 5:
|
||||
return raw[:-3] + " " + raw[-3:]
|
||||
return raw
|
||||
return None
|
||||
|
||||
|
||||
|
|
@ -651,13 +688,20 @@ def _detect_rent_frequency(price_text: str) -> str:
|
|||
|
||||
Zoopla price elements contain text like '£1,500 pcm', '£350 pw',
|
||||
'£18,000 pa'. Defaults to 'monthly' if no frequency indicator found.
|
||||
|
||||
Checks monthly indicators (pcm) BEFORE weekly (pw) because Zoopla cards
|
||||
often display both monthly and weekly prices in the same text. When the
|
||||
JS extraction falls back to full card text, checking pcm first ensures
|
||||
the captured monthly price gets the correct frequency label.
|
||||
"""
|
||||
lower = price_text.lower()
|
||||
if "pcm" in lower or "per month" in lower or "per calendar month" in lower:
|
||||
return "monthly"
|
||||
if "pw" in lower or "per week" in lower or "/w" in lower:
|
||||
return "weekly"
|
||||
if "pa" in lower or "per annum" in lower or "/y" in lower or "per year" in lower:
|
||||
return "yearly"
|
||||
# pcm, per month, /m, or no indicator — default monthly
|
||||
# No indicator — default monthly (Zoopla standard)
|
||||
return "monthly"
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue