scraping and data

This commit is contained in:
Andras Schmelczer 2026-05-31 15:36:33 +01:00
parent d98819b569
commit 8688b7475e
43 changed files with 4920 additions and 531 deletions

113
finder/test_rightmove.py Normal file
View file

@@ -0,0 +1,113 @@
"""Tests for the Rightmove detail-page postcode extractor.
The search API only returns an outcode-level ``displayAddress``; the property's
TRUE full postcode lives on its detail page inside ``window.__PAGE_MODEL`` as
``propertyData.address.{outcode, incode}``. ``parse_detail_postcode`` recovers
it. These tests build a faithful __PAGE_MODEL: a devalue-style flattened object
graph whose ``data`` field is a JSON STRING of a flat array where every integer
inside a container is an index reference into that same array.
"""
import json
from rightmove import _extract_page_model_literal, parse_detail_postcode
def _page_model_html(flat: list, *, encoding: str = "json") -> str:
    """Wrap a flattened object-graph array in a detail-page style ``<script>``.

    The page assigns ``window.__PAGE_MODEL = {"data": "<json array>", ...}``.
    ``flat`` is serialised to a compact JSON string first, and that string is
    then embedded in the outer object, so the array's quotes arrive escaped.
    This is intended to mirror the layout of the live detail page.

    Args:
        flat: The flattened object graph to embed.
        encoding: Value stored under the outer object's ``"encoding"`` key.

    Returns:
        A minimal HTML document whose single script assigns the page model.
    """
    compact = {"separators": (",", ":")}
    # Double encoding is deliberate: ``data`` must hold a JSON *string*, not
    # a nested array.
    outer = {"data": json.dumps(flat, **compact), "encoding": encoding}
    return (
        "<html><head></head><body>\n"
        "<script>\n"
        " window.__PAGE_MODEL = " + json.dumps(outer, **compact) + ";\n"
        "</script>\n"
        "</body></html>"
    )
# Fixture modelled on a slice of a real listing, in flattened form: every
# integer held by a dict or list below is an index into this same array
# (root -> propertyData -> address). The nearestStations entries are a decoy:
# they carry no postcode (as on the live page, per the original author's
# note), so a passing test shows the parser reads the property's own address
# and not a nearby point of interest.
_FLAT_SW9 = [
    # 0: root
    {"propertyData": 1},
    # 1: propertyData
    {
        "id": "89089584",
        "address": 2,
        "location": 4,
        "nearestStations": 6,
    },
    # 2: address
    {
        "displayAddress": "Caldwell Street, Stockwell",
        "countryCode": "GB",
        "ukCountry": "England",
        "outcode": "SW9",
        "incode": "0HD",
    },
    # 3: filler
    None,
    # 4: location
    {
        "latitude": 51.477238,
        "longitude": -0.116819,
        "pinType": "ACCURATE_POINT",
    },
    # 5: filler
    None,
    # 6: nearestStations (references to entries 7 and 8)
    [7, 8],
    # 7: station, no postcode
    {"name": "Oval Station", "distance": 0.36},
    # 8: station, no postcode
    {"name": "Stockwell Station", "distance": 0.41},
]
def test_parses_full_postcode_from_outcode_and_incode() -> None:
    """The fixture's ``SW9`` outcode and ``0HD`` incode yield ``"SW9 0HD"``."""
    page = _page_model_html(_FLAT_SW9)
    postcode = parse_detail_postcode(page)
    assert postcode == "SW9 0HD"
def test_extract_page_model_literal_brace_matches_nested_object() -> None:
    """The extracted literal spans the whole outer object.

    The escaped ``data`` string itself contains closing braces, so an
    extractor that stopped at the first ``}`` would return a truncated literal
    that fails the checks below.
    """
    extracted = _extract_page_model_literal(_page_model_html(_FLAT_SW9))
    assert extracted is not None
    assert extracted.startswith("{")
    assert extracted.endswith("}")
    # Must round-trip to a dict exposing exactly the expected top-level keys.
    decoded = json.loads(extracted)
    assert set(decoded) == {"data", "encoding"}
def test_normalises_unspaced_incode() -> None:
    """Lower-case ``e20`` / ``1fh`` parts come back as ``"E20 1FH"``."""
    # Rebinding slot 2 on a shallow copy leaves the shared fixture untouched.
    nodes = list(_FLAT_SW9)
    nodes[2] = dict(_FLAT_SW9[2], outcode="e20", incode="1fh")
    page = _page_model_html(nodes)
    assert parse_detail_postcode(page) == "E20 1FH"
def test_returns_none_when_address_missing() -> None:
    """A page model whose ``propertyData`` has no ``address`` yields ``None``.

    The fixture keeps a ``location`` node (coordinates only) but omits the
    ``address`` node entirely. The parser must return ``None`` rather than
    raise; per the original author's note, the caller then keeps its
    coordinate-based fallback.
    """
    flat = [
        {"propertyData": 1},  # 0: root
        {"id": "1", "location": 2},  # 1: propertyData, no "address" key
        {"latitude": 51.5, "longitude": -0.1},  # 2: location
    ]
    assert parse_detail_postcode(_page_model_html(flat)) is None
def test_returns_none_when_incode_blank() -> None:
    """An empty-string ``incode`` alongside a valid outcode yields ``None``."""
    address = dict(_FLAT_SW9[2])
    address["incode"] = ""
    # Only slot 2 is swapped; every other node is reused from the fixture.
    nodes = [*_FLAT_SW9[:2], address, *_FLAT_SW9[3:]]
    result = parse_detail_postcode(_page_model_html(nodes))
    assert result is None
def test_returns_none_for_non_postcode_pair() -> None:
    """A pair that is not shaped like a postcode yields ``None``.

    ``NOTAPC`` / ``ZZ`` is structurally invalid, so it is expected to be
    rejected instead of being joined into a bogus postcode.
    """
    bogus_address = {**_FLAT_SW9[2], "outcode": "NOTAPC", "incode": "ZZ"}
    nodes = list(_FLAT_SW9)
    nodes[2] = bogus_address
    page = _page_model_html(nodes)
    assert parse_detail_postcode(page) is None
def test_returns_none_without_page_model() -> None:
    """Inputs with no usable page model all yield ``None``.

    Covers, in order: an empty document, HTML with no ``__PAGE_MODEL`` script,
    and a page model whose ``data`` field holds malformed JSON (which must
    degrade gracefully instead of raising).
    """
    broken = '<script>window.__PAGE_MODEL = {"data":"[not json"};</script>'
    unusable_pages = (
        "",
        "<html><body>no model</body></html>",
        broken,
    )
    for page in unusable_pages:
        assert parse_detail_postcode(page) is None