"""Tests for the Rightmove detail-page postcode extractor.

The search API only returns an outcode-level ``displayAddress``; the property's
TRUE full postcode lives on its detail page inside ``window.__PAGE_MODEL`` as
``propertyData.address.{outcode, incode}``. ``parse_detail_postcode`` recovers
it. These tests build a faithful __PAGE_MODEL: a devalue-style flattened object
graph whose ``data`` field is a JSON STRING of a flat array where every integer
inside a container is an index reference into that same array.
"""
|
|
|
|
import json
|
|
|
|
from rightmove import _extract_page_model_literal, parse_detail_postcode
|
|
|
|
|
|
def _page_model_html(flat: list, *, encoding: str = "json") -> str:
    """Wrap a flattened object-graph array in a realistic detail-page <script>.

    Mirrors the live page: ``window.__PAGE_MODEL = {"data": "<json array>"}``
    where the array is itself JSON-encoded (so its quotes arrive escaped).

    Args:
        flat: The flattened object graph; serialised to a compact JSON string
            and stored under the model's ``data`` key.
        encoding: Value stored under the model's ``encoding`` key.

    Returns:
        A minimal HTML document whose single ``<script>`` assigns the model to
        ``window.__PAGE_MODEL``.
    """
    # The array is dumped twice on purpose: once to become the ``data`` string,
    # and again (as part of ``outer``) so that string's quotes are escaped
    # inside the object literal.
    outer = {"data": json.dumps(flat, separators=(",", ":")), "encoding": encoding}
    return (
        "<html><head></head><body>\n"
        "<script>\n"
        " window.__PAGE_MODEL = " + json.dumps(outer, separators=(",", ":")) + ";\n"
        "</script>\n"
        "</body></html>"
    )
|
|
|
|
|
|
# A faithful slice of a real listing: root -> propertyData -> address, with a
# decoy nearestStations array (which carries NO postcodes on the live page) to
# prove the parser anchors on the property's own address, not a nearby POI.
# Integers held inside a container are index references into this same list;
# the trailing comment on each entry gives that entry's index.
_FLAT_SW9 = [
    {"propertyData": 1},  # 0: root
    {
        "id": "89089584",
        "address": 2,
        "location": 4,
        "nearestStations": 6,
    },  # 1: propertyData
    {
        "displayAddress": "Caldwell Street, Stockwell",
        "countryCode": "GB",
        "ukCountry": "England",
        "outcode": "SW9",
        "incode": "0HD",
    },  # 2: address
    None,  # 3: filler
    {
        "latitude": 51.477238,
        "longitude": -0.116819,
        "pinType": "ACCURATE_POINT",
    },  # 4: location
    None,  # 5: filler
    [7, 8],  # 6: nearestStations (references)
    {"name": "Oval Station", "distance": 0.36},  # 7: station, no postcode
    {"name": "Stockwell Station", "distance": 0.41},  # 8: station, no postcode
]
|
|
|
|
|
|
def test_parses_full_postcode_from_outcode_and_incode() -> None:
    """The address node's outcode and incode come back as one full postcode."""
    html = _page_model_html(_FLAT_SW9)
    assert parse_detail_postcode(html) == "SW9 0HD"
|
|
|
|
|
|
def test_extract_page_model_literal_brace_matches_nested_object() -> None:
    """The extracted literal spans the complete ``__PAGE_MODEL`` object."""
    # The literal must include the whole nested object, not stop at the first
    # closing brace inside the escaped data string.
    html = _page_model_html(_FLAT_SW9)
    literal = _extract_page_model_literal(html)
    assert literal is not None
    assert literal.startswith("{") and literal.endswith("}")
    # Round-trips back to a dict with the expected top-level keys.
    assert set(json.loads(literal)) == {"data", "encoding"}
|
|
|
|
|
|
def test_normalises_unspaced_incode() -> None:
    """Lower-case outcode/incode parts yield an upper-case, spaced postcode."""
    # Copy the fixture so the shared module-level graph is never modified, then
    # swap in an address node whose postcode parts are lower-case.
    flat = [dict(node) if isinstance(node, dict) else node for node in _FLAT_SW9]
    flat[2] = {**_FLAT_SW9[2], "outcode": "e20", "incode": "1fh"}
    assert parse_detail_postcode(_page_model_html(flat)) == "E20 1FH"
|
|
|
|
|
|
def test_returns_none_when_address_missing() -> None:
    """A listing whose propertyData has no ``address`` node yields ``None``."""
    # Here propertyData carries only ``id`` and ``location``; there is no
    # ``address`` entry to read a postcode from. The caller then keeps the
    # coordinate fallback, so we must return None (not raise).
    flat = [
        {"propertyData": 1},
        {"id": "1", "location": 2},
        {"latitude": 51.5, "longitude": -0.1},
    ]
    assert parse_detail_postcode(_page_model_html(flat)) is None
|
|
|
|
|
|
def test_returns_none_when_incode_blank() -> None:
    """An empty ``incode`` string yields ``None`` rather than a partial postcode."""
    # Copy the fixture so the shared module-level graph is never modified.
    flat = [dict(node) if isinstance(node, dict) else node for node in _FLAT_SW9]
    flat[2] = {**_FLAT_SW9[2], "incode": ""}
    assert parse_detail_postcode(_page_model_html(flat)) is None
|
|
|
|
|
|
def test_returns_none_for_non_postcode_pair() -> None:
    """An outcode/incode pair that is not postcode-shaped yields ``None``."""
    # A structurally-invalid outcode/incode pair is rejected by the validator.
    flat = [dict(node) if isinstance(node, dict) else node for node in _FLAT_SW9]
    flat[2] = {**_FLAT_SW9[2], "outcode": "NOTAPC", "incode": "ZZ"}
    assert parse_detail_postcode(_page_model_html(flat)) is None
|
|
|
|
|
|
def test_returns_none_without_page_model() -> None:
    """Empty input, model-free HTML and an unparseable model all yield ``None``."""
    assert parse_detail_postcode("") is None
    assert parse_detail_postcode("<html><body>no model</body></html>") is None
    # Malformed JSON in the data field degrades gracefully.
    broken = '<script>window.__PAGE_MODEL = {"data":"[not json"};</script>'
    assert parse_detail_postcode(broken) is None
|