scraping and data
This commit is contained in:
parent
d98819b569
commit
8688b7475e
43 changed files with 4920 additions and 531 deletions
|
|
@ -1,13 +1,19 @@
|
|||
from transform import (
|
||||
build_register_address,
|
||||
clean_listing_address,
|
||||
extract_full_postcode,
|
||||
extract_outcode,
|
||||
resolve_listing_postcode,
|
||||
transform_property,
|
||||
)
|
||||
|
||||
|
||||
class StubPostcodeIndex:
    """Test double for a postcode index that always answers with one postcode.

    The postcode is fixed at construction time, so each test can decide what
    the coordinate lookup "finds" without needing any real spatial data.
    """

    def __init__(self, postcode: str = "SW1A 9ZZ") -> None:
        """Remember the postcode that every ``nearest`` call will return."""
        self._postcode = postcode

    def nearest(self, lat: float, lng: float) -> str:
        """Return the configured postcode; ``lat`` and ``lng`` are ignored."""
        # Return the configured value, not a hard-coded literal: tests that
        # build the stub with a different postcode rely on getting it back.
        return self._postcode
|
||||
|
||||
|
||||
def test_extract_full_postcode_normalizes_spacing() -> None:
|
||||
|
|
@ -24,6 +30,46 @@ def test_clean_listing_address_removes_postcode_and_outcode_suffixes() -> None:
|
|||
assert clean_listing_address("Kings Avenue, Bromley") == "Kings Avenue, Bromley"
|
||||
|
||||
|
||||
def test_build_register_address_prepends_house_number_or_name() -> None:
    """Check ``build_register_address`` against (address, number/name, expected) cases."""
    cases = [
        # House number/name prepended, with the trailing outcode/postcode stripped.
        ("South Street, Bromley BR1", "12", "12, South Street, Bromley"),
        (
            "Riverside, Martham NR29",
            "Martham Mill",
            "Martham Mill, Riverside, Martham",
        ),
        # No number/name -> identical to the plain cleaned address.
        ("Kings Avenue, Bromley", None, "Kings Avenue, Bromley"),
        # Already starts with the number/name -> no duplication.
        ("12 South Street, Bromley", "12", "12 South Street, Bromley"),
        # Empty/whitespace number/name is ignored.
        ("Kings Avenue, Bromley", " ", "Kings Avenue, Bromley"),
    ]

    for listing_address, number_or_name, expected in cases:
        assert build_register_address(listing_address, number_or_name) == expected
|
||||
|
||||
|
||||
def test_extract_outcode() -> None:
    """Check the outcode extracted from spaced, lower-case, unspaced and empty inputs."""
    # Inputs carrying a postcode, mapped to the outcode each should yield.
    expected_by_input = {
        "SW1A 2AA": "SW1A",
        "n4 2ha": "N4",
        "SW1A2AA": "SW1A",
    }
    for raw_postcode, outcode in expected_by_input.items():
        assert extract_outcode(raw_postcode) == outcode

    # Missing or empty input yields no outcode at all.
    for absent in (None, ""):
        assert extract_outcode(absent) is None
|
||||
|
||||
|
||||
def test_resolve_listing_postcode() -> None:
    """Check which postcode and source label ``resolve_listing_postcode`` returns."""
    inferred = "SW1A 9ZZ"
    from_coordinates = (inferred, "coordinates")

    # Outcode matches -> trust the more precise extracted postcode.
    matching = resolve_listing_postcode("SW1A 2AA", inferred)
    assert matching == ("SW1A 2AA", "address")

    # Outcode mismatch -> fall back to the spatially-correct inferred postcode.
    mismatching = resolve_listing_postcode("E14 9SS", inferred)
    assert mismatching == from_coordinates

    # Well-formed but fabricated postcode in a different outcode is rejected.
    fabricated = resolve_listing_postcode("ZZ9 9ZZ", inferred)
    assert fabricated == from_coordinates

    # No extracted postcode -> inferred is authoritative.
    nothing_extracted = resolve_listing_postcode(None, inferred)
    assert nothing_extracted == from_coordinates
|
||||
|
||||
|
||||
def test_rightmove_transform_prefers_postcode_from_display_address() -> None:
|
||||
prop = {
|
||||
"id": "123",
|
||||
|
|
@ -46,3 +92,84 @@ def test_rightmove_transform_prefers_postcode_from_display_address() -> None:
|
|||
assert result["Inferred postcode"] == "SW1A 9ZZ"
|
||||
assert result["Listing raw address"] == "Flat 2, 10 Downing Street, SW1A 2AA"
|
||||
assert result["Address per Property Register"] == "Flat 2, 10 Downing Street"
|
||||
|
||||
|
||||
def test_rightmove_transform_rejects_postcode_from_wrong_outcode() -> None:
    """A display-address postcode in another outcode must lose to the inferred one."""
    listing = {
        "id": "124",
        "location": {"latitude": 51.5, "longitude": -0.1},
        "price": {"amount": 750000, "displayPrices": []},
        "propertySubType": "Terraced",
        "bedrooms": 3,
        "bathrooms": 1,
        "keyFeatures": [],
        "propertyUrl": "/properties/124",
        # Address postcode is in a different outcode than the coordinate-nearest one.
        "displayAddress": "10 Downing Street, E14 9SS",
    }
    postcode_index = StubPostcodeIndex()

    result = transform_property(listing, "SW1A", postcode_index)

    assert result is not None
    # The spatially-correct inferred postcode wins over the mismatching extracted one.
    expected_fields = {
        "Postcode": "SW1A 9ZZ",
        "Postcode source": "coordinates",
        "Extracted postcode": "E14 9SS",
    }
    for field, value in expected_fields.items():
        assert result[field] == value
|
||||
|
||||
|
||||
def _rightmove_prop() -> dict:
|
||||
return {
|
||||
"id": "200",
|
||||
"location": {"latitude": 51.5, "longitude": -0.1},
|
||||
"price": {"amount": 750000, "displayPrices": []},
|
||||
"propertySubType": "Terraced",
|
||||
"bedrooms": 3,
|
||||
"bathrooms": 1,
|
||||
"keyFeatures": [],
|
||||
"propertyUrl": "/properties/200",
|
||||
# Search API only ever exposes the outcode in the display address.
|
||||
"displayAddress": "Caldwell Street, Stockwell, SW9",
|
||||
}
|
||||
|
||||
|
||||
def test_rightmove_transform_prefers_detail_postcode() -> None:
    """A detail-page postcode in the location's outcode beats the coordinate guess."""
    # The detail page's true full postcode (same outcode as the location) is
    # preferred over the coordinate-nearest guess.
    listing = _rightmove_prop()
    postcode_index = StubPostcodeIndex("SW9 7AA")

    result = transform_property(
        listing, "SW9", postcode_index, detail_postcode="SW9 0HD"
    )

    assert result is not None
    chosen = (result["Postcode"], result["Postcode source"])
    assert chosen == ("SW9 0HD", "detail_address")
    # The coordinate inference is still surfaced separately.
    assert result["Inferred postcode"] == "SW9 7AA"
|
||||
|
||||
|
||||
def test_rightmove_transform_rejects_detail_postcode_from_wrong_outcode() -> None:
    """A detail-page postcode in a different outcode must lose to the coordinate one."""
    # A detail postcode whose outcode disagrees with the location must not
    # relocate the listing; the coordinate postcode wins instead.
    listing = _rightmove_prop()
    postcode_index = StubPostcodeIndex("SW9 7AA")

    result = transform_property(
        listing, "SW9", postcode_index, detail_postcode="E14 9SS"
    )

    assert result is not None
    chosen = (result["Postcode"], result["Postcode source"])
    assert chosen == ("SW9 7AA", "coordinates")
|
||||
|
||||
|
||||
def test_rightmove_transform_without_detail_keeps_coordinate_logic() -> None:
    """Without a detail postcode the coordinate-nearest postcode is reported."""
    # No detail postcode -> behaviour is unchanged (coordinate-nearest).
    listing = _rightmove_prop()
    postcode_index = StubPostcodeIndex("SW9 7AA")

    result = transform_property(listing, "SW9", postcode_index)

    assert result is not None
    chosen = (result["Postcode"], result["Postcode source"])
    assert chosen == ("SW9 7AA", "coordinates")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue