all is well
This commit is contained in:
parent
eac1bd0d13
commit
2f149503bb
53 changed files with 1543 additions and 354 deletions
|
|
@ -19,6 +19,7 @@ from homecouk import load_cookies as load_homecouk_cookies
|
|||
from homecouk import make_client as make_homecouk_client
|
||||
from homecouk import search_outcode as homecouk_search_outcode
|
||||
from http_client import make_client
|
||||
from listing_filters import matches_strict_buy_listing_filter
|
||||
from rightmove import resolve_outcode_id
|
||||
from rightmove import search_outcode as rightmove_search_outcode
|
||||
from spatial import PostcodeSpatialIndex
|
||||
|
|
@ -181,11 +182,11 @@ def _source_names(sources: str | Iterable[str] | None) -> list[str]:
|
|||
requested = [str(source).strip().lower() for source in sources]
|
||||
|
||||
requested = [source for source in requested if source]
|
||||
if "all" in requested:
|
||||
return list(SOURCE_ORDER)
|
||||
unknown = sorted(set(requested) - set(SOURCE_ORDER))
|
||||
unknown = sorted(set(requested) - set(SOURCE_ORDER) - {"all"})
|
||||
if unknown:
|
||||
raise ValueError(f"Unknown source(s): {', '.join(unknown)}")
|
||||
if "all" in requested:
|
||||
return list(SOURCE_ORDER)
|
||||
return [source for source in SOURCE_ORDER if source in requested]
|
||||
|
||||
|
||||
|
|
@ -196,19 +197,28 @@ def _dedup_key(prop: dict) -> tuple:
|
|||
def _merge_properties(source_results: dict[str, list[dict]]) -> tuple[list[dict], dict, int]:
|
||||
merged: dict[str, dict] = {}
|
||||
seen_keys: set[tuple] = set()
|
||||
seen_ids: set[str] = set()
|
||||
counts = {source: 0 for source in SOURCE_ORDER}
|
||||
deduped = 0
|
||||
|
||||
for source in SOURCE_ORDER:
|
||||
for prop in source_results.get(source, []):
|
||||
prop_id = prop.get("id")
|
||||
key = _dedup_key(prop)
|
||||
if (prop_id is not None and prop_id in merged) or key in seen_keys:
|
||||
deduped += 1
|
||||
continue
|
||||
storage_key = prop_id if prop_id is not None else f"{source}:{len(merged)}"
|
||||
if prop_id is not None:
|
||||
prop_id = str(prop_id)
|
||||
if prop_id in seen_ids:
|
||||
deduped += 1
|
||||
continue
|
||||
seen_ids.add(prop_id)
|
||||
storage_key = prop_id
|
||||
else:
|
||||
key = _dedup_key(prop)
|
||||
if key in seen_keys:
|
||||
deduped += 1
|
||||
continue
|
||||
seen_keys.add(key)
|
||||
storage_key = f"{source}:{len(merged)}"
|
||||
merged[storage_key] = prop
|
||||
seen_keys.add(key)
|
||||
counts[source] += 1
|
||||
|
||||
return list(merged.values()), counts, deduped
|
||||
|
|
@ -241,13 +251,22 @@ def _store_properties(
|
|||
if remaining == 0:
|
||||
return 0
|
||||
|
||||
eligible = [prop for prop in props if _property_is_londonish(prop)]
|
||||
dropped = len(props) - len(eligible)
|
||||
if dropped:
|
||||
londonish = [prop for prop in props if _property_is_londonish(prop)]
|
||||
dropped_outside_area = len(props) - len(londonish)
|
||||
if dropped_outside_area:
|
||||
log.debug(
|
||||
"%s dropped %d properties outside the Greater London-ish postcode filter",
|
||||
source,
|
||||
dropped,
|
||||
dropped_outside_area,
|
||||
)
|
||||
|
||||
eligible = [prop for prop in londonish if matches_strict_buy_listing_filter(prop)]
|
||||
dropped_non_matching = len(londonish) - len(eligible)
|
||||
if dropped_non_matching:
|
||||
log.debug(
|
||||
"%s dropped %d properties outside the strict buy-listing filters",
|
||||
source,
|
||||
dropped_non_matching,
|
||||
)
|
||||
|
||||
selected = eligible if remaining is None else eligible[:remaining]
|
||||
|
|
@ -367,20 +386,16 @@ def _scrape_homecouk(
|
|||
log.info("home.co.uk cap reached")
|
||||
return
|
||||
|
||||
remaining = _source_remaining(
|
||||
results, "homecouk", max_properties_per_source
|
||||
)
|
||||
if remaining == 0:
|
||||
log.info("home.co.uk cap reached")
|
||||
return
|
||||
|
||||
for attempt in range(2):
|
||||
try:
|
||||
# home.co.uk cannot express the full filter set at source.
|
||||
# Fetch the outcode page set first; _store_properties applies
|
||||
# the strict filter and source cap after transformation.
|
||||
props = homecouk_search_outcode(
|
||||
client,
|
||||
outcode,
|
||||
pc_index,
|
||||
max_properties=remaining,
|
||||
max_properties=None,
|
||||
)
|
||||
added = _store_properties(
|
||||
results,
|
||||
|
|
@ -442,19 +457,17 @@ def _scrape_zoopla(
|
|||
log.info("Zoopla cap reached")
|
||||
return
|
||||
|
||||
remaining = _source_remaining(results, "zoopla", max_properties_per_source)
|
||||
if remaining == 0:
|
||||
log.info("Zoopla cap reached")
|
||||
return
|
||||
|
||||
for attempt in range(2):
|
||||
try:
|
||||
# Zoopla source-side filters are unverified here. Fetch the
|
||||
# outcode page set first; _store_properties applies the
|
||||
# strict filter and source cap after transformation.
|
||||
props, _ = zoopla_search_outcode(
|
||||
page,
|
||||
outcode,
|
||||
pc_index,
|
||||
pc_coords,
|
||||
max_properties=remaining,
|
||||
max_properties=None,
|
||||
)
|
||||
added = _store_properties(
|
||||
results,
|
||||
|
|
@ -506,9 +519,6 @@ def run_scrape(
|
|||
output_base = Path(output_dir) if output_dir is not None else DATA_DIR
|
||||
output_base.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if "zoopla" in selected_sources and pc_coords is None:
|
||||
pc_coords = build_postcode_coords()
|
||||
|
||||
errors: list[str] = []
|
||||
results = {source: [] for source in SOURCE_ORDER}
|
||||
started_at = time.time()
|
||||
|
|
@ -539,7 +549,8 @@ def run_scrape(
|
|||
)
|
||||
|
||||
if "zoopla" in selected_sources:
|
||||
assert pc_coords is not None
|
||||
if pc_coords is None:
|
||||
pc_coords = build_postcode_coords()
|
||||
_scrape_zoopla(
|
||||
selected_outcodes,
|
||||
pc_index,
|
||||
|
|
@ -551,19 +562,36 @@ def run_scrape(
|
|||
|
||||
merged, source_counts, deduped = _merge_properties(results)
|
||||
output_path = output_base / "online_listings_buy.parquet"
|
||||
write_parquet(merged, output_path)
|
||||
if merged:
|
||||
write_parquet(merged, output_path)
|
||||
else:
|
||||
if output_path.exists():
|
||||
output_path.unlink()
|
||||
log.warning("No strict properties to write to %s", output_path)
|
||||
|
||||
filtered = [prop for prop in merged if matches_strict_buy_listing_filter(prop)]
|
||||
filtered_output_path = output_base / "online_listings_buy_filtered.parquet"
|
||||
if filtered:
|
||||
write_parquet(filtered, filtered_output_path)
|
||||
else:
|
||||
if filtered_output_path.exists():
|
||||
filtered_output_path.unlink()
|
||||
log.warning("No strict-filtered properties to write to %s", filtered_output_path)
|
||||
|
||||
counts = {
|
||||
"total": len(merged),
|
||||
"filtered_total": len(filtered),
|
||||
"deduped": deduped,
|
||||
"sources": source_counts,
|
||||
}
|
||||
source_summary = " ".join(
|
||||
f"{source}:{source_counts[source]}" for source in SOURCE_ORDER
|
||||
)
|
||||
log.info(
|
||||
"Sale scrape complete: %d unique (rightmove:%d homecouk:%d zoopla:%d deduped:%d)",
|
||||
"Sale scrape complete: %d unique, %d strict-filtered (%s deduped:%d)",
|
||||
len(merged),
|
||||
source_counts["rightmove"],
|
||||
source_counts["homecouk"],
|
||||
source_counts["zoopla"],
|
||||
len(filtered),
|
||||
source_summary,
|
||||
deduped,
|
||||
)
|
||||
|
||||
|
|
@ -575,6 +603,7 @@ def run_scrape(
|
|||
},
|
||||
"counts": counts,
|
||||
"path": str(output_path),
|
||||
"filtered_path": str(filtered_output_path),
|
||||
"errors": errors,
|
||||
"elapsed_seconds": round(time.time() - started_at, 3),
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue