perfect-postcode/finder/http_client.py
2026-03-15 21:22:28 +00:00

158 lines
5.7 KiB
Python

import logging
import random
import threading
import time
import httpx
from fake_useragent import UserAgent
from constants import MAX_RETRIES, RETRY_BASE_DELAY
from metrics import http_errors_total, http_requests_total, ip_rotations_total
# Module-wide logger; everything in this file logs under the "rightmove" name.
log = logging.getLogger("rightmove")
# Shared User-Agent generator. The filters passed here restrict it to
# Chrome/Edge on Windows/"Mac OS X" with a minimum browser version of 120.
_ua = UserAgent(
browsers=["Chrome", "Edge"], os=["Windows", "Mac OS X"], min_version=120.0
)
def _endpoint_label(url: str) -> str:
if "typeahead" in url:
return "typeahead"
if "search" in url:
return "search"
return "other"
def _status_label(code: int) -> str:
if code >= 500:
return "5xx"
return str(code)
# Gluetun control API — runs on port 8000 inside the gluetun container.
# Since finder uses network_mode: service:gluetun, localhost IS gluetun.
GLUETUN_API = "http://127.0.0.1:8000"
# Held by rotate_ip() for its entire duration, so only one thread at a time
# drives a VPN stop/start cycle; other callers block until it finishes.
_ip_rotate_lock = threading.Lock()
def _put_vpn_status(ctl: httpx.Client, status: str, action: str) -> bool:
    """PUT the desired VPN status to gluetun; return True only on HTTP 200.

    `action` is the verb ("stop" / "start") used in the error log line.
    """
    resp = ctl.put(f"{GLUETUN_API}/v1/vpn/status", json={"status": status})
    if resp.status_code != 200:
        log.error("Failed to %s VPN: %d %s", action, resp.status_code, resp.text)
        return False
    return True


def rotate_ip() -> bool:
    """Bounce the gluetun VPN connection and wait for a different public IP.

    Steps, all performed while holding the module's rotation lock:

    1. Read the current public IP from the gluetun control API (recorded as
       "unknown" if that request does not return 200).
    2. Set the VPN status to "stopped", wait 2 seconds, then set it to
       "running".
    3. Poll the public-IP endpoint every 2 seconds, up to 30 times, until it
       reports a non-empty IP that differs from the one seen in step 1.

    Every outcome is recorded in `ip_rotations_total` with
    result="success" or result="failure".

    Returns:
        True if a new, different public IP was observed; False if the
        stop/start request was rejected, polling timed out, or any
        unexpected exception occurred (this function never raises).
    """
    with _ip_rotate_lock:
        log.info("Rotating VPN IP via gluetun...")
        try:
            with httpx.Client(timeout=10) as ctl:
                # Remember the current IP so we can tell when it has changed.
                old_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
                old_ip = (
                    old_ip_resp.json().get("public_ip", "unknown")
                    if old_ip_resp.status_code == 200
                    else "unknown"
                )
                log.info("Current IP: %s", old_ip)

                # Trigger a reconnect by stopping and then restarting the VPN.
                # NOTE(review): whether gluetun picks a different server on
                # restart depends on its configuration — confirm.
                if not _put_vpn_status(ctl, "stopped", "stop"):
                    ip_rotations_total.labels(result="failure").inc()
                    return False
                time.sleep(2)
                if not _put_vpn_status(ctl, "running", "start"):
                    ip_rotations_total.labels(result="failure").inc()
                    return False

            # Wait for reconnection: up to 30 polls, 2 seconds apart.
            for _ in range(30):
                time.sleep(2)
                try:
                    with httpx.Client(timeout=10) as ctl:
                        new_ip_resp = ctl.get(f"{GLUETUN_API}/v1/publicip/ip")
                    if new_ip_resp.status_code != 200:
                        continue
                    new_ip = new_ip_resp.json().get("public_ip", "")
                except Exception as e:
                    # Best effort: errors are expected while the VPN is still
                    # reconnecting, so keep polling but leave a trace.
                    log.debug("Public IP not available yet: %s", e)
                    continue
                if new_ip and new_ip != old_ip:
                    log.info("IP rotated: %s -> %s", old_ip, new_ip)
                    ip_rotations_total.labels(result="success").inc()
                    return True

            log.warning("IP rotation timed out (may still be same IP)")
            ip_rotations_total.labels(result="failure").inc()
            return False
        except Exception as e:
            log.error("IP rotation failed: %s", e)
            ip_rotations_total.labels(result="failure").inc()
            return False
def make_client() -> httpx.Client:
    """Create a new httpx client configured for JSON API requests.

    The User-Agent is drawn from the module's generator once, at creation
    time, and set as a default header on the client together with an
    ``Accept: application/json`` header. Redirects are followed and the
    timeout is 30 seconds.
    """
    default_headers = {
        "User-Agent": _ua.random,
        "Accept": "application/json",
    }
    return httpx.Client(
        headers=default_headers,
        follow_redirects=True,
        timeout=30,
    )
def _backoff_delay(attempt: int) -> float:
    """Return the exponential backoff, plus up to 1s of jitter, for a 0-based attempt."""
    return RETRY_BASE_DELAY * (2**attempt) + random.uniform(0, 1)


def fetch_with_retry(
    client: httpx.Client, url: str, params: dict | None = None, on_403: bool = True
) -> dict | None:
    """GET `url` and return the decoded JSON body, retrying transient failures.

    Up to MAX_RETRIES attempts are made. An attempt is retried, after an
    exponential backoff with jitter, when the response status is
    429/500/502/503/504 or when the request raises ConnectError,
    ReadTimeout, WriteTimeout or PoolTimeout. No backoff is slept after the
    final attempt, since nothing follows it.

    Args:
        client: The httpx client used to send the request.
        url: The URL to fetch.
        params: Optional query parameters passed through to `client.get`.
        on_403: When True, a 403 response triggers `rotate_ip()`; if the
            rotation succeeds the request is retried from scratch with
            `on_403=False`, so a second 403 is treated as non-retryable.

    Returns:
        The parsed JSON of a 200 response, or None on permanent failure
        (non-retryable status, failed IP rotation, or retries exhausted).
    """
    endpoint = _endpoint_label(url)
    for attempt in range(MAX_RETRIES):
        try:
            resp = client.get(url, params=params)
        # NOTE(review): httpx.ConnectTimeout is not in this set and so is
        # not retried here; it propagates to the caller — confirm intended.
        except (
            httpx.ConnectError,
            httpx.ReadTimeout,
            httpx.WriteTimeout,
            httpx.PoolTimeout,
        ) as e:
            http_errors_total.labels(type=type(e).__name__).inc()
            failure = type(e).__name__
        else:
            http_requests_total.labels(
                status=_status_label(resp.status_code), endpoint=endpoint
            ).inc()
            if resp.status_code == 200:
                return resp.json()
            if resp.status_code == 403 and on_403:
                log.warning("HTTP 403 — IP likely blocked, rotating...")
                if rotate_ip():
                    # Retry once with new IP (but don't recurse on 403 again)
                    return fetch_with_retry(client, url, params, on_403=False)
                log.error("IP rotation failed, giving up on %s", url)
                return None
            if resp.status_code not in (429, 500, 502, 503, 504):
                log.error("HTTP %d from %s (non-retryable)", resp.status_code, url)
                return None
            failure = f"HTTP {resp.status_code}"

        if attempt == MAX_RETRIES - 1:
            # Final attempt failed: record why, but don't sleep for (or
            # announce) a retry that will never happen.
            log.warning(
                "%s from %s on final attempt %d/%d",
                failure,
                url,
                attempt + 1,
                MAX_RETRIES,
            )
            break

        delay = _backoff_delay(attempt)
        log.warning(
            "%s from %s, retry %d/%d in %.1fs",
            failure,
            url,
            attempt + 1,
            MAX_RETRIES,
            delay,
        )
        time.sleep(delay)

    http_errors_total.labels(type="retry_exhausted").inc()
    log.error("All %d retries exhausted for %s", MAX_RETRIES, url)
    return None