Improve scraping

This commit is contained in:
Andras Schmelczer 2026-03-29 11:41:35 +01:00
parent 89a85e9a0c
commit c14d28f430
7 changed files with 91 additions and 25 deletions

View file

@ -338,7 +338,25 @@ def _load_checkpoint(
if rpath.exists():
try:
with open(rpath) as f:
loaded_results[source][channel.upper()] = json.load(f)
raw = json.load(f)
# Deduplicate by ID — concurrent workers (e.g. hk_worker's
# ThreadPoolExecutor) can cause in-flight outcodes to have
# results saved before their progress index is recorded.
# On resume those outcodes get re-scraped, duplicating results.
seen_ids: set[str] = set()
deduped: list[dict] = []
for p in raw:
pid = p.get("id")
if pid not in seen_ids:
seen_ids.add(pid)
deduped.append(p)
if len(deduped) < len(raw):
log.info(
"Checkpoint %s/%s: deduped %d%d (removed %d dupes)",
source, channel, len(raw), len(deduped),
len(raw) - len(deduped),
)
loaded_results[source][channel.upper()] = deduped
except Exception:
log.warning(
"Checkpoint results for %s/%s corrupt, restarting %s",