Improve scraping
This commit is contained in:
parent
89a85e9a0c
commit
c14d28f430
7 changed files with 91 additions and 25 deletions
|
|
@ -338,7 +338,25 @@ def _load_checkpoint(
|
|||
if rpath.exists():
|
||||
try:
|
||||
with open(rpath) as f:
|
||||
loaded_results[source][channel.upper()] = json.load(f)
|
||||
raw = json.load(f)
|
||||
# Deduplicate by ID — concurrent workers (e.g. hk_worker's
|
||||
# ThreadPoolExecutor) can cause in-flight outcodes to have
|
||||
# results saved before their progress index is recorded.
|
||||
# On resume those outcodes get re-scraped, duplicating results.
|
||||
seen_ids: set[str] = set()
|
||||
deduped: list[dict] = []
|
||||
for p in raw:
|
||||
pid = p.get("id")
|
||||
if pid not in seen_ids:
|
||||
seen_ids.add(pid)
|
||||
deduped.append(p)
|
||||
if len(deduped) < len(raw):
|
||||
log.info(
|
||||
"Checkpoint %s/%s: deduped %d → %d (removed %d dupes)",
|
||||
source, channel, len(raw), len(deduped),
|
||||
len(raw) - len(deduped),
|
||||
)
|
||||
loaded_results[source][channel.upper()] = deduped
|
||||
except Exception:
|
||||
log.warning(
|
||||
"Checkpoint results for %s/%s corrupt, restarting %s",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue