Add more data & fix OOMs

This commit is contained in:
Andras Schmelczer 2026-01-31 14:39:46 +00:00
parent f60fbec9d4
commit a8cc44ea97
8 changed files with 242 additions and 82 deletions

View file

@@ -576,7 +576,7 @@ def transform(input_path: Path) -> pl.LazyFrame:
lf = pl.scan_parquet(input_path)
# Get all unique categories present in the data
-all_categories = lf.select("category").unique().collect().to_series().to_list()
+all_categories = lf.select("category").unique().collect(engine="streaming").to_series().to_list()
# Verify every non-dropped category has a mapping
unmapped = []
@@ -632,7 +632,7 @@ def main():
)
args = parser.parse_args()
-df = transform(args.input).collect()
+df = transform(args.input).collect(engine="streaming")
df.write_parquet(args.output)