Add more data & fix OOMs
This commit is contained in:
parent
f60fbec9d4
commit
a8cc44ea97
8 changed files with 242 additions and 82 deletions
|
|
@@ -60,7 +60,7 @@ def fuzzy_join_on_postcode(
|
|||
.str.to_uppercase()
|
||||
.alias("_left_postcode"),
|
||||
)
|
||||
.collect()
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
|
||||
right_match = (
|
||||
|
|
@@ -74,7 +74,7 @@ def fuzzy_join_on_postcode(
|
|||
.alias("_right_postcode"),
|
||||
)
|
||||
.unique(subset=["_right_address", "_right_postcode"], keep="first")
|
||||
.collect()
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
|
||||
# Group right side by postcode for fast lookup
|
||||
|
|
|
|||
|
|
@@ -157,31 +157,24 @@ def count_pois_within_radius(
|
|||
# Count POIs per postcode
|
||||
postcode_counts = _count_pois_per_postcode(unique_postcodes, pois, radius_km)
|
||||
|
||||
# Write to temp file to avoid memory duplication during join
|
||||
print(" Writing postcode counts to temp file...")
|
||||
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
|
||||
with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp:
|
||||
tmp_path = tmp.name
|
||||
postcode_counts.write_parquet(tmp_path)
|
||||
|
||||
del postcode_counts # Free memory
|
||||
# Join using lazy evaluation
|
||||
print(" Joining counts back to properties (lazy)...")
|
||||
count_cols = [f"{group}_{int(radius_km)}km" for group in POI_GROUPS]
|
||||
|
||||
# Join using lazy evaluation
|
||||
print(" Joining counts back to properties (lazy)...")
|
||||
count_cols = [f"{group}_{int(radius_km)}km" for group in POI_GROUPS]
|
||||
# Convert properties to lazy frame, join, then collect
|
||||
result_lazy = (
|
||||
properties.lazy()
|
||||
.select("postcode")
|
||||
.join(pl.scan_parquet(tmp_path), on="postcode", how="left")
|
||||
.select(count_cols)
|
||||
.fill_null(0)
|
||||
)
|
||||
|
||||
# Convert properties to lazy frame, join, then collect
|
||||
result_lazy = (
|
||||
properties.lazy()
|
||||
.select("postcode")
|
||||
.join(pl.scan_parquet(tmp_path), on="postcode", how="left")
|
||||
.select(count_cols)
|
||||
.fill_null(0)
|
||||
)
|
||||
result_df = result_lazy.collect(engine="streaming")
|
||||
|
||||
result_df = result_lazy.collect()
|
||||
|
||||
# Clean up temp file
|
||||
os.unlink(tmp_path)
|
||||
|
||||
# Extract as dict of Series
|
||||
return {col: result_df[col] for col in count_cols}
|
||||
return {col: result_df[col] for col in count_cols}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue