Extract utils

This commit is contained in:
Andras Schmelczer 2026-01-31 10:49:43 +00:00
parent 0153e46478
commit e1b38a1b95
8 changed files with 458 additions and 25 deletions

View file

@ -8,6 +8,7 @@ from tqdm import tqdm
from .config import DESTINATIONS, MAX_CONCURRENT, MAX_POSTCODES, OUTPUT_DIR, MAX_DISTANCE_KM
from .results import CheckpointSaver, results_to_dataframe, save_results
from .tfl_client import fetch_journey_times
from pipeline.utils import haversine_km_expr
def main():
@ -28,31 +29,9 @@ def main():
postcodes_df = pl.read_parquet(OUTPUT_DIR / "postcodes_h3.parquet")
print(f"Loaded {postcodes_df.height:,} postcodes")
# Filter to postcodes within 150km of destination using Haversine formula
earth_radius_km = 6371
dest_lat_rad = destination.lat * 3.14159265359 / 180
dest_lon_rad = destination.lon * 3.14159265359 / 180
# Filter to postcodes within range of destination
postcodes_df = postcodes_df.with_columns(
(
2
* earth_radius_km
* (
(
((pl.lit(dest_lat_rad) - pl.col("lat") * 3.14159265359 / 180) / 2).sin()
** 2
+ pl.lit(dest_lat_rad).cos()
* (pl.col("lat") * 3.14159265359 / 180).cos()
* (
(pl.lit(dest_lon_rad) - pl.col("long") * 3.14159265359 / 180) / 2
).sin()
** 2
)
.sqrt()
.arcsin()
)
).alias("distance_km")
haversine_km_expr("lat", "long", destination.lat, destination.lon).alias("distance_km")
).filter(pl.col("distance_km") <= MAX_DISTANCE_KM)
print(f"Filtered to {postcodes_df.height:,} postcodes within {MAX_DISTANCE_KM}km")