perfect-postcode/r5-java/src/main/java/propertymap/App.java
2026-05-13 12:11:54 +01:00

378 lines
16 KiB
Java

package propertymap;
import com.conveyal.r5.transit.TransportNetwork;
import org.duckdb.DuckDBConnection;
import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
/**
* Batch-compute travel times from each origin (place) to nearby postcodes
* for all transport modes (car, bicycle, walking, transit).
*
* Each origin is spatially pre-filtered to only route to postcodes within a
* plausible travel radius for the mode. Output is sparse: only reachable
* postcodes are written (unreachable = absent from file).
*
* Output per mode: one parquet file per origin in {output-dir}/{mode}/{index}-{slug}.parquet
* (six-digit zero-padded origin index plus slugified place name), with columns
* (pcds VARCHAR, travel_minutes SMALLINT). Transit mode additionally
* includes a best_minutes SMALLINT column (5th percentile = best-case departure timing)
* and a journey VARCHAR column with JSON leg instructions.
*/
public class App {
/** Travel modes computed in a normal run; main processes them in this array order. */
private static final String[] MODES = {"bicycle", "transit", "walking", "car"};
/** Modes computed when --demo is passed. */
private static final String[] DEMO_MODES = {"transit"};
/** Place names that make up the demo set; matched exactly against the loaded origin names. */
private static final Set<String> DEMO_PLACES = Set.of(
"Bank tube station", "Tottenham Court Road tube station");
/** Extra attempts after a first failure: each origin is tried at most MAX_RETRIES + 1 times. */
private static final int MAX_RETRIES = 2;
/**
 * Command-line entry point: loads the transport network, postcodes and places, then runs
 * every selected mode over every selected origin on one shared thread pool.
 *
 * <p>Arguments: {@code --postcodes FILE --places FILE --output-dir DIR [--threads N] [--demo]}.
 * The environment variables {@code DATA_DIR} and {@code NETWORK_CACHE_DIR} are required.
 * With {@code --demo}, only the places named in {@code DEMO_PLACES} and the modes in
 * {@code DEMO_MODES} are processed, and existing outputs are recomputed instead of skipped.
 *
 * @param args command-line arguments as described above
 * @throws IllegalArgumentException if {@code --threads} is less than 1
 * @throws Exception anything propagated from loading, validation or {@code processMode}
 */
public static void main(String[] args) throws Exception {
    String postcodesPath = requiredArg(args, "--postcodes");
    String placesPath = requiredArg(args, "--places");
    String outputDirStr = requiredArg(args, "--output-dir");
    int threads = Integer.parseInt(optionalArg(args, "--threads", "4"));
    if (threads < 1) {
        // Reject this before the expensive loads below; newFixedThreadPool would only
        // throw (without a useful message) after all the data has been read.
        throw new IllegalArgumentException("--threads must be at least 1, got " + threads);
    }
    boolean enablePaths = true;
    boolean demo = hasFlag(args, "--demo");
    Path outDir = Paths.get(outputDirStr);
    Files.createDirectories(outDir);
    LocalDate today = LocalDate.now();
    TransportNetwork network = Router.loadNetwork(requiredEnv("DATA_DIR"), requiredEnv("NETWORK_CACHE_DIR"));
    Router.validateTransitServices(network, today);

    System.err.println("Loading postcodes (England only)...");
    Parquet.Postcodes postcodes = Parquet.loadEnglandPostcodes(
            postcodesPath, outDir.resolve("postcodes_ref.parquet"));
    System.err.printf(" %,d postcodes%n", postcodes.lats().length);

    System.err.println("Loading places (deduplicated)...");
    Parquet.Places places = Parquet.loadPlaces(placesPath, outDir.resolve("places_ref.parquet"));
    String[] originNames = places.names();
    double[] originLats = places.lats(), originLons = places.lons();
    int nOrigins = originLats.length;
    System.err.printf(" %,d travel-eligible places%n", nOrigins);

    // Filter places to England only (must be near at least one England postcode)
    Set<Integer> englandIndices = filterEnglandPlaces(
            originLats, originLons, postcodes.lats(), postcodes.lons());
    int excluded = nOrigins - englandIndices.size();
    if (excluded > 0) {
        System.err.printf(" %,d places excluded (non-England), %,d remaining%n",
                excluded, englandIndices.size());
    }

    // In demo mode, filter to just Bank + TCR and transit only
    int[] originIndices;
    String[] modes;
    if (demo) {
        List<Integer> demoIdx = new ArrayList<>();
        for (int i = 0; i < nOrigins; i++) {
            if (DEMO_PLACES.contains(originNames[i])) demoIdx.add(i);
        }
        originIndices = demoIdx.stream().mapToInt(Integer::intValue).toArray();
        modes = DEMO_MODES;
        System.err.printf("DEMO MODE: %d places (transit only)%n", originIndices.length);
        for (int i : originIndices) System.err.printf(" - %s%n", originNames[i]);
    } else {
        // Normal mode: use all travel-eligible England places
        originIndices = englandIndices.stream().sorted()
                .mapToInt(Integer::intValue).toArray();
        modes = MODES;
    }

    // One thread pool shared across all modes
    ExecutorService pool = Executors.newFixedThreadPool(threads);
    // One DuckDB connection per thread, reused across all writes. Every connection that
    // gets opened is also recorded here so it can be closed once the pool has drained.
    List<DuckDBConnection> openedConns = new ArrayList<>();
    ThreadLocal<DuckDBConnection> threadConn = ThreadLocal.withInitial(() -> {
        try {
            DuckDBConnection conn = Parquet.connect();
            synchronized (openedConns) {
                openedConns.add(conn);
            }
            return conn;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    });
    if (enablePaths) System.err.println("Path recording ENABLED (transit only, ~20x slower)");
    try {
        for (String mode : modes) {
            processMode(network, postcodes.codes(), postcodes.lats(), postcodes.lons(),
                    originNames, originLats, originLons, outDir, mode, today, pool, threadConn, enablePaths,
                    originIndices, !demo);
        }
    } finally {
        pool.shutdown();
        pool.awaitTermination(Long.MAX_VALUE, TimeUnit.MILLISECONDS);
        // Workers are finished at this point, so no connection is still in use.
        synchronized (openedConns) {
            for (DuckDBConnection conn : openedConns) {
                try {
                    conn.close();
                } catch (Exception e) {
                    // Best effort: a close failure must not mask the real outcome of the run.
                    System.err.println("Failed to close DuckDB connection: " + e.getMessage());
                }
            }
        }
    }
}
/**
 * Compute and write travel times for one transport mode across the given origins,
 * submitting one task per origin to the shared pool and blocking until all have finished.
 *
 * <p>When {@code skipCompleted} is set, an origin is skipped if a non-empty parquet file
 * with the same slug already exists in the mode directory. Matching is by slug only, so
 * re-indexed places are not recomputed; note that two places whose names slugify
 * identically are therefore indistinguishable to the resume check.
 *
 * @param originIndices origin indices to process.
 * @param skipCompleted if true, skip origins that already have output files.
 * @throws IllegalStateException if at least one origin failed, since outputs are then incomplete
 * @throws Exception on directory creation/scan errors or interruption while waiting
 */
private static void processMode(
        TransportNetwork network,
        String[] postcodes, double[] postcodeLats, double[] postcodeLons,
        String[] originNames, double[] originLats, double[] originLons,
        Path outDir, String mode, LocalDate date,
        ExecutorService pool, ThreadLocal<DuckDBConnection> threadConn,
        boolean enablePaths, int[] originIndices, boolean skipCompleted) throws Exception {
    // Locale.ROOT keeps the banner stable under locales with special casing rules.
    System.err.printf("%n=== %s ===%n", mode.toUpperCase(Locale.ROOT));
    System.err.printf(" Radius: %.0f km%n", Router.maxRadiusKm(mode));
    Path modeDir = outDir.resolve(mode);
    Files.createDirectories(modeDir);

    // Scan existing slugs once (O(directory)) instead of per-origin stat calls.
    // This matches by slug regardless of numeric prefix, so re-indexed places.parquet
    // won't cause duplicate computation.
    Set<String> existingSlugs = skipCompleted ? scanExistingSlugs(modeDir) : Set.of();
    List<Integer> remaining = new ArrayList<>();
    for (int idx : originIndices) {
        if (skipCompleted && existingSlugs.contains(slugFromName(originNames[idx]))) {
            continue;
        }
        remaining.add(idx);
    }
    int alreadyDone = originIndices.length - remaining.size();
    System.err.printf(" %,d done, %,d remaining%n", alreadyDone, remaining.size());
    if (remaining.isEmpty()) {
        System.err.println(" All origins completed for this mode!");
        return;
    }

    long startMs = System.currentTimeMillis();
    int total = remaining.size();
    AtomicInteger completed = new AtomicInteger(0);
    AtomicInteger failed = new AtomicInteger(0);

    // Progress reporter on a timer instead of per-task stderr writes
    ScheduledExecutorService reporter = Executors.newSingleThreadScheduledExecutor(r -> {
        Thread t = new Thread(r, "progress");
        t.setDaemon(true);
        return t;
    });
    try {
        reporter.scheduleAtFixedRate(() -> {
            int c = completed.get();
            if (c == 0) return; // no rate to report yet; also avoids dividing by zero
            double secs = (System.currentTimeMillis() - startMs) / 1000.0;
            double rate = c / secs;
            double etaH = (total - c) / rate / 3600;
            System.err.printf("\r [%,d/%,d] %.1f/s | ETA %.1fh | fail %d",
                    c, total, rate, etaH, failed.get());
        }, 2, 2, TimeUnit.SECONDS);

        CountDownLatch latch = new CountDownLatch(remaining.size());
        for (int idx : remaining) {
            pool.submit(() -> {
                try {
                    processOrigin(network, postcodes, postcodeLats, postcodeLons,
                            originLats[idx], originLons[idx],
                            modeDir, mode, date, idx, originNames[idx], threadConn.get(), enablePaths);
                    completed.incrementAndGet();
                } catch (Exception e) {
                    failed.incrementAndGet();
                    System.err.printf("%n [FAIL] origin %s: %s%n", originNames[idx], e.getMessage());
                } catch (Throwable t) {
                    // Errors (e.g. OutOfMemoryError) would otherwise be captured by the
                    // Future nobody inspects, leaving this origin counted as neither done
                    // nor failed and letting the mode finish "successfully" without it.
                    failed.incrementAndGet();
                    System.err.printf("%n [FAIL] origin %s: %s%n", originNames[idx], t);
                } finally {
                    latch.countDown();
                }
            });
        }
        latch.await();
    } finally {
        // Stop the timer even when submission or the wait above is cut short.
        reporter.shutdown();
    }
    reporter.awaitTermination(5, TimeUnit.SECONDS);

    double elapsedH = (System.currentTimeMillis() - startMs) / 3_600_000.0;
    int n = completed.get();
    System.err.printf("\r [%,d/%,d] %.1f/s | %.1fh | fail %d%n",
            n, total, n / Math.max(elapsedH * 3600, 1), elapsedH, failed.get());
    int failures = failed.get();
    if (failures > 0) {
        throw new IllegalStateException(String.format(
                "%s travel-time generation failed for %,d of %,d origins; outputs are incomplete",
                mode, failures, total));
    }
}
/**
 * Compute and write travel times for a single origin, retrying on failure.
 *
 * <p>Up to {@code MAX_RETRIES + 1} attempts are made. Only postcodes with a non-negative
 * time are written (sparse output). When the routing result carries best-case times the
 * transit writer is used (times, best times, journeys); otherwise the plain writer is used.
 * If an attempt fails after writing has begun, the output file is deleted so that a
 * truncated file cannot later be mistaken for a completed origin by the resume scan.
 *
 * @param index origin index, used as the numeric filename prefix
 * @param name  origin place name, slugified into the filename and used in log lines
 * @param conn  DuckDB connection used for the write
 * @throws Exception the error from the last attempt once all attempts have failed
 */
private static void processOrigin(
        TransportNetwork network,
        String[] postcodes, double[] postcodeLats, double[] postcodeLons,
        double originLat, double originLon,
        Path modeDir, String mode, LocalDate date, int index, String name,
        DuckDBConnection conn, boolean enablePaths) throws Exception {
    Path outPath = modeDir.resolve(originFilename(index, name));
    Exception lastError = null;
    for (int attempt = 0; attempt <= MAX_RETRIES; attempt++) {
        // Tracks whether this attempt may already have touched outPath.
        boolean writeStarted = false;
        try {
            Router.FilteredResult result = Router.computeForOrigin(
                    network, postcodeLats, postcodeLons,
                    originLat, originLon, mode, date, enablePaths);

            // Write only reachable postcodes (sparse output)
            int reachable = 0;
            for (short t : result.times()) if (t >= 0) reachable++;
            String[] codes = new String[reachable];
            short[] times = new short[reachable];
            short[] bestTimes = result.bestTimes() != null ? new short[reachable] : null;
            String[] journeys = result.journeys() != null ? new String[reachable] : null;
            int j = 0;
            for (int i = 0; i < result.times().length; i++) {
                if (result.times()[i] >= 0) {
                    // originalIndices maps the filtered result row back to the full postcode list.
                    codes[j] = postcodes[result.originalIndices()[i]];
                    times[j] = result.times()[i];
                    if (bestTimes != null) bestTimes[j] = result.bestTimes()[i];
                    if (journeys != null) journeys[j] = result.journeys()[i]; // may be null for some postcodes
                    j++;
                }
            }

            writeStarted = true;
            if (bestTimes != null) {
                Parquet.writeTransitTravelTimes(conn, outPath, codes, times, bestTimes, journeys);
            } else {
                Parquet.writeTravelTimes(conn, outPath, codes, times);
            }
            return;
        } catch (Exception e) {
            if (writeStarted) {
                // A failed write can leave a truncated but non-empty file behind; the
                // resume scan treats any non-empty .parquet as a finished origin.
                try {
                    Files.deleteIfExists(outPath);
                } catch (IOException cleanupError) {
                    e.addSuppressed(cleanupError);
                }
            }
            lastError = e;
            if (attempt < MAX_RETRIES) {
                System.err.printf("%n [RETRY %d/%d] %s: %s%n",
                        attempt + 1, MAX_RETRIES, name, e.getMessage());
            } else {
                System.err.printf("%n [FAIL TRACE] %s:%n", name);
                e.printStackTrace(System.err);
            }
        }
    }
    throw lastError;
}
/**
 * Build the output filename for an origin: six-digit zero-padded index, a hyphen, the
 * slugified place name, and the {@code .parquet} extension. The index prefix prevents
 * collisions between names that sanitize to the same slug.
 *
 * @param index origin index
 * @param name  origin place name
 * @return the filename (no directory component)
 */
private static String originFilename(int index, String name) {
    // Locale.ROOT pins the index to ASCII digits; the default-locale overload may
    // localize digits, which would make filenames depend on the machine's settings.
    return String.format(Locale.ROOT, "%06d-%s.parquet", index, slugFromName(name));
}
/**
 * Slugify a place name: lowercase it, drop every character other than {@code a-z},
 * {@code 0-9}, space and hyphen, then replace each run of spaces with a single hyphen.
 *
 * <p>Lowercasing uses {@link Locale#ROOT} so the slug is identical on every machine. With
 * the default locale, a Turkish JVM would lowercase {@code I} to a dotless i that the
 * filter then removes, producing different filenames and breaking resume.
 *
 * @param name place name
 * @return the slug; may be empty if the name contains no retained characters
 */
private static String slugFromName(String name) {
    return name.toLowerCase(Locale.ROOT)
            .replaceAll("[^a-z0-9 -]", "")
            .replaceAll("\\s+", "-");
}
/**
 * Collect the slugs of all non-empty {@code *.parquet} files in a mode directory.
 *
 * <p>A leading all-digit token followed by a hyphen is treated as the index prefix and
 * removed, so resume keeps working when indices change but slugs do not. Filenames
 * without such a prefix contribute their whole stem.
 *
 * @param modeDir directory to scan; a path that is not a directory yields an empty set
 * @return mutable set of slugs found
 * @throws IOException if listing the directory or reading a file size fails
 */
private static Set<String> scanExistingSlugs(Path modeDir) throws IOException {
    Set<String> found = new HashSet<>();
    if (Files.isDirectory(modeDir)) {
        try (DirectoryStream<Path> entries = Files.newDirectoryStream(modeDir, "*.parquet")) {
            for (Path entry : entries) {
                if (Files.size(entry) <= 0) {
                    continue; // empty files do not count as completed output
                }
                String stem = entry.getFileName().toString().replace(".parquet", "");
                found.add(stripIndexPrefix(stem));
            }
        }
    }
    return found;
}

/** Drop a leading "digits-" prefix from a filename stem; return the stem unchanged if there is none. */
private static String stripIndexPrefix(String stem) {
    int dash = stem.indexOf('-');
    if (dash <= 0) {
        return stem;
    }
    for (int k = 0; k < dash; k++) {
        if (!Character.isDigit(stem.charAt(k))) {
            return stem;
        }
    }
    return stem.substring(dash + 1);
}
/**
 * Return the value following the first occurrence of {@code name} in {@code args}.
 * If no such pair exists, print an error plus usage to stderr and exit with status 1.
 *
 * @param args command-line arguments
 * @param name option name to look for, e.g. {@code --postcodes}
 * @return the argument immediately after {@code name}
 */
private static String requiredArg(String[] args, String name) {
    // Walk the candidate value positions; the option itself sits one slot earlier.
    for (int valueAt = 1; valueAt < args.length; valueAt++) {
        if (args[valueAt - 1].equals(name)) {
            return args[valueAt];
        }
    }
    System.err.println("Missing required argument: " + name);
    System.err.println("Usage: App --postcodes FILE --places FILE --output-dir DIR [--threads N] [--demo]");
    System.exit(1);
    return null; // unreachable
}
/**
 * Report whether {@code name} appears anywhere in {@code args}.
 *
 * @param args command-line arguments
 * @param name flag to look for, e.g. {@code --demo}
 * @return true if some argument equals {@code name}
 */
private static boolean hasFlag(String[] args, String name) {
    boolean present = false;
    for (int i = 0; i < args.length && !present; i++) {
        present = args[i].equals(name);
    }
    return present;
}
/**
 * Return the value following the first occurrence of {@code name} in {@code args},
 * or {@code defaultValue} when the option is absent or has no following argument.
 *
 * @param args         command-line arguments
 * @param name         option name to look for, e.g. {@code --threads}
 * @param defaultValue value to fall back on
 * @return the option's value or the default
 */
private static String optionalArg(String[] args, String name, String defaultValue) {
    String value = defaultValue;
    // Walk the candidate value positions; the option itself sits one slot earlier.
    for (int valueAt = 1; valueAt < args.length; valueAt++) {
        if (args[valueAt - 1].equals(name)) {
            value = args[valueAt];
            break;
        }
    }
    return value;
}
/**
 * Pick out the places that lie near at least one England postcode.
 *
 * <p>Postcodes are bucketed onto a 0.1° grid (~11km cells). A place is kept when its own
 * cell, or any of the eight cells around it, holds a postcode.
 *
 * @param placeLats    place latitudes
 * @param placeLons    place longitudes, parallel to {@code placeLats}
 * @param postcodeLats England postcode latitudes
 * @param postcodeLons England postcode longitudes, parallel to {@code postcodeLats}
 * @return indices into the place arrays of the places that were kept
 */
private static Set<Integer> filterEnglandPlaces(
        double[] placeLats, double[] placeLons,
        double[] postcodeLats, double[] postcodeLons) {
    // Cells that contain at least one England postcode.
    Set<Long> occupied = new HashSet<>();
    for (int p = 0; p < postcodeLats.length; p++) {
        occupied.add(gridCell(postcodeLats[p], postcodeLons[p]));
    }

    Set<Integer> kept = new HashSet<>();
    for (int place = 0; place < placeLats.length; place++) {
        int row = (int) Math.floor(placeLats[place] * 10);
        int col = (int) Math.floor(placeLons[place] * 10);
        if (touchesOccupiedCell(occupied, row, col)) {
            kept.add(place);
        }
    }
    return kept;
}

/** True if the cell at (row, col) or any of its eight neighbours is in {@code occupied}. */
private static boolean touchesOccupiedCell(Set<Long> occupied, int row, int col) {
    for (int dRow = -1; dRow <= 1; dRow++) {
        for (int dCol = -1; dCol <= 1; dCol++) {
            if (occupied.contains(packGrid(row + dRow, col + dCol))) {
                return true;
            }
        }
    }
    return false;
}
/**
 * Map a coordinate to the packed key of its 0.1° grid cell.
 *
 * @param lat latitude in degrees
 * @param lon longitude in degrees
 * @return packed cell key as produced by {@code packGrid}
 */
private static long gridCell(double lat, double lon) {
    int gLat = (int) Math.floor(lat * 10);
    int gLon = (int) Math.floor(lon * 10);
    return packGrid(gLat, gLon);
}
/**
 * Pack two grid indices into one long: {@code gLat} in the upper 32 bits and
 * {@code gLon}, taken as an unsigned 32-bit value, in the lower 32 bits.
 *
 * @param gLat latitude grid index
 * @param gLon longitude grid index
 * @return the combined key
 */
private static long packGrid(int gLat, int gLon) {
    long upper = ((long) gLat) << 32;
    // Mask so a negative gLon does not sign-extend into the upper half.
    long lower = gLon & 0xFFFFFFFFL;
    return upper | lower;
}
/**
 * Read an environment variable that must be set. If it is not set, print an error to
 * stderr and exit with status 1.
 *
 * @param name environment variable name
 * @return the variable's value
 */
private static String requiredEnv(String name) {
    String value = System.getenv(name);
    if (value != null) {
        return value;
    }
    System.err.println("Missing required environment variable: " + name);
    System.exit(1);
    return value;
}
}