166 lines
7.5 KiB
Java
166 lines
7.5 KiB
Java
package propertymap;
|
|
|
|
import org.duckdb.DuckDBAppender;
|
|
import org.duckdb.DuckDBConnection;
|
|
|
|
import java.nio.file.Files;
|
|
import java.nio.file.Path;
|
|
import java.nio.file.StandardCopyOption;
|
|
import java.sql.DriverManager;
|
|
import java.sql.ResultSet;
|
|
import java.sql.Statement;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
/** DuckDB-based parquet I/O. */
|
|
public class Parquet {
|
|
|
|
record Postcodes(String[] codes, double[] lats, double[] lons) {}
|
|
record Places(String[] names, double[] lats, double[] lons) {}
|
|
|
|
static {
|
|
try { Class.forName("org.duckdb.DuckDBDriver"); }
|
|
catch (ClassNotFoundException e) { throw new RuntimeException(e); }
|
|
}
|
|
|
|
/** Escape a file path for safe interpolation into DuckDB SQL (double single quotes). */
|
|
private static String escapePath(String path) {
|
|
return path.replace("'", "''");
|
|
}
|
|
|
|
/** Load England postcodes, write reference parquet, return codes + flat lat/lon arrays. */
|
|
static Postcodes loadEnglandPostcodes(String parquetPath, Path refOut) throws Exception {
|
|
try (DuckDBConnection conn = connect(); Statement stmt = conn.createStatement()) {
|
|
stmt.execute("CREATE TABLE postcodes AS SELECT pcds, lat, \"long\" FROM read_parquet('"
|
|
+ escapePath(parquetPath) + "') WHERE ctry25cd = 'E92000001' AND doterm IS NULL");
|
|
copyToParquet(stmt, "SELECT * FROM postcodes", refOut);
|
|
|
|
try (ResultSet rs = stmt.executeQuery("SELECT COUNT(*) FROM postcodes")) {
|
|
rs.next();
|
|
int n = rs.getInt(1);
|
|
String[] codes = new String[n];
|
|
double[] lats = new double[n];
|
|
double[] lons = new double[n];
|
|
|
|
try (ResultSet data = stmt.executeQuery("SELECT pcds, lat, \"long\" FROM postcodes")) {
|
|
int i = 0;
|
|
while (data.next()) {
|
|
codes[i] = data.getString(1);
|
|
lats[i] = data.getDouble(2);
|
|
lons[i] = data.getDouble(3);
|
|
i++;
|
|
}
|
|
}
|
|
return new Postcodes(codes, lats, lons);
|
|
}
|
|
}
|
|
}
|
|
|
|
/** Load places deduplicated by lat/lon, write reference parquet, return names + flat lat/lon arrays. */
|
|
static Places loadPlaces(String parquetPath, Path refOut) throws Exception {
|
|
try (DuckDBConnection conn = connect(); Statement stmt = conn.createStatement()) {
|
|
stmt.execute("CREATE TABLE all_places AS SELECT * FROM read_parquet('"
|
|
+ escapePath(parquetPath) + "')");
|
|
boolean hasTravelDestination = tableHasColumn(stmt, "all_places", "travel_destination");
|
|
String source = hasTravelDestination
|
|
? "(SELECT * FROM all_places WHERE COALESCE(travel_destination, true))"
|
|
: "all_places";
|
|
stmt.execute("CREATE TABLE places AS SELECT * EXCLUDE (rn) FROM ("
|
|
+ "SELECT *, ROW_NUMBER() OVER (PARTITION BY lat, lon) AS rn "
|
|
+ "FROM " + source + " AS p) WHERE rn = 1");
|
|
copyToParquet(stmt, "SELECT * FROM places", refOut);
|
|
|
|
try (ResultSet rs = stmt.executeQuery("SELECT COUNT(*) FROM places")) {
|
|
rs.next();
|
|
int n = rs.getInt(1);
|
|
String[] names = new String[n];
|
|
double[] lats = new double[n];
|
|
double[] lons = new double[n];
|
|
|
|
try (ResultSet data = stmt.executeQuery("SELECT name, lat, lon FROM places")) {
|
|
int i = 0;
|
|
while (data.next()) {
|
|
names[i] = data.getString(1);
|
|
lats[i] = data.getDouble(2);
|
|
lons[i] = data.getDouble(3);
|
|
i++;
|
|
}
|
|
}
|
|
return new Places(names, lats, lons);
|
|
}
|
|
}
|
|
}
|
|
|
|
private static boolean tableHasColumn(Statement stmt, String tableName, String columnName)
|
|
throws Exception {
|
|
try (ResultSet rs = stmt.executeQuery(
|
|
"SELECT COUNT(*) FROM information_schema.columns "
|
|
+ "WHERE table_name = '" + tableName + "' "
|
|
+ "AND column_name = '" + columnName + "'")) {
|
|
rs.next();
|
|
return rs.getInt(1) > 0;
|
|
}
|
|
}
|
|
|
|
/** Write postcode travel times as a ZSTD-compressed parquet (atomic via tmp + rename). */
|
|
static void writeTravelTimes(DuckDBConnection conn, Path outPath, String[] postcodes, short[] times)
|
|
throws Exception {
|
|
Path tmp = outPath.resolveSibling(outPath.getFileName() + ".tmp");
|
|
try (Statement stmt = conn.createStatement()) {
|
|
stmt.execute("DROP TABLE IF EXISTS t");
|
|
stmt.execute("CREATE TABLE t (pcds VARCHAR, travel_minutes SMALLINT)");
|
|
}
|
|
try (DuckDBAppender appender = conn.createAppender("main", "t")) {
|
|
for (int i = 0; i < postcodes.length; i++) {
|
|
appender.beginRow();
|
|
appender.append(postcodes[i]);
|
|
appender.append(times[i]);
|
|
appender.endRow();
|
|
}
|
|
}
|
|
try (Statement stmt = conn.createStatement()) {
|
|
stmt.execute("COPY t TO '" + escapePath(tmp.toAbsolutePath().toString()) + "' (FORMAT PARQUET, COMPRESSION ZSTD)");
|
|
}
|
|
Files.move(tmp, outPath, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
|
|
}
|
|
|
|
/**
|
|
* Write transit travel times with median, best-case, and optional journey columns.
|
|
* @param journeys may be null (no journey column written) or non-null (journey VARCHAR added, individual elements may be null)
|
|
*/
|
|
static void writeTransitTravelTimes(DuckDBConnection conn, Path outPath,
|
|
String[] postcodes, short[] times, short[] bestTimes, String[] journeys) throws Exception {
|
|
Path tmp = outPath.resolveSibling(outPath.getFileName() + ".tmp");
|
|
boolean hasJourneys = journeys != null;
|
|
try (Statement stmt = conn.createStatement()) {
|
|
stmt.execute("DROP TABLE IF EXISTS t");
|
|
stmt.execute(hasJourneys
|
|
? "CREATE TABLE t (pcds VARCHAR, travel_minutes SMALLINT, best_minutes SMALLINT, journey VARCHAR)"
|
|
: "CREATE TABLE t (pcds VARCHAR, travel_minutes SMALLINT, best_minutes SMALLINT)");
|
|
}
|
|
try (DuckDBAppender appender = conn.createAppender("main", "t")) {
|
|
for (int i = 0; i < postcodes.length; i++) {
|
|
appender.beginRow();
|
|
appender.append(postcodes[i]);
|
|
appender.append(times[i]);
|
|
appender.append(bestTimes[i]);
|
|
if (hasJourneys) appender.append(journeys[i]); // null-safe: DuckDB appends SQL NULL
|
|
appender.endRow();
|
|
}
|
|
}
|
|
try (Statement stmt = conn.createStatement()) {
|
|
stmt.execute("COPY t TO '" + escapePath(tmp.toAbsolutePath().toString()) + "' (FORMAT PARQUET, COMPRESSION ZSTD)");
|
|
}
|
|
Files.move(tmp, outPath, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
|
|
}
|
|
|
|
/** Create a new in-memory DuckDB connection (for use as a per-thread reusable connection). */
|
|
static DuckDBConnection connect() throws Exception {
|
|
return (DuckDBConnection) DriverManager.getConnection("jdbc:duckdb:");
|
|
}
|
|
|
|
private static void copyToParquet(Statement stmt, String query, Path outPath) throws Exception {
|
|
stmt.execute("COPY (" + query + ") TO '" + escapePath(outPath.toAbsolutePath().toString())
|
|
+ "' (FORMAT PARQUET, COMPRESSION ZSTD)");
|
|
}
|
|
}
|