112 lines
4.7 KiB
Java
112 lines
4.7 KiB
Java
package propertymap;
|
|
|
|
import org.duckdb.DuckDBAppender;
|
|
import org.duckdb.DuckDBConnection;
|
|
|
|
import java.nio.file.Files;
|
|
import java.nio.file.Path;
|
|
import java.nio.file.StandardCopyOption;
|
|
import java.sql.DriverManager;
|
|
import java.sql.ResultSet;
|
|
import java.sql.Statement;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
/** DuckDB-based parquet I/O. */
|
|
public class Parquet {
|
|
|
|
record Postcodes(String[] codes, double[] lats, double[] lons) {}
|
|
|
|
static {
|
|
try { Class.forName("org.duckdb.DuckDBDriver"); }
|
|
catch (ClassNotFoundException e) { throw new RuntimeException(e); }
|
|
}
|
|
|
|
/** Load England postcodes, write reference parquet, return codes + flat lat/lon arrays. */
|
|
static Postcodes loadEnglandPostcodes(String parquetPath, Path refOut) throws Exception {
|
|
try (DuckDBConnection conn = connect(); Statement stmt = conn.createStatement()) {
|
|
stmt.execute("CREATE TABLE postcodes AS SELECT pcds, lat, \"long\" FROM read_parquet('"
|
|
+ parquetPath + "') WHERE ctry = 'E92000001'");
|
|
copyToParquet(stmt, "SELECT * FROM postcodes", refOut);
|
|
|
|
try (ResultSet rs = stmt.executeQuery("SELECT COUNT(*) FROM postcodes")) {
|
|
rs.next();
|
|
int n = rs.getInt(1);
|
|
String[] codes = new String[n];
|
|
double[] lats = new double[n];
|
|
double[] lons = new double[n];
|
|
|
|
try (ResultSet data = stmt.executeQuery("SELECT pcds, lat, \"long\" FROM postcodes")) {
|
|
int i = 0;
|
|
while (data.next()) {
|
|
codes[i] = data.getString(1);
|
|
lats[i] = data.getDouble(2);
|
|
lons[i] = data.getDouble(3);
|
|
i++;
|
|
}
|
|
}
|
|
return new Postcodes(codes, lats, lons);
|
|
}
|
|
}
|
|
}
|
|
|
|
/** Load places deduplicated by lat/lon, write reference parquet, return flat lat/lon arrays. */
|
|
static double[][] loadPlaces(String parquetPath, Path refOut) throws Exception {
|
|
try (DuckDBConnection conn = connect(); Statement stmt = conn.createStatement()) {
|
|
stmt.execute("CREATE TABLE places AS SELECT * EXCLUDE (rn) FROM ("
|
|
+ "SELECT *, ROW_NUMBER() OVER (PARTITION BY lat, lon) AS rn "
|
|
+ "FROM read_parquet('" + parquetPath + "')) WHERE rn = 1");
|
|
copyToParquet(stmt, "SELECT * FROM places", refOut);
|
|
|
|
try (ResultSet rs = stmt.executeQuery("SELECT COUNT(*) FROM places")) {
|
|
rs.next();
|
|
int n = rs.getInt(1);
|
|
// Return as [lats, lons] flat arrays
|
|
double[] lats = new double[n];
|
|
double[] lons = new double[n];
|
|
|
|
try (ResultSet data = stmt.executeQuery("SELECT lat, lon FROM places")) {
|
|
int i = 0;
|
|
while (data.next()) {
|
|
lats[i] = data.getDouble(1);
|
|
lons[i] = data.getDouble(2);
|
|
i++;
|
|
}
|
|
}
|
|
return new double[][]{lats, lons};
|
|
}
|
|
}
|
|
}
|
|
|
|
/** Write postcode travel times as a ZSTD-compressed parquet (atomic via tmp + rename). */
|
|
static void writeTravelTimes(DuckDBConnection conn, Path outPath, String[] postcodes, short[] times)
|
|
throws Exception {
|
|
Path tmp = outPath.resolveSibling(outPath.getFileName() + ".tmp");
|
|
try (Statement stmt = conn.createStatement()) {
|
|
stmt.execute("DROP TABLE IF EXISTS t");
|
|
stmt.execute("CREATE TABLE t (pcds VARCHAR, travel_minutes SMALLINT)");
|
|
}
|
|
try (DuckDBAppender appender = conn.createAppender("main", "t")) {
|
|
for (int i = 0; i < postcodes.length; i++) {
|
|
appender.beginRow();
|
|
appender.append(postcodes[i]);
|
|
appender.append(times[i]);
|
|
appender.endRow();
|
|
}
|
|
}
|
|
try (Statement stmt = conn.createStatement()) {
|
|
stmt.execute("COPY t TO '" + tmp.toAbsolutePath() + "' (FORMAT PARQUET, COMPRESSION ZSTD)");
|
|
}
|
|
Files.move(tmp, outPath, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
|
|
}
|
|
|
|
/** Create a new in-memory DuckDB connection (for use as a per-thread reusable connection). */
|
|
static DuckDBConnection connect() throws Exception {
|
|
return (DuckDBConnection) DriverManager.getConnection("jdbc:duckdb:");
|
|
}
|
|
|
|
private static void copyToParquet(Statement stmt, String query, Path outPath) throws Exception {
|
|
stmt.execute("COPY (" + query + ") TO '" + outPath.toAbsolutePath()
|
|
+ "' (FORMAT PARQUET, COMPRESSION ZSTD)");
|
|
}
|
|
}
|