This commit is contained in:
Andras Schmelczer 2026-02-15 09:48:30 +00:00
parent 128b3191e7
commit 03445188ea
54 changed files with 596953 additions and 3577 deletions

View file

@ -0,0 +1,112 @@
package propertymap;
import org.duckdb.DuckDBAppender;
import org.duckdb.DuckDBConnection;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
/**
 * DuckDB-based parquet I/O.
 *
 * <p>All helpers are static. File paths are embedded into DuckDB SQL as single-quoted string
 * literals; embedded single quotes are doubled so that a path containing {@code '} cannot break
 * (or alter) the generated statement.
 */
public class Parquet {
    /**
     * Postcodes and their coordinates as parallel arrays: index {@code i} of each array describes
     * the same row.
     */
    record Postcodes(String[] codes, double[] lats, double[] lons) {}

    static {
        // Load the driver class eagerly so DriverManager can resolve "jdbc:duckdb:" URLs.
        try {
            Class.forName("org.duckdb.DuckDBDriver");
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("DuckDB JDBC driver not found on the classpath", e);
        }
    }

    /**
     * Load England postcodes, write reference parquet, return codes + flat lat/lon arrays.
     *
     * <p>Rows are filtered with {@code ctry = 'E92000001'}; only the {@code pcds}, {@code lat} and
     * {@code long} columns are kept.
     *
     * @param parquetPath path of the source parquet file, passed to DuckDB's {@code read_parquet}
     * @param refOut destination of the ZSTD-compressed reference parquet holding the filtered rows
     * @return the filtered postcodes with their latitudes and longitudes as parallel arrays
     * @throws Exception if the database work or the parquet export fails
     */
    static Postcodes loadEnglandPostcodes(String parquetPath, Path refOut) throws Exception {
        try (DuckDBConnection conn = connect(); Statement stmt = conn.createStatement()) {
            stmt.execute("CREATE TABLE postcodes AS SELECT pcds, lat, \"long\" FROM read_parquet("
                    + sqlLiteral(parquetPath) + ") WHERE ctry = 'E92000001'");
            copyToParquet(stmt, "SELECT * FROM postcodes", refOut);

            // The count result is fully consumed and closed before the statement is reused.
            int n = countRows(stmt, "postcodes");
            String[] codes = new String[n];
            double[] lats = new double[n];
            double[] lons = new double[n];
            try (ResultSet data = stmt.executeQuery("SELECT pcds, lat, \"long\" FROM postcodes")) {
                int i = 0;
                while (data.next()) {
                    codes[i] = data.getString(1);
                    lats[i] = data.getDouble(2);
                    lons[i] = data.getDouble(3);
                    i++;
                }
            }
            return new Postcodes(codes, lats, lons);
        }
    }

    /**
     * Load places deduplicated by lat/lon, write reference parquet, return flat lat/lon arrays.
     *
     * <p>Deduplication keeps the row numbered 1 within each {@code (lat, lon)} partition. The
     * window has no {@code ORDER BY}, so the query does not specify which duplicate is kept.
     *
     * @param parquetPath path of the source parquet file, passed to DuckDB's {@code read_parquet}
     * @param refOut destination of the ZSTD-compressed reference parquet holding the deduplicated rows
     * @return a two-element array {@code {lats, lons}} of equal-length coordinate arrays
     * @throws Exception if the database work or the parquet export fails
     */
    static double[][] loadPlaces(String parquetPath, Path refOut) throws Exception {
        try (DuckDBConnection conn = connect(); Statement stmt = conn.createStatement()) {
            stmt.execute("CREATE TABLE places AS SELECT * EXCLUDE (rn) FROM ("
                    + "SELECT *, ROW_NUMBER() OVER (PARTITION BY lat, lon) AS rn "
                    + "FROM read_parquet(" + sqlLiteral(parquetPath) + ")) WHERE rn = 1");
            copyToParquet(stmt, "SELECT * FROM places", refOut);

            // The count result is fully consumed and closed before the statement is reused.
            int n = countRows(stmt, "places");
            double[] lats = new double[n];
            double[] lons = new double[n];
            try (ResultSet data = stmt.executeQuery("SELECT lat, lon FROM places")) {
                int i = 0;
                while (data.next()) {
                    lats[i] = data.getDouble(1);
                    lons[i] = data.getDouble(2);
                    i++;
                }
            }
            // Return as [lats, lons] flat arrays
            return new double[][] {lats, lons};
        }
    }

    /**
     * Write postcode travel times as a ZSTD-compressed parquet (atomic via tmp + rename).
     *
     * <p>The rows are staged in a table named {@code t} on the given connection (dropped and
     * recreated on every call), exported to {@code <outPath>.tmp} next to the target, and then
     * moved over {@code outPath}. If the export or the move fails, the temporary file is removed
     * on a best-effort basis and the original failure is rethrown.
     *
     * @param conn connection used for staging and export
     * @param outPath final location of the parquet file
     * @param postcodes postcode of each row; one row is written per element
     * @param times travel time in minutes of each row; must have at least {@code postcodes.length}
     *     elements (any extra elements are ignored)
     * @throws IllegalArgumentException if {@code times} is shorter than {@code postcodes}
     * @throws Exception if staging, export or the final move fails
     */
    static void writeTravelTimes(DuckDBConnection conn, Path outPath, String[] postcodes, short[] times)
            throws Exception {
        // Fail before touching the database instead of half-way through the append loop.
        if (times.length < postcodes.length) {
            throw new IllegalArgumentException("times has " + times.length
                    + " entries but postcodes has " + postcodes.length);
        }
        Path tmp = outPath.resolveSibling(outPath.getFileName() + ".tmp");
        try (Statement stmt = conn.createStatement()) {
            stmt.execute("DROP TABLE IF EXISTS t");
            stmt.execute("CREATE TABLE t (pcds VARCHAR, travel_minutes SMALLINT)");
        }
        try (DuckDBAppender appender = conn.createAppender("main", "t")) {
            for (int i = 0; i < postcodes.length; i++) {
                appender.beginRow();
                appender.append(postcodes[i]);
                appender.append(times[i]);
                appender.endRow();
            }
        }
        try {
            try (Statement stmt = conn.createStatement()) {
                stmt.execute("COPY t TO " + sqlLiteral(tmp.toAbsolutePath().toString())
                        + " (FORMAT PARQUET, COMPRESSION ZSTD)");
            }
            Files.move(tmp, outPath, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
        } catch (Exception e) {
            // Do not leave a partial or orphaned temp file behind; keep the original failure primary.
            try {
                Files.deleteIfExists(tmp);
            } catch (Exception cleanupFailure) {
                e.addSuppressed(cleanupFailure);
            }
            throw e;
        }
    }

    /**
     * Create a new in-memory DuckDB connection (for use as a per-thread reusable connection).
     *
     * @return a fresh connection opened with the {@code jdbc:duckdb:} URL
     * @throws Exception if the connection cannot be opened
     */
    static DuckDBConnection connect() throws Exception {
        return (DuckDBConnection) DriverManager.getConnection("jdbc:duckdb:");
    }

    /** Export the result of {@code query} to {@code outPath} as a ZSTD-compressed parquet file. */
    private static void copyToParquet(Statement stmt, String query, Path outPath) throws Exception {
        stmt.execute("COPY (" + query + ") TO " + sqlLiteral(outPath.toAbsolutePath().toString())
                + " (FORMAT PARQUET, COMPRESSION ZSTD)");
    }

    /**
     * Return the number of rows in {@code table}. The result set is closed before returning so the
     * caller can safely reuse {@code stmt}. {@code table} must be a trusted identifier.
     */
    private static int countRows(Statement stmt, String table) throws Exception {
        try (ResultSet rs = stmt.executeQuery("SELECT COUNT(*) FROM " + table)) {
            if (!rs.next()) {
                throw new IllegalStateException("COUNT(*) returned no row for table " + table);
            }
            return rs.getInt(1);
        }
    }

    /** Render {@code value} as a single-quoted SQL string literal, doubling embedded single quotes. */
    private static String sqlLiteral(String value) {
        return "'" + value.replace("'", "''") + "'";
    }
}