More
This commit is contained in:
parent
128b3191e7
commit
03445188ea
54 changed files with 596953 additions and 3577 deletions
112
r5-java/src/main/java/propertymap/Parquet.java
Normal file
112
r5-java/src/main/java/propertymap/Parquet.java
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
package propertymap;
|
||||
|
||||
import org.duckdb.DuckDBAppender;
|
||||
import org.duckdb.DuckDBConnection;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.Statement;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/** DuckDB-based parquet I/O. */
|
||||
public class Parquet {
|
||||
|
||||
record Postcodes(String[] codes, double[] lats, double[] lons) {}
|
||||
|
||||
static {
|
||||
try { Class.forName("org.duckdb.DuckDBDriver"); }
|
||||
catch (ClassNotFoundException e) { throw new RuntimeException(e); }
|
||||
}
|
||||
|
||||
/** Load England postcodes, write reference parquet, return codes + flat lat/lon arrays. */
|
||||
static Postcodes loadEnglandPostcodes(String parquetPath, Path refOut) throws Exception {
|
||||
try (DuckDBConnection conn = connect(); Statement stmt = conn.createStatement()) {
|
||||
stmt.execute("CREATE TABLE postcodes AS SELECT pcds, lat, \"long\" FROM read_parquet('"
|
||||
+ parquetPath + "') WHERE ctry = 'E92000001'");
|
||||
copyToParquet(stmt, "SELECT * FROM postcodes", refOut);
|
||||
|
||||
try (ResultSet rs = stmt.executeQuery("SELECT COUNT(*) FROM postcodes")) {
|
||||
rs.next();
|
||||
int n = rs.getInt(1);
|
||||
String[] codes = new String[n];
|
||||
double[] lats = new double[n];
|
||||
double[] lons = new double[n];
|
||||
|
||||
try (ResultSet data = stmt.executeQuery("SELECT pcds, lat, \"long\" FROM postcodes")) {
|
||||
int i = 0;
|
||||
while (data.next()) {
|
||||
codes[i] = data.getString(1);
|
||||
lats[i] = data.getDouble(2);
|
||||
lons[i] = data.getDouble(3);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
return new Postcodes(codes, lats, lons);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Load places deduplicated by lat/lon, write reference parquet, return flat lat/lon arrays. */
|
||||
static double[][] loadPlaces(String parquetPath, Path refOut) throws Exception {
|
||||
try (DuckDBConnection conn = connect(); Statement stmt = conn.createStatement()) {
|
||||
stmt.execute("CREATE TABLE places AS SELECT * EXCLUDE (rn) FROM ("
|
||||
+ "SELECT *, ROW_NUMBER() OVER (PARTITION BY lat, lon) AS rn "
|
||||
+ "FROM read_parquet('" + parquetPath + "')) WHERE rn = 1");
|
||||
copyToParquet(stmt, "SELECT * FROM places", refOut);
|
||||
|
||||
try (ResultSet rs = stmt.executeQuery("SELECT COUNT(*) FROM places")) {
|
||||
rs.next();
|
||||
int n = rs.getInt(1);
|
||||
// Return as [lats, lons] flat arrays
|
||||
double[] lats = new double[n];
|
||||
double[] lons = new double[n];
|
||||
|
||||
try (ResultSet data = stmt.executeQuery("SELECT lat, lon FROM places")) {
|
||||
int i = 0;
|
||||
while (data.next()) {
|
||||
lats[i] = data.getDouble(1);
|
||||
lons[i] = data.getDouble(2);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
return new double[][]{lats, lons};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Write postcode travel times as a ZSTD-compressed parquet (atomic via tmp + rename). */
|
||||
static void writeTravelTimes(DuckDBConnection conn, Path outPath, String[] postcodes, short[] times)
|
||||
throws Exception {
|
||||
Path tmp = outPath.resolveSibling(outPath.getFileName() + ".tmp");
|
||||
try (Statement stmt = conn.createStatement()) {
|
||||
stmt.execute("DROP TABLE IF EXISTS t");
|
||||
stmt.execute("CREATE TABLE t (pcds VARCHAR, travel_minutes SMALLINT)");
|
||||
}
|
||||
try (DuckDBAppender appender = conn.createAppender("main", "t")) {
|
||||
for (int i = 0; i < postcodes.length; i++) {
|
||||
appender.beginRow();
|
||||
appender.append(postcodes[i]);
|
||||
appender.append(times[i]);
|
||||
appender.endRow();
|
||||
}
|
||||
}
|
||||
try (Statement stmt = conn.createStatement()) {
|
||||
stmt.execute("COPY t TO '" + tmp.toAbsolutePath() + "' (FORMAT PARQUET, COMPRESSION ZSTD)");
|
||||
}
|
||||
Files.move(tmp, outPath, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
|
||||
}
|
||||
|
||||
/** Create a new in-memory DuckDB connection (for use as a per-thread reusable connection). */
|
||||
static DuckDBConnection connect() throws Exception {
|
||||
return (DuckDBConnection) DriverManager.getConnection("jdbc:duckdb:");
|
||||
}
|
||||
|
||||
private static void copyToParquet(Statement stmt, String query, Path outPath) throws Exception {
|
||||
stmt.execute("COPY (" + query + ") TO '" + outPath.toAbsolutePath()
|
||||
+ "' (FORMAT PARQUET, COMPRESSION ZSTD)");
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue