Fmt
This commit is contained in:
parent
479ef92236
commit
c38d654ac7
44 changed files with 2526 additions and 701 deletions
|
|
@ -40,7 +40,9 @@ def download_and_convert(output_path: Path) -> None:
|
|||
df = pl.concat(frames)
|
||||
print(f"Total rows: {df.height}")
|
||||
|
||||
result = df.rename({"GEOGRAPHY_CODE": "lsoa21", "OBS_VALUE": "population"}).with_columns(
|
||||
result = df.rename(
|
||||
{"GEOGRAPHY_CODE": "lsoa21", "OBS_VALUE": "population"}
|
||||
).with_columns(
|
||||
pl.col("population").cast(pl.UInt32),
|
||||
)
|
||||
|
||||
|
|
@ -48,7 +50,9 @@ def download_and_convert(output_path: Path) -> None:
|
|||
result = result.filter(pl.col("lsoa21").str.starts_with("E"))
|
||||
|
||||
print(f"England LSOAs: {result.height}")
|
||||
print(f"Population range: {result['population'].min()} - {result['population'].max()}")
|
||||
print(
|
||||
f"Population range: {result['population'].min()} - {result['population'].max()}"
|
||||
)
|
||||
print(f"Mean population: {result['population'].mean():.0f}")
|
||||
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
|
|
|||
|
|
@ -119,7 +119,11 @@ class PlaceHandler(osmium.SimpleHandler):
|
|||
station_tag = tags.get("station", "")
|
||||
network = tags.get("network", "").lower()
|
||||
# Skip tram stops
|
||||
if station_tag == "light_rail" or "tramlink" in network or "tram" in network:
|
||||
if (
|
||||
station_tag == "light_rail"
|
||||
or "tramlink" in network
|
||||
or "tram" in network
|
||||
):
|
||||
return
|
||||
display_name = _station_display_name(name, tags)
|
||||
self._add(display_name, "station", lat, lon, population)
|
||||
|
|
@ -131,9 +135,7 @@ def main() -> None:
|
|||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output parquet file path"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
|
||||
)
|
||||
parser.add_argument("--pbf", type=Path, required=True, help="Path to OSM PBF file")
|
||||
parser.add_argument(
|
||||
"--boundary",
|
||||
type=Path,
|
||||
|
|
|
|||
|
|
@ -111,9 +111,7 @@ def main() -> None:
|
|||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output parquet file path"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
|
||||
)
|
||||
parser.add_argument("--pbf", type=Path, required=True, help="Path to OSM PBF file")
|
||||
parser.add_argument(
|
||||
"--boundary",
|
||||
type=Path,
|
||||
|
|
|
|||
|
|
@ -99,10 +99,14 @@ def convert_to_parquet(xls_path: Path, parquet_path: Path) -> None:
|
|||
combined = pl.concat(frames)
|
||||
|
||||
# Remap old LA codes to new unitary authority codes and average medians
|
||||
combined = combined.with_columns(
|
||||
pl.col("area_code").replace(LA_CONSOLIDATION),
|
||||
).group_by("area_code", "bedrooms").agg(
|
||||
pl.col("median_monthly_rent").mean(),
|
||||
combined = (
|
||||
combined.with_columns(
|
||||
pl.col("area_code").replace(LA_CONSOLIDATION),
|
||||
)
|
||||
.group_by("area_code", "bedrooms")
|
||||
.agg(
|
||||
pl.col("median_monthly_rent").mean(),
|
||||
)
|
||||
)
|
||||
|
||||
print(f"Combined: {combined.shape}")
|
||||
|
|
|
|||
|
|
@ -13,9 +13,7 @@ TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
|
|||
|
||||
def fetch_outcode_ids(postcodes_path: Path, output: Path) -> None:
|
||||
df = pl.read_parquet(postcodes_path, columns=["Postcode"])
|
||||
outcodes = sorted(
|
||||
set(df["Postcode"].str.split(" ").list.first().to_list()) - {""}
|
||||
)
|
||||
outcodes = sorted(set(df["Postcode"].str.split(" ").list.first().to_list()) - {""})
|
||||
print(f"Querying Rightmove typeahead for {len(outcodes)} outcodes...")
|
||||
|
||||
mapping: dict[str, str] = {}
|
||||
|
|
@ -28,11 +26,9 @@ def fetch_outcode_ids(postcodes_path: Path, output: Path) -> None:
|
|||
data = resp.json()
|
||||
found = False
|
||||
for m in data.get("matches", []):
|
||||
if (
|
||||
m["type"] == "OUTCODE"
|
||||
and m["displayName"].upper().replace(" ", "")
|
||||
== oc.upper().replace(" ", "")
|
||||
):
|
||||
if m["type"] == "OUTCODE" and m["displayName"].upper().replace(
|
||||
" ", ""
|
||||
) == oc.upper().replace(" ", ""):
|
||||
mapping[oc] = str(m["id"])
|
||||
found = True
|
||||
break
|
||||
|
|
@ -57,9 +53,7 @@ def fetch_outcode_ids(postcodes_path: Path, output: Path) -> None:
|
|||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Fetch Rightmove outcode ID mapping"
|
||||
)
|
||||
parser = argparse.ArgumentParser(description="Fetch Rightmove outcode ID mapping")
|
||||
parser.add_argument(
|
||||
"--postcodes", type=Path, required=True, help="postcode.parquet path"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -64,7 +64,9 @@ def ensure_pmtiles_cli(bin_path: Path, version: str) -> None:
|
|||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--output", type=Path, required=True, help="Output .pmtiles path")
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output .pmtiles path"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pmtiles-version", default="1.22.3", help="go-pmtiles release version"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -56,7 +56,9 @@ NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/time
|
|||
USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"
|
||||
|
||||
|
||||
def _download_http(url: str, dest: Path, *, desc: str, headers: dict | None = None) -> None:
|
||||
def _download_http(
|
||||
url: str, dest: Path, *, desc: str, headers: dict | None = None
|
||||
) -> None:
|
||||
"""Stream-download a URL to a file with progress bar."""
|
||||
dest.parent.mkdir(parents=True, exist_ok=True)
|
||||
tmp = dest.with_suffix(dest.suffix + ".tmp")
|
||||
|
|
@ -117,9 +119,10 @@ def clean_gtfs(src: Path, dst: Path) -> None:
|
|||
return
|
||||
|
||||
print("Cleaning GTFS for R5 compatibility...")
|
||||
with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(
|
||||
dst, "w", zipfile.ZIP_DEFLATED
|
||||
) as zout:
|
||||
with (
|
||||
zipfile.ZipFile(src, "r") as zin,
|
||||
zipfile.ZipFile(dst, "w", zipfile.ZIP_DEFLATED) as zout,
|
||||
):
|
||||
for info in zin.infolist():
|
||||
if info.filename == "stop_times.txt":
|
||||
dropped = 0
|
||||
|
|
@ -127,7 +130,9 @@ def clean_gtfs(src: Path, dst: Path) -> None:
|
|||
header = f.readline()
|
||||
header_str = header.decode("utf-8").strip()
|
||||
cols = header_str.split(",")
|
||||
arr_idx = cols.index("arrival_time") if "arrival_time" in cols else -1
|
||||
arr_idx = (
|
||||
cols.index("arrival_time") if "arrival_time" in cols else -1
|
||||
)
|
||||
dep_idx = (
|
||||
cols.index("departure_time") if "departure_time" in cols else -1
|
||||
)
|
||||
|
|
@ -179,7 +184,9 @@ def clean_gtfs(src: Path, dst: Path) -> None:
|
|||
year = int(date_val[:4])
|
||||
if year > 2100:
|
||||
parts[i] = "20991231"
|
||||
print(f" feed_info: capped end_date {date_val} → 20991231")
|
||||
print(
|
||||
f" feed_info: capped end_date {date_val} → 20991231"
|
||||
)
|
||||
fixed_lines.append(",".join(parts))
|
||||
zout.writestr("feed_info.txt", "\n".join(fixed_lines) + "\n")
|
||||
else:
|
||||
|
|
@ -334,7 +341,9 @@ def convert_high_freq_to_frequency_based(
|
|||
end_secs = trips[-1][1] + int(median_hw)
|
||||
headway_rounded = max(60, round(median_hw / 60) * 60)
|
||||
|
||||
frequency_entries.append((template_trip_id, start_secs, end_secs, headway_rounded))
|
||||
frequency_entries.append(
|
||||
(template_trip_id, start_secs, end_secs, headway_rounded)
|
||||
)
|
||||
for trip_id, _ in trips[1:]:
|
||||
trips_to_remove.add(trip_id)
|
||||
groups_converted += 1
|
||||
|
|
@ -344,9 +353,10 @@ def convert_high_freq_to_frequency_based(
|
|||
print(f" Created {len(frequency_entries)} frequency entries")
|
||||
|
||||
# Step 5: Write modified GTFS
|
||||
with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(
|
||||
dst, "w", zipfile.ZIP_DEFLATED
|
||||
) as zout:
|
||||
with (
|
||||
zipfile.ZipFile(src, "r") as zin,
|
||||
zipfile.ZipFile(dst, "w", zipfile.ZIP_DEFLATED) as zout,
|
||||
):
|
||||
for info in zin.infolist():
|
||||
if info.filename == "trips.txt":
|
||||
with zin.open(info) as f:
|
||||
|
|
@ -466,15 +476,22 @@ def download_national_rail_cif(raw_dir: Path) -> Path | None:
|
|||
email = os.environ.get("NATIONAL_RAIL_EMAIL")
|
||||
password = os.environ.get("NATIONAL_RAIL_PASSWORD")
|
||||
if not email or not password:
|
||||
print("Warning: NATIONAL_RAIL_EMAIL/NATIONAL_RAIL_PASSWORD not set, skipping national rail")
|
||||
print(
|
||||
"Warning: NATIONAL_RAIL_EMAIL/NATIONAL_RAIL_PASSWORD not set, skipping national rail"
|
||||
)
|
||||
return None
|
||||
|
||||
print("Authenticating with National Rail Open Data...")
|
||||
auth_data = urllib.parse.urlencode({"username": email, "password": password}).encode()
|
||||
auth_data = urllib.parse.urlencode(
|
||||
{"username": email, "password": password}
|
||||
).encode()
|
||||
auth_req = urllib.request.Request(
|
||||
NR_AUTH_URL,
|
||||
data=auth_data,
|
||||
headers={"User-Agent": USER_AGENT, "Content-Type": "application/x-www-form-urlencoded"},
|
||||
headers={
|
||||
"User-Agent": USER_AGENT,
|
||||
"Content-Type": "application/x-www-form-urlencoded",
|
||||
},
|
||||
)
|
||||
with urllib.request.urlopen(auth_req) as resp:
|
||||
token_data = json.loads(resp.read())
|
||||
|
|
@ -565,9 +582,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
|
|||
coords_fixed = 0
|
||||
route_types_fixed = 0
|
||||
|
||||
with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(
|
||||
dst, "w", zipfile.ZIP_DEFLATED
|
||||
) as zout:
|
||||
with (
|
||||
zipfile.ZipFile(src, "r") as zin,
|
||||
zipfile.ZipFile(dst, "w", zipfile.ZIP_DEFLATED) as zout,
|
||||
):
|
||||
for info in zin.infolist():
|
||||
# Skip non-standard links.txt
|
||||
if info.filename == "links.txt":
|
||||
|
|
@ -581,8 +599,12 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
|
|||
trip_id_idx = cols.index("trip_id")
|
||||
stop_id_idx = cols.index("stop_id")
|
||||
seq_idx = cols.index("stop_sequence")
|
||||
pickup_idx = cols.index("pickup_type") if "pickup_type" in cols else -1
|
||||
dropoff_idx = cols.index("drop_off_type") if "drop_off_type" in cols else -1
|
||||
pickup_idx = (
|
||||
cols.index("pickup_type") if "pickup_type" in cols else -1
|
||||
)
|
||||
dropoff_idx = (
|
||||
cols.index("drop_off_type") if "drop_off_type" in cols else -1
|
||||
)
|
||||
|
||||
tmp = tempfile.NamedTemporaryFile(
|
||||
mode="wb", delete=False, suffix=".txt"
|
||||
|
|
@ -769,16 +791,27 @@ def _docker_run_dtd2mysql(
|
|||
) -> None:
|
||||
"""Run dtd2mysql in a Node.js container on the same Docker network as MariaDB."""
|
||||
cmd = [
|
||||
"docker", "run", "--rm", "--network", network,
|
||||
"-e", f"DATABASE_HOSTNAME={db_container}",
|
||||
"-e", "DATABASE_USERNAME=root",
|
||||
"-e", "DATABASE_PASSWORD=root",
|
||||
"-e", "DATABASE_NAME=dtd",
|
||||
"docker",
|
||||
"run",
|
||||
"--rm",
|
||||
"--network",
|
||||
network,
|
||||
"-e",
|
||||
f"DATABASE_HOSTNAME={db_container}",
|
||||
"-e",
|
||||
"DATABASE_USERNAME=root",
|
||||
"-e",
|
||||
"DATABASE_PASSWORD=root",
|
||||
"-e",
|
||||
"DATABASE_NAME=dtd",
|
||||
]
|
||||
for v in volumes:
|
||||
cmd.extend(["-v", v])
|
||||
# Install zip (needed for --gtfs-zip) then run dtd2mysql
|
||||
inner = "apt-get update -qq && apt-get install -y -qq zip > /dev/null 2>&1 && npx --yes dtd2mysql " + " ".join(args)
|
||||
inner = (
|
||||
"apt-get update -qq && apt-get install -y -qq zip > /dev/null 2>&1 && npx --yes dtd2mysql "
|
||||
+ " ".join(args)
|
||||
)
|
||||
cmd.extend(["node:20", "bash", "-c", inner])
|
||||
subprocess.run(cmd, check=True)
|
||||
|
||||
|
|
@ -805,11 +838,17 @@ def convert_national_rail_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
|
|||
subprocess.run(["docker", "network", "create", network], capture_output=True)
|
||||
subprocess.run(
|
||||
[
|
||||
"docker", "run", "-d",
|
||||
"--name", db_container,
|
||||
"--network", network,
|
||||
"-e", "MARIADB_ROOT_PASSWORD=root",
|
||||
"-e", "MARIADB_DATABASE=dtd",
|
||||
"docker",
|
||||
"run",
|
||||
"-d",
|
||||
"--name",
|
||||
db_container,
|
||||
"--network",
|
||||
network,
|
||||
"-e",
|
||||
"MARIADB_ROOT_PASSWORD=root",
|
||||
"-e",
|
||||
"MARIADB_DATABASE=dtd",
|
||||
"mariadb:latest",
|
||||
],
|
||||
check=True,
|
||||
|
|
@ -820,7 +859,16 @@ def convert_national_rail_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
|
|||
print(" Waiting for MariaDB to be ready...")
|
||||
for attempt in range(30):
|
||||
result = subprocess.run(
|
||||
["docker", "exec", db_container, "mariadb", "-uroot", "-proot", "-e", "SELECT 1"],
|
||||
[
|
||||
"docker",
|
||||
"exec",
|
||||
db_container,
|
||||
"mariadb",
|
||||
"-uroot",
|
||||
"-proot",
|
||||
"-e",
|
||||
"SELECT 1",
|
||||
],
|
||||
capture_output=True,
|
||||
)
|
||||
if result.returncode == 0:
|
||||
|
|
@ -833,14 +881,16 @@ def convert_national_rail_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
|
|||
|
||||
print("Importing CIF timetable into MariaDB...")
|
||||
_docker_run_dtd2mysql(
|
||||
network, db_container,
|
||||
network,
|
||||
db_container,
|
||||
volumes=[f"{raw_abs}:/data:ro"],
|
||||
args=["--timetable", "/data/national_rail_cif.zip"],
|
||||
)
|
||||
|
||||
print("Exporting GTFS from MariaDB...")
|
||||
_docker_run_dtd2mysql(
|
||||
network, db_container,
|
||||
network,
|
||||
db_container,
|
||||
volumes=[f"{raw_abs}:/output"],
|
||||
args=["--gtfs-zip", "/output/national_rail_gtfs_raw.zip"],
|
||||
)
|
||||
|
|
|
|||
|
|
@ -94,11 +94,18 @@ def _build(
|
|||
|
||||
# Remap terminated postcodes to nearest active successor
|
||||
postcode_mapping = build_postcode_mapping(arcgis_path)
|
||||
wide = wide.join(
|
||||
postcode_mapping.lazy(), left_on="postcode", right_on="old_postcode", how="left"
|
||||
).with_columns(
|
||||
pl.coalesce("new_postcode", "postcode").alias("postcode"),
|
||||
).drop("new_postcode")
|
||||
wide = (
|
||||
wide.join(
|
||||
postcode_mapping.lazy(),
|
||||
left_on="postcode",
|
||||
right_on="old_postcode",
|
||||
how="left",
|
||||
)
|
||||
.with_columns(
|
||||
pl.coalesce("new_postcode", "postcode").alias("postcode"),
|
||||
)
|
||||
.drop("new_postcode")
|
||||
)
|
||||
|
||||
arcgis = (
|
||||
pl.scan_parquet(arcgis_path)
|
||||
|
|
@ -252,16 +259,18 @@ def _build(
|
|||
.otherwise(pl.col("pp_property_type"))
|
||||
# Unify EPC's "Flat"/"Maisonette" with price-paid's "Flats/Maisonettes",
|
||||
# collapse terrace sub-types, and fold rare types into "Other"
|
||||
.replace({
|
||||
"Flat": "Flats/Maisonettes",
|
||||
"Maisonette": "Flats/Maisonettes",
|
||||
"End-Terrace": "Terraced",
|
||||
"Mid-Terrace": "Terraced",
|
||||
"Enclosed End-Terrace": "Terraced",
|
||||
"Enclosed Mid-Terrace": "Terraced",
|
||||
"Bungalow": "Other",
|
||||
"Park home": "Other",
|
||||
})
|
||||
.replace(
|
||||
{
|
||||
"Flat": "Flats/Maisonettes",
|
||||
"Maisonette": "Flats/Maisonettes",
|
||||
"End-Terrace": "Terraced",
|
||||
"Mid-Terrace": "Terraced",
|
||||
"Enclosed End-Terrace": "Terraced",
|
||||
"Enclosed Mid-Terrace": "Terraced",
|
||||
"Bungalow": "Other",
|
||||
"Park home": "Other",
|
||||
}
|
||||
)
|
||||
.alias("property_type")
|
||||
)
|
||||
|
||||
|
|
@ -426,10 +435,16 @@ def main():
|
|||
help="Census 2021 population by LSOA parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-postcodes", type=Path, required=True, help="Output postcode parquet file path"
|
||||
"--output-postcodes",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Output postcode parquet file path",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-properties", type=Path, required=True, help="Output properties parquet file path"
|
||||
"--output-properties",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Output properties parquet file path",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
|
|
|||
|
|
@ -454,9 +454,7 @@ class TestFillHoles:
|
|||
hole1 = [(10, 10), (20, 10), (20, 20), (10, 20), (10, 10)]
|
||||
outer2 = [(60, 60), (110, 60), (110, 110), (60, 110), (60, 60)]
|
||||
hole2 = [(70, 70), (80, 70), (80, 80), (70, 80), (70, 70)]
|
||||
mp = MultiPolygon(
|
||||
[Polygon(outer1, [hole1]), Polygon(outer2, [hole2])]
|
||||
)
|
||||
mp = MultiPolygon([Polygon(outer1, [hole1]), Polygon(outer2, [hole2])])
|
||||
result = _fill_holes(mp)
|
||||
assert result.geom_type == "MultiPolygon"
|
||||
for p in result.geoms:
|
||||
|
|
|
|||
|
|
@ -112,7 +112,9 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
|||
|
||||
|
||||
def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
|
||||
valid = np.isfinite(predicted) & np.isfinite(actual) & (actual > 0) & (predicted > 0)
|
||||
valid = (
|
||||
np.isfinite(predicted) & np.isfinite(actual) & (actual > 0) & (predicted > 0)
|
||||
)
|
||||
actual = actual[valid]
|
||||
predicted = predicted[valid]
|
||||
|
||||
|
|
@ -176,7 +178,10 @@ def main():
|
|||
"--input", type=Path, required=True, help="Path to properties.parquet"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--postcodes", type=Path, required=True, help="Path to postcode.parquet (for lat/lon)"
|
||||
"--postcodes",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Path to postcode.parquet (for lat/lon)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output backtest_results.parquet"
|
||||
|
|
@ -185,7 +190,9 @@ def main():
|
|||
|
||||
# Build index from pre-test data only (temporal holdout)
|
||||
print(f"Building price index (pairs with year2 < {TEST_YEAR_MIN})...")
|
||||
index = build_index(args.input, max_pair_year=TEST_YEAR_MIN, postcodes_path=args.postcodes)
|
||||
index = build_index(
|
||||
args.input, max_pair_year=TEST_YEAR_MIN, postcodes_path=args.postcodes
|
||||
)
|
||||
print(
|
||||
f"\nHoldout index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
|
||||
f"{index['type_group'].n_unique()} type groups"
|
||||
|
|
@ -233,7 +240,9 @@ def main():
|
|||
knn_est = knn_psm * fa * temporal_adj
|
||||
|
||||
n_knn = int((np.isfinite(knn_est) & (knn_est > 0)).sum())
|
||||
print(f" kNN estimates: {n_knn:,} of {len(test):,} ({n_knn / len(test) * 100:.1f}%)")
|
||||
print(
|
||||
f" kNN estimates: {n_knn:,} of {len(test):,} ({n_knn / len(test) * 100:.1f}%)"
|
||||
)
|
||||
|
||||
# Blend: (1-w)*index + w*kNN where both available
|
||||
index_est = test["predicted"].to_numpy().astype(np.float64)
|
||||
|
|
|
|||
|
|
@ -107,9 +107,7 @@ def main():
|
|||
pl.when(has_price)
|
||||
.then(
|
||||
pl.col("Last known price").cast(pl.Float64)
|
||||
* (
|
||||
pl.col("_log_index_current_interp") - pl.col("_log_index_sale_interp")
|
||||
)
|
||||
* (pl.col("_log_index_current_interp") - pl.col("_log_index_sale_interp"))
|
||||
.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
|
||||
.exp()
|
||||
)
|
||||
|
|
|
|||
|
|
@ -105,9 +105,7 @@ def extract_pairs(input_path: Path, max_year2: int | None = None) -> pl.DataFram
|
|||
.alias("log_ratio"),
|
||||
(
|
||||
1.0
|
||||
/ (pl.col("frac_year2") - pl.col("frac_year1"))
|
||||
.cast(pl.Float64)
|
||||
.sqrt()
|
||||
/ (pl.col("frac_year2") - pl.col("frac_year1")).cast(pl.Float64).sqrt()
|
||||
).alias("weight"),
|
||||
)
|
||||
.filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
|
||||
|
|
@ -453,8 +451,12 @@ def main():
|
|||
description="Build improved repeat-sales price index"
|
||||
)
|
||||
parser.add_argument("--input", type=Path, required=True)
|
||||
parser.add_argument("--postcodes", type=Path, required=True,
|
||||
help="Path to postcode.parquet (for lat/lon centroids)")
|
||||
parser.add_argument(
|
||||
"--postcodes",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Path to postcode.parquet (for lat/lon centroids)",
|
||||
)
|
||||
parser.add_argument("--output", type=Path, required=True)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
|
|
|||
|
|
@ -43,48 +43,39 @@ def build_knn_pool(
|
|||
"""
|
||||
print("Building kNN pool...")
|
||||
lf = pl.scan_parquet(source) if isinstance(source, Path) else source
|
||||
query = (
|
||||
lf
|
||||
.select(
|
||||
"Postcode",
|
||||
"Property type",
|
||||
"lat",
|
||||
"lon",
|
||||
"Total floor area (sqm)",
|
||||
"Last known price",
|
||||
"Date of last transaction",
|
||||
)
|
||||
.filter(
|
||||
pl.col("lat").is_not_null(),
|
||||
pl.col("lon").is_not_null(),
|
||||
pl.col("Total floor area (sqm)").is_not_null(),
|
||||
pl.col("Total floor area (sqm)") > 0,
|
||||
pl.col("Last known price").is_not_null(),
|
||||
pl.col("Last known price") > 0,
|
||||
pl.col("Postcode").is_not_null(),
|
||||
pl.col("Date of last transaction").is_not_null(),
|
||||
)
|
||||
query = lf.select(
|
||||
"Postcode",
|
||||
"Property type",
|
||||
"lat",
|
||||
"lon",
|
||||
"Total floor area (sqm)",
|
||||
"Last known price",
|
||||
"Date of last transaction",
|
||||
).filter(
|
||||
pl.col("lat").is_not_null(),
|
||||
pl.col("lon").is_not_null(),
|
||||
pl.col("Total floor area (sqm)").is_not_null(),
|
||||
pl.col("Total floor area (sqm)") > 0,
|
||||
pl.col("Last known price").is_not_null(),
|
||||
pl.col("Last known price") > 0,
|
||||
pl.col("Postcode").is_not_null(),
|
||||
pl.col("Date of last transaction").is_not_null(),
|
||||
)
|
||||
if max_sale_year is not None:
|
||||
query = query.filter(
|
||||
pl.col("Date of last transaction").dt.year() < max_sale_year
|
||||
)
|
||||
|
||||
pool = (
|
||||
query.with_columns(
|
||||
sector_expr(),
|
||||
type_group_expr(),
|
||||
(
|
||||
pl.col("Date of last transaction").dt.year().cast(pl.Float64)
|
||||
+ (
|
||||
pl.col("Date of last transaction").dt.month().cast(pl.Float64)
|
||||
- 1.0
|
||||
)
|
||||
/ 12.0
|
||||
).alias("_sale_fy"),
|
||||
pl.lit(ref_frac_year).alias("_ref_fy"),
|
||||
).collect()
|
||||
)
|
||||
pool = query.with_columns(
|
||||
sector_expr(),
|
||||
type_group_expr(),
|
||||
(
|
||||
pl.col("Date of last transaction").dt.year().cast(pl.Float64)
|
||||
+ (pl.col("Date of last transaction").dt.month().cast(pl.Float64) - 1.0)
|
||||
/ 12.0
|
||||
).alias("_sale_fy"),
|
||||
pl.lit(ref_frac_year).alias("_ref_fy"),
|
||||
).collect()
|
||||
pool = pool.filter(pl.col("type_group").is_not_null())
|
||||
print(f" {len(pool):,} pool properties with lat/lon, floor area, price")
|
||||
|
||||
|
|
|
|||
|
|
@ -1085,7 +1085,9 @@ def transform(
|
|||
if cat not in all_set:
|
||||
mapped_but_absent.append(cat)
|
||||
if mapped_but_absent:
|
||||
print(f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}")
|
||||
print(
|
||||
f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}"
|
||||
)
|
||||
|
||||
# Drop unwanted categories
|
||||
lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))
|
||||
|
|
|
|||
|
|
@ -27,7 +27,9 @@ def load_england_polygon(geojson_path: Path) -> PreparedGeometry:
|
|||
return prep(geometry)
|
||||
|
||||
|
||||
def in_england_mask(geojson_path: Path, lats: np.ndarray, lngs: np.ndarray) -> np.ndarray:
|
||||
def in_england_mask(
|
||||
geojson_path: Path, lats: np.ndarray, lngs: np.ndarray
|
||||
) -> np.ndarray:
|
||||
"""Vectorized check: which (lat, lng) points are within England.
|
||||
|
||||
Returns a boolean numpy array.
|
||||
|
|
|
|||
|
|
@ -106,7 +106,9 @@ def count_pois_per_postcode(
|
|||
if nearby is None:
|
||||
continue
|
||||
|
||||
distances = haversine_km(poi_lats[nearby], poi_lngs[nearby], pc_lats[i], pc_lons[i])
|
||||
distances = haversine_km(
|
||||
poi_lats[nearby], poi_lngs[nearby], pc_lats[i], pc_lons[i]
|
||||
)
|
||||
|
||||
within_mask = distances <= radius_km
|
||||
within_indices = nearby[within_mask]
|
||||
|
|
@ -179,7 +181,9 @@ def min_distance_per_postcode(
|
|||
if nearby is None:
|
||||
continue
|
||||
|
||||
distances = haversine_km(poi_lats[nearby], poi_lngs[nearby], pc_lats[i], pc_lons[i])
|
||||
distances = haversine_km(
|
||||
poi_lats[nearby], poi_lngs[nearby], pc_lats[i], pc_lons[i]
|
||||
)
|
||||
|
||||
for group, cat_mask in category_masks.items():
|
||||
group_mask = cat_mask[nearby]
|
||||
|
|
|
|||
|
|
@ -15,26 +15,49 @@ def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
|
|||
"""
|
||||
arcgis = pl.scan_parquet(arcgis_path).filter(pl.col("ctry") == "E92000001")
|
||||
|
||||
active = arcgis.filter(pl.col("doterm").is_null()).select("pcds", "oseast1m", "osnrth1m").collect()
|
||||
terminated = arcgis.filter(pl.col("doterm").is_not_null()).select("pcds", "oseast1m", "osnrth1m").collect()
|
||||
active = (
|
||||
arcgis.filter(pl.col("doterm").is_null())
|
||||
.select("pcds", "oseast1m", "osnrth1m")
|
||||
.collect()
|
||||
)
|
||||
terminated = (
|
||||
arcgis.filter(pl.col("doterm").is_not_null())
|
||||
.select("pcds", "oseast1m", "osnrth1m")
|
||||
.collect()
|
||||
)
|
||||
|
||||
print(f"Active postcodes: {active.height}, terminated postcodes: {terminated.height}")
|
||||
print(
|
||||
f"Active postcodes: {active.height}, terminated postcodes: {terminated.height}"
|
||||
)
|
||||
|
||||
if terminated.height == 0:
|
||||
return pl.DataFrame({"old_postcode": pl.Series([], dtype=pl.Utf8), "new_postcode": pl.Series([], dtype=pl.Utf8)})
|
||||
return pl.DataFrame(
|
||||
{
|
||||
"old_postcode": pl.Series([], dtype=pl.Utf8),
|
||||
"new_postcode": pl.Series([], dtype=pl.Utf8),
|
||||
}
|
||||
)
|
||||
|
||||
active_coords = np.column_stack([active["oseast1m"].to_numpy(), active["osnrth1m"].to_numpy()])
|
||||
terminated_coords = np.column_stack([terminated["oseast1m"].to_numpy(), terminated["osnrth1m"].to_numpy()])
|
||||
active_coords = np.column_stack(
|
||||
[active["oseast1m"].to_numpy(), active["osnrth1m"].to_numpy()]
|
||||
)
|
||||
terminated_coords = np.column_stack(
|
||||
[terminated["oseast1m"].to_numpy(), terminated["osnrth1m"].to_numpy()]
|
||||
)
|
||||
|
||||
tree = cKDTree(active_coords)
|
||||
distances, indices = tree.query(terminated_coords)
|
||||
|
||||
active_postcodes = active["pcds"]
|
||||
mapping = pl.DataFrame({
|
||||
"old_postcode": terminated["pcds"],
|
||||
"new_postcode": active_postcodes.gather(indices),
|
||||
})
|
||||
mapping = pl.DataFrame(
|
||||
{
|
||||
"old_postcode": terminated["pcds"],
|
||||
"new_postcode": active_postcodes.gather(indices),
|
||||
}
|
||||
)
|
||||
|
||||
print(f"Postcode mapping: max distance = {distances.max():.0f}m, median = {np.median(distances):.0f}m")
|
||||
print(
|
||||
f"Postcode mapping: max distance = {distances.max():.0f}m, median = {np.median(distances):.0f}m"
|
||||
)
|
||||
|
||||
return mapping
|
||||
|
|
|
|||
|
|
@ -72,7 +72,9 @@ def test_no_pois_returns_zeros(postcodes):
|
|||
"category": pl.Series([], dtype=pl.String),
|
||||
}
|
||||
)
|
||||
result = count_pois_per_postcode(postcodes, empty_pois, groups=POI_GROUPS, radius_km=2.0)
|
||||
result = count_pois_per_postcode(
|
||||
postcodes, empty_pois, groups=POI_GROUPS, radius_km=2.0
|
||||
)
|
||||
|
||||
for group in POI_GROUPS:
|
||||
col = f"{group}_2km"
|
||||
|
|
@ -125,7 +127,9 @@ def test_min_distance_no_pois_returns_nan(postcodes):
|
|||
"category": pl.Series([], dtype=pl.String),
|
||||
}
|
||||
)
|
||||
result = min_distance_per_postcode(postcodes, empty_pois, groups={"train_tube": ["Rail station"]})
|
||||
result = min_distance_per_postcode(
|
||||
postcodes, empty_pois, groups={"train_tube": ["Rail station"]}
|
||||
)
|
||||
|
||||
assert "train_tube_nearest_km" in result.columns
|
||||
assert all(np.isnan(v) for v in result["train_tube_nearest_km"].to_list())
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue