Improve data pipeline
This commit is contained in:
parent
e8345cbdc1
commit
f99bd4e5c9
36 changed files with 966 additions and 129 deletions
|
|
@ -352,6 +352,176 @@ def _failures_for_active_postcode_boundary_match(spec: str) -> list[str]:
|
|||
return failures
|
||||
|
||||
|
||||
def _failures_for_postcode_features(path: Path) -> list[str]:
    """Validate the postcode feature parquet at ``path``.

    Checks: unique Postcode, non-null lat/lon inside the England bbox,
    ctry25cd == E92000001 (a null country code counts as a violation), and
    every '% ' column in [0, 100]. Mirrors the in-build invariant
    (merge._validate_postcode_feature_output) so a stale/contaminated file on
    disk cannot pass `make`.

    Args:
        path: Location of the postcode feature parquet file.

    Returns:
        A list of human-readable failure messages; empty when every check
        passes. If the generic parquet check, the schema check, or the read
        itself fails, only those failures are returned and the row-level
        checks are skipped.
    """
    failures = _failures_for_parquet(path)
    if failures:
        return failures

    try:
        names = pl.scan_parquet(path).collect_schema().names()
        required = {"Postcode", "lat", "lon", "ctry25cd"}
        missing = sorted(required - set(names))
        if missing:
            return [f"{path}: postcode features missing required columns: {missing}"]

        pct_cols = [c for c in names if c.startswith("% ")]
        # Only load the columns that are actually validated.
        df = (
            pl.scan_parquet(path)
            .select(["Postcode", "lat", "lon", "ctry25cd", *pct_cols])
            .collect()
        )
    except Exception as exc:
        return [f"{path}: postcode features validation failed: {exc}"]

    duplicate_rows = df.height - df["Postcode"].n_unique()
    if duplicate_rows:
        failures.append(
            f"{path}: Postcode is not unique "
            f"({duplicate_rows:,} duplicate rows)"
        )

    # England bounding box (generous): lat 49.5-60N, lon -8 to 2.5E.
    bad_coords = df.filter(
        pl.col("lat").is_null()
        | pl.col("lon").is_null()
        | ~pl.col("lat").is_between(49.5, 60.0)
        | ~pl.col("lon").is_between(-8.0, 2.5)
    )
    if bad_coords.height:
        sample = bad_coords.get_column("Postcode").head(10).to_list()
        failures.append(
            f"{path}: {bad_coords.height:,} rows have null or out-of-England "
            f"lat/lon; sample: {_format_samples(sample)}"
        )

    # A plain `!=` yields null for a null ctry25cd and filter() drops null
    # predicates, so missing country codes must be matched explicitly or they
    # would slip through as "valid".
    country = pl.col("ctry25cd")
    bad_country = df.filter(country.is_null() | (country != "E92000001"))
    if bad_country.height:
        sample = bad_country.get_column("Postcode").head(10).to_list()
        failures.append(
            f"{path}: {bad_country.height:,} rows have ctry25cd != 'E92000001' "
            f"(non-England contamination); sample: {_format_samples(sample)}"
        )

    # Null percentages are tolerated; only present values are range-checked.
    for col in pct_cols:
        out_of_range = df.filter(
            pl.col(col).is_not_null() & ~pl.col(col).is_between(0.0, 100.0)
        ).height
        if out_of_range:
            failures.append(
                f"{path}: {col!r} has {out_of_range:,} values outside [0, 100]"
            )

    return failures
|
||||
|
||||
|
||||
def _failures_for_properties_subset(spec: str) -> list[str]:
    """Validate a properties parquet against the postcode feature table.

    Two checks are performed: every properties Postcode must exist in the
    postcode feature table (no orphan properties), and numeric price/rent
    columns must contain only positive values (nulls are ignored).

    Args:
        spec: Pair specification handed to ``_split_pair``; the CLI help
            documents the form as ``PROPERTIES::POSTCODE``.

    Returns:
        A list of human-readable failure messages; empty when every check
        passes. If either file fails the generic parquet check, or the
        postcode sets cannot be loaded, only those failures are returned.
    """
    properties_path, postcode_path = _split_pair(spec, "properties subset")
    failures = _failures_for_parquet(properties_path) + _failures_for_parquet(
        postcode_path
    )
    if failures:
        return failures

    try:
        postcode_set = _parquet_postcodes(postcode_path)
        property_set = _parquet_postcodes(properties_path)
    except Exception as exc:
        return [f"{properties_path} / {postcode_path}: subset check failed: {exc}"]

    orphans = property_set - postcode_set
    if orphans:
        failures.append(
            f"{properties_path}: {len(orphans):,} property postcodes are absent from "
            f"{postcode_path}; sample: {_sample(orphans)}"
        )

    # Positivity check for genuine numeric price columns only (skip nested/list
    # columns like historical_prices, which contain "price" in the name).
    try:
        schema = pl.scan_parquet(properties_path).collect_schema()
        numeric = {
            pl.Int8, pl.Int16, pl.Int32, pl.Int64,
            pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
            pl.Float32, pl.Float64,
        }
        price_cols = [
            c
            for c, dtype in schema.items()
            if ("price" in c.lower() or "rent" in c.lower()) and dtype in numeric
        ]
        if price_cols:
            # Count non-positive values for every column in one pass over the
            # file instead of re-scanning the parquet once per column.
            bad_counts = (
                pl.scan_parquet(properties_path)
                .select(
                    [
                        (pl.col(c).is_not_null() & (pl.col(c) <= 0)).sum().alias(c)
                        for c in price_cols
                    ]
                )
                .collect()
                .row(0, named=True)
            )
            for col in price_cols:
                bad = bad_counts[col]
                if bad:
                    failures.append(
                        f"{properties_path}: {col!r} has {bad:,} non-positive values"
                    )
    except Exception as exc:
        failures.append(f"{properties_path}: price positivity check failed: {exc}")

    return failures
|
||||
|
||||
|
||||
def _failures_for_price_index(path: Path) -> list[str]:
    """Check the structural integrity of a price index parquet file.

    The file must provide the columns sector, type_group, year, log_index and
    n_pairs; log_index must be non-null and finite on every row; and the
    (sector, type_group, year) triple must be unique across rows.

    n_pairs == 0 is intentionally NOT treated as a failure: those rows are
    legitimate hedonic/shrinkage fallbacks for sectors with too few
    repeat-sale pairs.

    Args:
        path: Location of the price index parquet file.

    Returns:
        A list of human-readable failure messages; empty when every check
        passes. A failed generic parquet check, a missing column, or a read
        error short-circuits the remaining checks.
    """
    problems = _failures_for_parquet(path)
    if problems:
        return problems

    key_cols = ("sector", "type_group", "year")
    try:
        lazy = pl.scan_parquet(path)
        present = set(lazy.collect_schema().names())
        absent = sorted({*key_cols, "log_index", "n_pairs"} - present)
        if absent:
            return [f"{path}: price index missing required columns: {absent}"]

        log_index = pl.col("log_index")
        # One aggregated query yields all four counters in a single row.
        total, null_log, nonfinite_log, unique_keys = (
            lazy.select(
                pl.len().alias("n"),
                log_index.null_count().alias("null_log"),
                (~log_index.is_finite()).sum().alias("nonfinite_log"),
                pl.struct(*key_cols).n_unique().alias("unique_keys"),
            )
            .collect()
            .row(0)
        )
    except Exception as exc:
        return [f"{path}: price index validation failed: {exc}"]

    if null_log:
        problems.append(f"{path}: {null_log:,} rows have null log_index")

    if nonfinite_log:
        problems.append(
            f"{path}: {nonfinite_log:,} rows have non-finite log_index"
        )

    if unique_keys != total:
        problems.append(
            f"{path}: (sector, type_group, year) is not unique "
            f"({total - unique_keys:,} duplicate rows)"
        )

    return problems
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--file", action="append", default=[], type=Path)
|
||||
|
|
@ -385,6 +555,29 @@ def main() -> int:
|
|||
"GeoJSON postcodes: ARCGIS_PARQUET::DIR"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--postcode-features",
|
||||
action="append",
|
||||
default=[],
|
||||
type=Path,
|
||||
help=(
|
||||
"Validate a postcode feature parquet: unique Postcode, non-null "
|
||||
"lat/lon in England, ctry25cd=E92000001, '% ' columns in [0,100]"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--properties-subset",
|
||||
action="append",
|
||||
default=[],
|
||||
help="Require properties postcodes to be a subset of postcode keys: PROPERTIES::POSTCODE",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--price-index",
|
||||
action="append",
|
||||
default=[],
|
||||
type=Path,
|
||||
help="Validate price_index.parquet: finite log_index and unique (sector,type_group,year)",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
failures: list[str] = []
|
||||
|
|
@ -404,6 +597,12 @@ def main() -> int:
|
|||
failures.extend(_failures_for_postcode_boundary_match(spec))
|
||||
for spec in args.active_postcode_boundary_match:
|
||||
failures.extend(_failures_for_active_postcode_boundary_match(spec))
|
||||
for path in args.postcode_features:
|
||||
failures.extend(_failures_for_postcode_features(path))
|
||||
for spec in args.properties_subset:
|
||||
failures.extend(_failures_for_properties_subset(spec))
|
||||
for path in args.price_index:
|
||||
failures.extend(_failures_for_price_index(path))
|
||||
|
||||
if failures:
|
||||
print("Output validation failed:", file=sys.stderr)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue