"""
Merge multiple rainfall CSVs, validate the merge, reformat dates,
and compute monthly totals plus a long-term climatology.

Usage:
    $ python rainfall_climatology.py file1.csv file2.csv ...

Options:
    --date-col COL     name of the date column (default: date)
    --rain-col COL     name of the rainfall column (default: rain)
    --threshold X      fraction of days required to accept a month (default: 0.9)
    --merged-out FILE  filename for the merged CSV (default: merged_rainfall.csv)
    --quiet            suppress screen output
"""
from __future__ import annotations

import argparse
import math
from pathlib import Path

import pandas as pd
def read_concat_sort(
    files: list[Path], date_col: str, rain_col: str
) -> tuple[pd.DataFrame, float]:
    """Read and concatenate all CSVs, sort chronologically, drop duplicate dates.

    Args:
        files: Paths of the CSV files to merge.
        date_col: Name of the date column in the input files.
        rain_col: Name of the rainfall column in the input files.

    Returns:
        ``(merged_df, raw_sum)``. ``merged_df`` is indexed by ``date`` and
        holds a single numeric ``rain`` column. ``raw_sum`` is the sum of
        rain from all inputs prior to de-duplication.

    Raises:
        ValueError: If no input file was supplied.
    """
    frames = []
    raw_sum = 0.0
    for path in files:
        # ``infer_datetime_format`` is deliberately not passed: it is
        # deprecated since pandas 2.0 and no longer needed for parsing.
        frame = (
            pd.read_csv(path, parse_dates=[date_col])
            .rename(columns={date_col: "date", rain_col: "rain"})
            .loc[:, ["date", "rain"]]
            # coerce non-numeric rainfall to NaN so it is skipped when summing
            .assign(rain=lambda d: pd.to_numeric(d["rain"], errors="coerce"))
        )
        # accumulate the raw sum before any de-duplication happens
        raw_sum += float(frame["rain"].sum(skipna=True))
        frames.append(frame)

    if not frames:
        raise ValueError("No CSV files supplied.")

    merged = (
        pd.concat(frames, ignore_index=True)
        # stable sort: for a repeated date the row from the earliest
        # supplied file stays first and is the one drop_duplicates keeps
        .sort_values("date", kind="mergesort")
        .drop_duplicates(subset="date")  # keep only one entry per date
        .set_index("date")
    )
    return merged, raw_sum
def validate_merge(merged: pd.DataFrame, raw_sum: float) -> None:
    """Compare the sum of 'rain' in merged vs. the raw inputs.

    Prints an ``[ok]`` line when the two sums agree and a ``[warn]`` report
    with both sums otherwise. Nothing is raised on a mismatch.

    Args:
        merged: Frame with a ``rain`` column; NaN values are skipped.
        raw_sum: Sum of rain over all inputs, to compare against.
    """
    merged_sum = merged["rain"].sum(skipna=True)

    # The absolute tolerance keeps the historical 1e-6 threshold; the
    # relative tolerance additionally absorbs float round-off on large totals.
    if math.isclose(merged_sum, raw_sum, rel_tol=1e-9, abs_tol=1e-6):
        print(f"[ok] Validation passed: merged sum = {merged_sum:.3f}")
        return

    print("[warn] Mismatch detected:")
    print(f" sum of inputs = {raw_sum:.3f}")
    print(f" sum after merge = {merged_sum:.3f}")
    # Uncomment to enforce failure:
    # raise RuntimeError("Merged sum does not match sum of inputs")
def monthly_totals(df: pd.DataFrame, min_frac: float = 0.9) -> pd.DataFrame:
    """Sum rainfall per calendar month, keeping only sufficiently complete months.

    Args:
        df: Frame with a ``rain`` column and a datetime index. A frame that
            carries the dates in a ``date`` column instead is indexed by that
            column first.
        min_frac: Minimum fraction of the month's days that must have at
            least one non-NaN observation for the month to be kept.

    Returns:
        Frame with columns ``year``, ``month`` and ``monthly_total``, one row
        per accepted month.
    """
    if not isinstance(df.index, pd.DatetimeIndex) and "date" in df.columns:
        df = df.set_index("date")

    rain = df["rain"]
    daily_count = rain.resample("D").count()
    daily_sum = rain.resample("D").sum(min_count=1)

    # Count *days* that have data, not observations: several readings on the
    # same day must not inflate the completeness fraction.
    days_with_data = daily_count.gt(0).astype(int).resample("MS").sum()
    m_sum = daily_sum.resample("MS").sum(min_count=1)

    month_length = days_with_data.index.days_in_month.to_numpy()
    mask = (days_with_data / month_length) >= min_frac

    good = (
        m_sum[mask]
        .to_frame("monthly_total")
        .assign(year=lambda x: x.index.year,
                month=lambda x: x.index.month)
    )
    return good[["year", "month", "monthly_total"]]
def climatology(monthly_df: pd.DataFrame) -> pd.Series:
    """Return the mean ``monthly_total`` for each value of ``month``.

    Args:
        monthly_df: Frame with at least ``month`` and ``monthly_total``
            columns.

    Returns:
        Series indexed by ``month`` in ascending order, holding the mean of
        ``monthly_total`` over all rows that share that month.
    """
    return monthly_df.groupby("month")["monthly_total"].mean().sort_index()
def main(
    argv: list[str] | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
    """Run the merge / validate / monthly-totals / climatology pipeline.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``(merged, monthly_df, clim)``: the merged series, the per-year
        monthly totals, and the long-term mean per calendar month.
    """
    p = argparse.ArgumentParser(description="Merge rainfall CSVs and compute climatology.")
    p.add_argument("csv_files", nargs="+", type=Path, help="Input CSV file(s)")
    p.add_argument("--date-col", default="date", help="Date column name (default: date)")
    p.add_argument("--rain-col", default="rain", help="Rainfall column name (default: rain)")
    p.add_argument("--threshold", type=float, default=0.9,
                   metavar="X", help="Fraction of days required per month (default: 0.9)")
    p.add_argument("--merged-out", default="merged_rainfall.csv", metavar="FILE",
                   help="Filename for the merged output CSV (default: merged_rainfall.csv)")
    p.add_argument("--quiet", action="store_true", help="Suppress screen output")
    args = p.parse_args(argv)
    verbose = not args.quiet

    # 1. read, concat, sort ➔ get raw sum of all inputs
    merged, raw_sum = read_concat_sort(args.csv_files, args.date_col, args.rain_col)

    # 2. validate the merge (validate_merge prints its own report)
    validate_merge(merged, raw_sum)

    # 3. write merged file with MM-DD-YYYY date format
    # NOTE(review): assumes ``merged`` is indexed by date, as the
    # ``merged.index[...]`` accesses below already require.
    merged.to_csv(args.merged_out, date_format="%m-%d-%Y")
    if verbose:
        print(f"[info] Merged series saved to {args.merged_out} "
              f"({len(merged):,} rows, {merged.index[0].date():%m-%d-%Y}–"
              f"{merged.index[-1].date():%m-%d-%Y}).")

    # 4. monthly totals per year
    monthly_df = monthly_totals(merged, min_frac=args.threshold)
    if verbose:
        print("\nMonthly totals (filtered by completeness):")
        print(monthly_df.to_string(index=False, formatters={"monthly_total": "{:.2f}".format}))

    # 5. long-term mean for each calendar month
    clim = climatology(monthly_df)
    if verbose:
        print("\nLong-term mean precipitation (mm per month):")
        for m, v in clim.items():
            print(f"{m:02d}: {v:.2f}")

    return merged, monthly_df, clim
# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()