Source code for gcmprocpy.gpigen.core

"""Top-level orchestration: fetch -> process -> Dataset."""

from .dates import coerce_datetime, yesterday
from .processing import (
    build_kp,
    compute_end_trims,
    fill_fobs,
    running_average,
    trim,
)
from .dataset import build_dataset
from .sources import fetch
from datetime import timedelta



[docs]
def generate_gpi(
    start="1960-01-01",
    end=None,
    source="json",
    window=81,
    centered=True,
    status="def",
    cache_dir=None,
    verbose=False,
):
    """Generate a GPI ``xarray.Dataset`` for ``[start, end]``.

    Parameters
    ----------
    start, end : date-like or None
        Inclusive bounds (``YYYY-MM-DD``, ``YYYYDDD``, ISO, or ``datetime``).
        ``end=None`` defaults to yesterday. The output begins at ``start`` but,
        for a centered window, ends ``window // 2`` days before ``end`` (a
        centered average needs future data that does not yet exist).
    source : {"json", "textfile"}
        GFZ JSON API (default) or the locally-parsed 1932-onward text file.
    window : int
        Averaging window in days (default 81).
    centered : bool
        Centered (default) vs trailing average for ``f107a``.
    status : str
        GFZ ``status`` query param for the JSON API (default ``"def"``).
    cache_dir : str or None
        Where to drop the downloaded text file (``textfile`` source only).
    verbose : bool
        Print progress.

    Returns
    -------
    xarray.Dataset
        Use :func:`gpigen.save_gpi` to write it to NetCDF.
    """
    start_dt = coerce_datetime(start)
    end_dt = coerce_datetime(end, end_of_day=True) if end is not None else yesterday()

    half = window // 2
    start_remove = half if centered else window
    avg_end_invalid = half if centered else 0

    # Fetch extra lead-in days so the average is complete at `start`.
    fetch_start = start_dt - timedelta(days=start_remove)
    if verbose:
        print(
            f"Fetching {source} data {fetch_start.date()} -> {end_dt.date()} "
            f"(window={window}, {'centered' if centered else 'trailing'})"
        )
    fobs_data, kp_data = fetch(
        source, fetch_start, end_dt, status=status, cache_dir=cache_dir, verbose=verbose
    )

    if not fobs_data["Fobs"] or not kp_data["Kp"]:
        raise ValueError(
            f"No GPI data available for {start_dt.date()} -> {end_dt.date()} "
            f"(source={source!r}): the GFZ {source} source returned no records for "
            f"this range. Check the dates fall within the published archive and are "
            f"not in the future."
        )

    year_day, f107d, missing_dates = fill_fobs(fobs_data)
    if verbose and missing_dates:
        print(f"Filled {len(missing_dates)} missing F10.7 day(s): {missing_dates}")

    f107a = running_average(f107d, window, centered)
    kp, unique_dates = build_kp(kp_data)

    fobs_end, kp_end = compute_end_trims(year_day, unique_dates, avg_end_invalid)

    year_day = trim(year_day, start_remove, fobs_end)
    f107d = trim(f107d, start_remove, fobs_end)
    f107a = trim(f107a, start_remove, fobs_end)
    kp = trim(kp, start_remove, kp_end)

    if not (len(year_day) == len(f107d) == len(f107a) == len(kp)):
        raise RuntimeError(
            "Array length mismatch after trimming "
            f"(year_day={len(year_day)}, f107d={len(f107d)}, "
            f"f107a={len(f107a)}, kp={len(kp)})."
        )

    if len(year_day) == 0:
        kind = "centered" if centered else "trailing"
        raise ValueError(
            f"No GPI output days for {start_dt.date()} -> {end_dt.date()}: the "
            f"available data is shorter than the {window}-day {kind} averaging "
            f"window. Request a wider date range or a smaller window."
        )

    ds = build_dataset(year_day, f107d, f107a, kp, window, centered, missing_dates)
    if verbose:
        print(
            f"Built GPI dataset: {ds.attrs['yearday_beg']} -> "
            f"{ds.attrs['yearday_end']} ({len(year_day)} days)"
        )
    return ds