Source code for gcmprocpy.imfgen.dataset

"""Assemble processed channels into an xarray Dataset and write NetCDF.

The variable set, dimension name (``ndata``) and attributes reproduce the
original ``imf_create.py`` / ``bcwind_imf.py`` output exactly (one extra pair of
convenience attributes -- ``yearday_beg`` / ``yearday_end`` -- is added so the
filename can be derived without re-deriving it from the data).
"""

import os
from datetime import datetime

import numpy as np
import xarray as xr

from .dates import datesec_of_day, to_yyyymmdd
from .processing import CHANNELS

# data-variable name + mask-variable name for each channel
VAR_NAMES = {"bx": "bx", "by": "by", "bz": "bz", "swden": "swden", "swvel": "swvel"}
MASK_NAMES = {"bx": "bxMask", "by": "byMask", "bz": "bzMask",
              "swden": "denMask", "swvel": "velMask"}
UNITS = {"bx": "nT", "by": "nT", "bz": "nT", "swden": "cm^{-3}", "swvel": "km/s"}
LONG_NAMES = {"bx": "IMF Bx", "by": "IMF By", "bz": "IMF Bz",
              "swden": "solar wind density", "swvel": "solar wind velocity"}
MASK_LONG_NAME = "Quality flag: 0=data derived from linear interpolation."

DEFAULT_PREFIX = {"omni": "imf_OMNI", "bcwind": "imf_bcwind"}

_SOURCE_ATTRS = {
    "omni": {
        "Description": ("10-minute average of OMNI data trailed by 1 minutes. "
                        "Sampled to minute output"),
        "Source": "Hourly OMNI combined 1AU IP Data",
        "url_reference": "https://omniweb.gsfc.nasa.gov/ow_min.html",
    },
    "bcwind": {
        "Description": "BCWIND.h5 to minute output IMF data",
        "Source": "bcwind.h5",
        "url_reference": "https://github.com/AnonNick/IMF",
    },
}



[docs]
def build_dataset(processed, dates, timestamps, source="omni", source_path=None,
                  model="tiegcm", datetimes=None):
    """Build the IMF ``xarray.Dataset``.

    ``processed`` maps each channel in :data:`CHANNELS` to ``(values, mask)``.
    ``dates`` is the ``YYYYDDD.frac`` float array; ``timestamps`` the ISO strings.

    ``model`` selects the target model's input format:

    - ``"tiegcm"`` (default): the TIE-GCM format on the ``ndata`` dimension, with
      ``date`` (``YYYYDDD.frac``) and the ISO ``timestamp`` -- reproduced exactly
      (see the module docstring).
    - ``"waccmx"``: the WACCM-X (CESM) format on an (unlimited-on-write) ``time``
      dimension, with ``date`` (``YYYYMMDD`` int), ``datefrac`` (the old
      ``YYYYDDD.frac``) and ``datesec`` (seconds of day). Requires ``datetimes``
      (the source ``datetime`` objects); there is no ISO ``timestamp`` variable.
    """
    dates = np.asarray(dates)
    ndata = len(dates)
    dim = "time" if model == "waccmx" else "ndata"
    data_vars = {}
    for name in CHANNELS:
        values, mask = processed[name]
        data_vars[VAR_NAMES[name]] = (
            dim, np.asarray(values, dtype=float),
            {"units": UNITS[name], "long_name": LONG_NAMES[name]},
        )
        data_vars[MASK_NAMES[name]] = (
            dim, np.asarray(mask, dtype="int8"),
            {"units": "boolean", "long_name": MASK_LONG_NAME},
        )

    if model == "waccmx":
        if datetimes is None:
            raise ValueError(
                "model='waccmx' requires datetimes= (the source datetime list)."
            )
        data_vars["date"] = (
            dim, np.array([to_yyyymmdd(t) for t in datetimes], dtype="int32"),
            {"long_name": "Current date (YYYYMMDD)", "units": ""},
        )
        data_vars["datefrac"] = (
            dim, dates.astype("float64"),
            {"long_name": "year-day plus fractional day: yyyyddd.frac", "units": ""},
        )
        data_vars["datesec"] = (
            dim, np.array([datesec_of_day(t) for t in datetimes], dtype="int32"),
            {"long_name": "Seconds of current date", "units": ""},
        )
        # 'time' is a bare dimension (no coordinate variable), as WACCM-X expects.
        ds = xr.Dataset(data_vars)
    else:
        data_vars["date"] = (
            dim, dates,
            {"long_name": "year-day plus fractional day: yyyyddd.frac"},
        )
        data_vars["timestamp"] = (
            dim, np.asarray(timestamps),
            {"long_name": "Timestamp of the data: YYYY-MM-DDTHH:MM:SS"},
        )
        ds = xr.Dataset(data_vars, coords={"ndata": np.arange(ndata)})

    attrs = dict(_SOURCE_ATTRS[source])
    if source == "bcwind" and source_path:
        attrs["Source"] = str(source_path)
    attrs["CreationTime"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
    attrs["Version"] = "1.0.0"
    attrs["CreatedBy"] = "nikhilr"
    attrs["data_source"] = source
    if model != "tiegcm":
        attrs["model"] = model      # only stamped for non-default formats
    attrs["yearday_beg"] = int(dates[0])
    attrs["yearday_end"] = int(dates[-1])
    # url_reference last, matching the originals' ordering
    attrs["url_reference"] = _SOURCE_ATTRS[source]["url_reference"]
    ds.attrs.update(attrs)
    return ds




[docs]
def imf_filename(ds, prefix=None):
    """``<prefix>_<begYYYYDDD>-<endYYYYDDD>.nc`` from the dataset's bounds.

    For ``model='waccmx'`` datasets the default prefix gains a ``WACCMX`` tag
    (e.g. ``imf_OMNI_WACCMX_...``); an explicit *prefix* is used verbatim.
    """
    if prefix is None:
        prefix = DEFAULT_PREFIX.get(ds.attrs.get("data_source", "omni"), "imf")
        if ds.attrs.get("model") == "waccmx":
            prefix = f"{prefix}_WACCMX"
    beg = int(ds.attrs["yearday_beg"])
    end = int(ds.attrs["yearday_end"])
    return f"{prefix}_{beg}-{end}.nc"




[docs]
def save_imf(ds, output_dir=".", prefix=None, path=None):
    """Write ``ds`` to NetCDF and return the path written.

    ``path`` overrides the auto-generated ``<prefix>_<beg>-<end>.nc`` name. (For
    per-year output, generate each year with :func:`imfgen.generate_imf_years`
    and call this once per dataset -- see ``imfgen --split-years``.)
    """
    if path is None:
        os.makedirs(output_dir, exist_ok=True)
        path = os.path.join(output_dir, imf_filename(ds, prefix))
    else:
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
    unlimited = ["time"] if ds.attrs.get("model") == "waccmx" else None
    ds.to_netcdf(path=path, unlimited_dims=unlimited)
    return path