Source code for stompy.xr_utils

import logging
log=logging.getLogger('xr_utils')

from collections import OrderedDict

import xarray as xr
import six
import numpy as np

[docs]def gradient(ds,varname,coord):
    # rather than assume that consecutive data points are valid,
    # fit a line to the data values per water column
    daC,daz = xr.broadcast(ds[varname],ds[coord])

    z=daz.values
    C=daC.values
    assert z.shape==C.shape

    newdims=[dim for dim in daC.dims if dim!=coord]
    newshape=[len(daC[dim]) for dim in newdims]
    newdims,newshape
    result=np.zeros(newshape,'f8')

    for idx in np.ndindex(*newshape):
        colC=C[idx]
        colz=z[idx]
        assert colC.ndim==colz.ndim==1 # sanity
        valid=np.isfinite(colC*colz)
        if np.sum(valid)>1:
            mb=np.polyfit(colz[valid],colC[valid],1)
            result[idx]=mb[0]
        else:
            result[idx]=np.nan
    return xr.DataArray(result,coords=[ds[dim] for dim in newdims],name='d%s/d%s'%(varname,coord))

[docs]def find_var(nc,pred):
    for fld in nc:
        try:
            if pred(nc[fld]):
                return fld
        except:
            pass
    return None

[docs]def redimension(ds,new_dims,
                intragroup_dim=None,
                inplace=False,
                save_mapping=False):
    """
    copy ds, making new_dims into the defining
    dimensions for variables which share shape with new_dims.

    each entry in new_dims must have the same dimension, and
    must be unidimensional

    Example:
    Dataset:
      coordinates
        sample  [0,1,2..100]
      data variables
        date(sample)      [...]
        station(sample)   [...]
        depth(sample)     [...]
        salinity(sample)  [...]

    We'd like to end up with
      salinity(date,station,profile_sample)
      depth(date,station,profile_sample)

    Or
    Dataset:
      coordinates
        time [...]
        item [...]
      data variables
        x(item)        [...]
        y(item)        [...]
        z(item)        [...]
        salinity(time,time) [...,...]

    Which you want to become

    Dataset:
      coordinates
        time [.]
        x [.]
        y [.]
        zi [.]
      data variables
        z(x,y,zi)             [...]
        salinity(time,x,y,zi) [....]

    In other words, replace item with three orthogonal dimensions.  Two of the
    orthogonal dimensions have existing coordinates, and the third is an index
    to elements within the bin defined by x,y.

    save_mapping: create an additional variable in the output which stores the
    mapping of the linear dimension to the new, orthogonal dimensions

    intragroup_dim: introduce an additional dimension to enumerate the original
    data which map to the same new coordinates.
    """
    if not inplace:
        ds=ds.copy()

    lin_dim=ds[new_dims[0]].dims[0]# the original linear dimension
    orig_dims=[ ds[vname].values.copy()
                for vname in new_dims ]
    Norig=len(orig_dims[0]) # length of the original, linear dimension

    uni_new_dims=[ np.unique(od) for od in orig_dims]

    for und in uni_new_dims:
        try:
            if np.any(und<0):
                log.warning("New dimensions have negative values -- will continue but you probably want to drop those first")
        except TypeError:
            # some versions of numpy/xarray will not compare times to 0,
            # triggering a TypeError
            pass

    # note that this is just the shape that will replace occurences of lin_dim
    new_shape=[len(und) for und in uni_new_dims]

    # build up an index array 
    new_idxs=[ np.searchsorted(und,od)
               for und,od in zip( uni_new_dims, orig_dims ) ]

    if intragroup_dim is not None:
        # here we need to first count up the max number within each 'bin'
        # so new_idxs
        count_per_group=np.zeros(new_shape,'i4')
        intra_idx=np.zeros(Norig,'i4')
        for orig_idx,idxs in enumerate(zip(*new_idxs)):
            intra_idx[orig_idx] = count_per_group[idxs] 
            count_per_group[ idxs ]+=1
        n_intragroup=count_per_group.max() # 55 in the test case

        # add in the new dimension
        new_shape.append(n_intragroup)
        new_idxs.append(intra_idx)

    # negative means missing.  at this point, intragroup_dim has not been taken care of
    # mapper: array of the shape of the new dimensions, with each entry giving the linear
    # index into the original dimension
    mapper=np.zeros(new_shape,'i4') - 1
    mapper[ tuple(new_idxs) ] = np.arange(Norig)

    # install the new coordinates - first the grouped coordinates
    for nd,und in zip(new_dims,uni_new_dims):
        del ds[nd] # doesn't like replacing these in one go
        ds[nd]= ( (nd,), und )
    if intragroup_dim is not None:
        # and second the new intragroup coordinate:
        new_dims.append(intragroup_dim)
        ds[intragroup_dim] = ( (intragroup_dim,), np.arange(n_intragroup) )

    for vname in ds.data_vars:
        if lin_dim not in ds[vname].dims:
            # print("Skipping %s"%vname)
            continue
        # print(vname)

        var_new_dims=[]
        var_new_slice=[]
        mask_slice=[]
        for d in ds[vname].dims:
            if d==lin_dim:
                var_new_dims += new_dims
                var_new_slice.append( mapper )
                mask_slice.append( mapper<0 )
            else:
                var_new_dims.append(d)
                var_new_slice.append(slice(None))
                mask_slice.append(slice(None))
        var_new_dims=tuple(var_new_dims)
        var_new_slice=tuple(var_new_slice)

        # this is time x nSegment
        # ds[vname].values.shape # 10080,1494

        # This is the beast: but now it's including some crap values at the beginning
        new_vals=ds[vname].values[var_new_slice]
        mask=np.zeros_like(new_vals,'b1')
        mask[mask_slice] = True

        new_vals=np.ma.array(new_vals,mask=mask)

        old_attrs=OrderedDict(ds[vname].attrs)
        # This seems to be dropping the attributes
        ds[vname]=( var_new_dims, new_vals )
        for k in old_attrs:
            if k != '_FillValue':
                ds[vname].attrs[k] = old_attrs[k]

    if save_mapping:
        ds['mapping']= ( new_dims, mapper)

    return ds


[docs]def sort_dimension(ds,sort_var,sort_dim,inplace=False):
    """
    sort_var: variable whose value will be used to sort items along sort_dim.
    sort_dim must be in sort_var.dims
    only variables with dimensions the same or a superset of sort_var.dims
    can/will be sorted.
    """
    if not inplace:
        ds=ds.copy()
        
    #if ds[sort_var].ndim>1:
    # the interesting case
    # want to sort within each 'bin'
    
    sort_var_dims=ds[sort_var].dims
    sort_var_dimi = sort_var_dims.index(sort_dim)
    new_order=ds[sort_var].argsort(axis=sort_var_dimi).values

    # this only works for variables with a superset of sort_var's
    # dimensions (or the same).
    # i.e. depth(date,station,prof_sample)
    # can't be used to sort a variable of (station,prof_sample)
    # but it can be used to sort a variable of (analyte,date,station,prof_sample)
    for v in ds.data_vars:
        for d in sort_var_dims:
            compat=True
            if d not in ds[v].dims:
                # print("%s not compatible with dimensions for sorting"%v)
                compat=False
        if not compat: continue

        # build up transpose
        trans_dims=[]
        for d in ds[v].dims:
            if d not in sort_var_dims:
                trans_dims.append(d)
        n_extra=len(trans_dims)
        trans_dims+=sort_var_dims

        orig_dims=ds[v].dims
        tmp_trans=ds[v].transpose(*trans_dims)

        vals=tmp_trans.values
        # actually a tricky type of indexing to handle:
        # new_order.shape: (23, 37, 52)
        # luckily numpy knows how to do this relatively efficiently:
        idxs=np.ix_( *[np.arange(N) for N in vals.shape] )
        idxs=list(idxs)
        idxs[n_extra+sort_var_dimi]=new_order
        tmp_trans.values=vals[tuple(idxs)]
        ds[v].values=tmp_trans.transpose(*orig_dims)
    return ds


[docs]def first_finite(da,dim):
    # yecch.
    valid=np.isfinite(da.values)
    dimi=da.get_axis_num('prof_sample') 
    first_valid=np.argmax( valid, axis=dimi)
    new_shape=[ slice(length)
                for d,length in enumerate(da.shape)
                if d!=dimi ]
    indexers=np.ogrid[ tuple(new_shape) ]
    indexers[dimi:dimi]=[first_valid]

    da_reduced=da.isel(**{dim:0,'drop':True})
    da_reduced.values=da.values[tuple(indexers)]
    return da_reduced


# Helper for z_from_sigma
[docs]def decode_sigma(ds,sigma_v):
    """
    ds: Dataset
    sigma_v: sigma coordinate variable.
    return DataArray of z coordinate implied by sigma_v
    """
    import re
    formula_terms=sigma_v.attrs['formula_terms']
    terms={}
    for hit in re.findall(r'\s*(\w+)\s*:\s*(\w+)', formula_terms):
        terms[hit[0]]=ds[hit[1]]

    # this is where xarray really shines -- it will promote z to the
    # correct dimensions automatically, by name
    # This ordering of the multiplication puts laydim last, which is
    # assumed in some other [fragile] code.
    # a little shady, but its helpful to make the ordering here intentional
    z=(terms['eta'] - terms['bedlevel'])*terms['sigma'] + terms['bedlevel']

    return z


[docs]def z_from_sigma(dataset,variable,interfaces=False,dz=False):
    """
    Create a z coordinate for variable as a Dataset from the given dataset

    interfaces: False => do nothing related to layer boundaries
            variable name => use the given variable to define interfaces between layers.
            True => try to infer the variable, fallback to even spacing otherwise.
     if interfaces is anything other than False, then the return value will be a Dataset
     with the centers in a 'z_ctr' variable and the interfaces in a 'z_int'

    dz: implies interfaces, and includes a z_dz variable giving thickness of each layer.
    """
    da=dataset[variable]
    da_dims=da.dims

    if dz:
        assert interfaces is not False,"Currently must enable interfaces to get thickness dz"

    # Variables which are definitely sigma, and might be the one we're looking for
    sigma_vars=[v for v in dataset.variables
                if dataset[v].attrs.get('standard_name',None) == 'ocean_sigma_coordinate']

    # xr data arrays
    sigma_ctr_v=None # sigma coordinate for centers
    sigma_int_v=None # sigma coordinate for interfaces

    for v in sigma_vars:
        if set(dataset[v].dims)<=set(da_dims):
            assert sigma_ctr_v is None,"Multiple matches for layer center sigma coordinate"
            sigma_ctr_v=dataset[v]
    assert sigma_ctr_v is not None,"Failed to find a layer-center sigma coordinate"

    # With the layer center variable known, can search for layer interfaces
    if interfaces is False:
        pass
    else:
        if interfaces is True:
            maybe_int_vars=sigma_vars
        else:
            # Even when its specified, check to see that it has the expected form
            maybe_int_vars=[interfaces]

        for v in maybe_int_vars:
            ctr_dims=set(sigma_ctr_v.dims)
            int_dims=set(dataset[v].dims)

            ctr_only = list(ctr_dims - int_dims)
            int_only = list(int_dims - ctr_dims)

            if (len(ctr_only)!=1) or (len(int_only)!=1):
                continue
            if len(dataset[ctr_only[0]])+1==len(dataset[int_only[0]]):
                assert sigma_int_v is None,"Multiple matches for layer interface sigma coordinate"
                sigma_int_v=dataset[v]

    z_ctr=decode_sigma(dataset,sigma_ctr_v)
    if sigma_int_v is not None:
        z_int=decode_sigma(dataset,sigma_int_v)

    result=xr.Dataset()
    result['z_ctr']=z_ctr

    if interfaces is not False:
        result['z_int']=z_int
    if dz is not False:
        dz=xr.ones_like( z_ctr )
        dz.values[...]=np.diff( z_int, axis=z_int.get_axis_num(int_only[0]))
        result['z_dz']=dz

    return result


[docs]def bundle_components(ds,new_var,comp_vars,frame,comp_names=None):
    """
    ds: Dataset
    new_var: name of the vector-valued variable to create
    comp_vars: list of variables, one-per component
    frame: name to give the component dimension, i.e. the name of the
     reference frame
    comp_names: list same length as comp_vars, used to name the components.
    """
    vector=xr.concat([ds[v] for v in comp_vars],dim=frame)
    # That puts xy as the first dimension, but I'd rather it last
    dims=vector.dims
    roll_dims=dims[1:] + dims[:1]
    ds[new_var]=vector.transpose( *roll_dims )
    if comp_names is not None:
        ds[frame]=(frame,),comp_names



[docs]def concat_permissive(srcs,**kw):
    """
    Small wrapper around xr.concat which fills in nan 
    coordinates where they are missing, in case some
    of the incoming datasets have more metadata than others. 
    """
    extra_coords=set()
    for src in srcs:
        extra_coords |= set(src.coords)

    expanded_srcs=[]

    for src in srcs:
        for extra in extra_coords:
            if extra not in src:
                src=src.assign_coords(**{extra:np.nan})
        expanded_srcs.append(src)

    return xr.concat(expanded_srcs,**kw)

[docs]def structure_to_dataset(arr,dim,extra={}):
    """
    Convert a numpy structure array to a dataset.
    arr: structure array.
    dim: name of the array dimension.  can be a tuple with multiple dimension
      names if arr.ndim>1.
    extra: dict optionally mapping specific fields to additional dimensions
     within that field.
    """
    if isinstance(dim,six.string_types):
        dim=(dim,)
    ds=xr.Dataset()
    for fld in arr.dtype.names:
        if fld in extra:
            extra_dims=extra[fld]
        else:
            extra_dims=['d%02d'%d for d in arr[fld].shape[1:]]
        ds[fld]=dim+tuple(extra_dims),arr[fld]
    return ds
Source code for stompy.xr_utils

stompy

Navigation

Related Topics