import logging
log=logging.getLogger('xr_utils')
from collections import OrderedDict
import xarray as xr
import six
import numpy as np
[docs]def gradient(ds,varname,coord):
# rather than assume that consecutive data points are valid,
# fit a line to the data values per water column
daC,daz = xr.broadcast(ds[varname],ds[coord])
z=daz.values
C=daC.values
assert z.shape==C.shape
newdims=[dim for dim in daC.dims if dim!=coord]
newshape=[len(daC[dim]) for dim in newdims]
newdims,newshape
result=np.zeros(newshape,'f8')
for idx in np.ndindex(*newshape):
colC=C[idx]
colz=z[idx]
assert colC.ndim==colz.ndim==1 # sanity
valid=np.isfinite(colC*colz)
if np.sum(valid)>1:
mb=np.polyfit(colz[valid],colC[valid],1)
result[idx]=mb[0]
else:
result[idx]=np.nan
return xr.DataArray(result,coords=[ds[dim] for dim in newdims],name='d%s/d%s'%(varname,coord))
[docs]def find_var(nc,pred):
for fld in nc:
try:
if pred(nc[fld]):
return fld
except:
pass
return None
[docs]def redimension(ds,new_dims,
intragroup_dim=None,
inplace=False,
save_mapping=False):
"""
copy ds, making new_dims into the defining
dimensions for variables which share shape with new_dims.
each entry in new_dims must have the same dimension, and
must be unidimensional
Example:
Dataset:
coordinates
sample [0,1,2..100]
data variables
date(sample) [...]
station(sample) [...]
depth(sample) [...]
salinity(sample) [...]
We'd like to end up with
salinity(date,station,profile_sample)
depth(date,station,profile_sample)
Or
Dataset:
coordinates
time [...]
item [...]
data variables
x(item) [...]
y(item) [...]
z(item) [...]
salinity(time,time) [...,...]
Which you want to become
Dataset:
coordinates
time [.]
x [.]
y [.]
zi [.]
data variables
z(x,y,zi) [...]
salinity(time,x,y,zi) [....]
In other words, replace item with three orthogonal dimensions. Two of the
orthogonal dimensions have existing coordinates, and the third is an index
to elements within the bin defined by x,y.
save_mapping: create an additional variable in the output which stores the
mapping of the linear dimension to the new, orthogonal dimensions
intragroup_dim: introduce an additional dimension to enumerate the original
data which map to the same new coordinates.
"""
if not inplace:
ds=ds.copy()
lin_dim=ds[new_dims[0]].dims[0]# the original linear dimension
orig_dims=[ ds[vname].values.copy()
for vname in new_dims ]
Norig=len(orig_dims[0]) # length of the original, linear dimension
uni_new_dims=[ np.unique(od) for od in orig_dims]
for und in uni_new_dims:
try:
if np.any(und<0):
log.warning("New dimensions have negative values -- will continue but you probably want to drop those first")
except TypeError:
# some versions of numpy/xarray will not compare times to 0,
# triggering a TypeError
pass
# note that this is just the shape that will replace occurences of lin_dim
new_shape=[len(und) for und in uni_new_dims]
# build up an index array
new_idxs=[ np.searchsorted(und,od)
for und,od in zip( uni_new_dims, orig_dims ) ]
if intragroup_dim is not None:
# here we need to first count up the max number within each 'bin'
# so new_idxs
count_per_group=np.zeros(new_shape,'i4')
intra_idx=np.zeros(Norig,'i4')
for orig_idx,idxs in enumerate(zip(*new_idxs)):
intra_idx[orig_idx] = count_per_group[idxs]
count_per_group[ idxs ]+=1
n_intragroup=count_per_group.max() # 55 in the test case
# add in the new dimension
new_shape.append(n_intragroup)
new_idxs.append(intra_idx)
# negative means missing. at this point, intragroup_dim has not been taken care of
# mapper: array of the shape of the new dimensions, with each entry giving the linear
# index into the original dimension
mapper=np.zeros(new_shape,'i4') - 1
mapper[ tuple(new_idxs) ] = np.arange(Norig)
# install the new coordinates - first the grouped coordinates
for nd,und in zip(new_dims,uni_new_dims):
del ds[nd] # doesn't like replacing these in one go
ds[nd]= ( (nd,), und )
if intragroup_dim is not None:
# and second the new intragroup coordinate:
new_dims.append(intragroup_dim)
ds[intragroup_dim] = ( (intragroup_dim,), np.arange(n_intragroup) )
for vname in ds.data_vars:
if lin_dim not in ds[vname].dims:
# print("Skipping %s"%vname)
continue
# print(vname)
var_new_dims=[]
var_new_slice=[]
mask_slice=[]
for d in ds[vname].dims:
if d==lin_dim:
var_new_dims += new_dims
var_new_slice.append( mapper )
mask_slice.append( mapper<0 )
else:
var_new_dims.append(d)
var_new_slice.append(slice(None))
mask_slice.append(slice(None))
var_new_dims=tuple(var_new_dims)
var_new_slice=tuple(var_new_slice)
# this is time x nSegment
# ds[vname].values.shape # 10080,1494
# This is the beast: but now it's including some crap values at the beginning
new_vals=ds[vname].values[var_new_slice]
mask=np.zeros_like(new_vals,'b1')
mask[mask_slice] = True
new_vals=np.ma.array(new_vals,mask=mask)
old_attrs=OrderedDict(ds[vname].attrs)
# This seems to be dropping the attributes
ds[vname]=( var_new_dims, new_vals )
for k in old_attrs:
if k != '_FillValue':
ds[vname].attrs[k] = old_attrs[k]
if save_mapping:
ds['mapping']= ( new_dims, mapper)
return ds
[docs]def sort_dimension(ds,sort_var,sort_dim,inplace=False):
"""
sort_var: variable whose value will be used to sort items along sort_dim.
sort_dim must be in sort_var.dims
only variables with dimensions the same or a superset of sort_var.dims
can/will be sorted.
"""
if not inplace:
ds=ds.copy()
#if ds[sort_var].ndim>1:
# the interesting case
# want to sort within each 'bin'
sort_var_dims=ds[sort_var].dims
sort_var_dimi = sort_var_dims.index(sort_dim)
new_order=ds[sort_var].argsort(axis=sort_var_dimi).values
# this only works for variables with a superset of sort_var's
# dimensions (or the same).
# i.e. depth(date,station,prof_sample)
# can't be used to sort a variable of (station,prof_sample)
# but it can be used to sort a variable of (analyte,date,station,prof_sample)
for v in ds.data_vars:
for d in sort_var_dims:
compat=True
if d not in ds[v].dims:
# print("%s not compatible with dimensions for sorting"%v)
compat=False
if not compat: continue
# build up transpose
trans_dims=[]
for d in ds[v].dims:
if d not in sort_var_dims:
trans_dims.append(d)
n_extra=len(trans_dims)
trans_dims+=sort_var_dims
orig_dims=ds[v].dims
tmp_trans=ds[v].transpose(*trans_dims)
vals=tmp_trans.values
# actually a tricky type of indexing to handle:
# new_order.shape: (23, 37, 52)
# luckily numpy knows how to do this relatively efficiently:
idxs=np.ix_( *[np.arange(N) for N in vals.shape] )
idxs=list(idxs)
idxs[n_extra+sort_var_dimi]=new_order
tmp_trans.values=vals[tuple(idxs)]
ds[v].values=tmp_trans.transpose(*orig_dims)
return ds
[docs]def first_finite(da,dim):
# yecch.
valid=np.isfinite(da.values)
dimi=da.get_axis_num('prof_sample')
first_valid=np.argmax( valid, axis=dimi)
new_shape=[ slice(length)
for d,length in enumerate(da.shape)
if d!=dimi ]
indexers=np.ogrid[ tuple(new_shape) ]
indexers[dimi:dimi]=[first_valid]
da_reduced=da.isel(**{dim:0,'drop':True})
da_reduced.values=da.values[tuple(indexers)]
return da_reduced
# Helper for z_from_sigma
[docs]def decode_sigma(ds,sigma_v):
"""
ds: Dataset
sigma_v: sigma coordinate variable.
return DataArray of z coordinate implied by sigma_v
"""
import re
formula_terms=sigma_v.attrs['formula_terms']
terms={}
for hit in re.findall(r'\s*(\w+)\s*:\s*(\w+)', formula_terms):
terms[hit[0]]=ds[hit[1]]
# this is where xarray really shines -- it will promote z to the
# correct dimensions automatically, by name
# This ordering of the multiplication puts laydim last, which is
# assumed in some other [fragile] code.
# a little shady, but its helpful to make the ordering here intentional
z=(terms['eta'] - terms['bedlevel'])*terms['sigma'] + terms['bedlevel']
return z
[docs]def z_from_sigma(dataset,variable,interfaces=False,dz=False):
"""
Create a z coordinate for variable as a Dataset from the given dataset
interfaces: False => do nothing related to layer boundaries
variable name => use the given variable to define interfaces between layers.
True => try to infer the variable, fallback to even spacing otherwise.
if interfaces is anything other than False, then the return value will be a Dataset
with the centers in a 'z_ctr' variable and the interfaces in a 'z_int'
dz: implies interfaces, and includes a z_dz variable giving thickness of each layer.
"""
da=dataset[variable]
da_dims=da.dims
if dz:
assert interfaces is not False,"Currently must enable interfaces to get thickness dz"
# Variables which are definitely sigma, and might be the one we're looking for
sigma_vars=[v for v in dataset.variables
if dataset[v].attrs.get('standard_name',None) == 'ocean_sigma_coordinate']
# xr data arrays
sigma_ctr_v=None # sigma coordinate for centers
sigma_int_v=None # sigma coordinate for interfaces
for v in sigma_vars:
if set(dataset[v].dims)<=set(da_dims):
assert sigma_ctr_v is None,"Multiple matches for layer center sigma coordinate"
sigma_ctr_v=dataset[v]
assert sigma_ctr_v is not None,"Failed to find a layer-center sigma coordinate"
# With the layer center variable known, can search for layer interfaces
if interfaces is False:
pass
else:
if interfaces is True:
maybe_int_vars=sigma_vars
else:
# Even when its specified, check to see that it has the expected form
maybe_int_vars=[interfaces]
for v in maybe_int_vars:
ctr_dims=set(sigma_ctr_v.dims)
int_dims=set(dataset[v].dims)
ctr_only = list(ctr_dims - int_dims)
int_only = list(int_dims - ctr_dims)
if (len(ctr_only)!=1) or (len(int_only)!=1):
continue
if len(dataset[ctr_only[0]])+1==len(dataset[int_only[0]]):
assert sigma_int_v is None,"Multiple matches for layer interface sigma coordinate"
sigma_int_v=dataset[v]
z_ctr=decode_sigma(dataset,sigma_ctr_v)
if sigma_int_v is not None:
z_int=decode_sigma(dataset,sigma_int_v)
result=xr.Dataset()
result['z_ctr']=z_ctr
if interfaces is not False:
result['z_int']=z_int
if dz is not False:
dz=xr.ones_like( z_ctr )
dz.values[...]=np.diff( z_int, axis=z_int.get_axis_num(int_only[0]))
result['z_dz']=dz
return result
[docs]def bundle_components(ds,new_var,comp_vars,frame,comp_names=None):
"""
ds: Dataset
new_var: name of the vector-valued variable to create
comp_vars: list of variables, one-per component
frame: name to give the component dimension, i.e. the name of the
reference frame
comp_names: list same length as comp_vars, used to name the components.
"""
vector=xr.concat([ds[v] for v in comp_vars],dim=frame)
# That puts xy as the first dimension, but I'd rather it last
dims=vector.dims
roll_dims=dims[1:] + dims[:1]
ds[new_var]=vector.transpose( *roll_dims )
if comp_names is not None:
ds[frame]=(frame,),comp_names
[docs]def concat_permissive(srcs,**kw):
"""
Small wrapper around xr.concat which fills in nan
coordinates where they are missing, in case some
of the incoming datasets have more metadata than others.
"""
extra_coords=set()
for src in srcs:
extra_coords |= set(src.coords)
expanded_srcs=[]
for src in srcs:
for extra in extra_coords:
if extra not in src:
src=src.assign_coords(**{extra:np.nan})
expanded_srcs.append(src)
return xr.concat(expanded_srcs,**kw)
[docs]def structure_to_dataset(arr,dim,extra={}):
"""
Convert a numpy structure array to a dataset.
arr: structure array.
dim: name of the array dimension. can be a tuple with multiple dimension
names if arr.ndim>1.
extra: dict optionally mapping specific fields to additional dimensions
within that field.
"""
if isinstance(dim,six.string_types):
dim=(dim,)
ds=xr.Dataset()
for fld in arr.dtype.names:
if fld in extra:
extra_dims=extra[fld]
else:
extra_dims=['d%02d'%d for d in arr[fld].shape[1:]]
ds[fld]=dim+tuple(extra_dims),arr[fld]
return ds