#!/usr/bin/env python
""" test docstring """
from __future__ import print_function
import re
from itertools import chain
from operator import itemgetter
import numpy as np
import viscid
from viscid import logger
from viscid.compat import string_types
from viscid.bucket import Bucket
from viscid.grid import Grid
from viscid import tree
from viscid import vutil
from viscid.vutil import tree_prefix
from viscid.sliceutil import standardize_sel, std_sel2index, selection2values
__all__ = ['to_dataframe', 'from_dataframe']
def to_dataframe(collection, fld_names=None, selection=Ellipsis,
time_sel=slice(None), time_col='time', datetime_col='datetime'):
"""Consolidate field collection into pandas dataframe
Args:
collection (sequence): Can be one of (Field, List[Field],
Dataset, Grid)
fld_names (sequence, None): grab specific fields by name,
or None to grab all fields
selection (selection): optional spatial selection
time_sel (selection): optional time selection
time_col (str): column name of times
datetime_col (str): column name of datetimes
Returns:
pandas.DataFrame
"""
if not hasattr(collection, 'to_dataframe'):
if not isinstance(collection, (list, tuple)):
collection = [collection]
collection_dict = {}
for fld in collection:
if fld.time in collection_dict:
collection_dict[fld.time].append(fld)
else:
collection_dict[fld.time] = [fld]
dset = DatasetTemporal()
for t in sorted(list(collection_dict.keys())):
fld_list = collection_dict[t]
grid = Grid()
grid.crds = fld_list[0].crds
grid.time = t
grid.basetime = fld_list[0].basetime
grid.add_field(*fld_list)
dset.add(grid)
if len(collection_dict) == 1:
collection = dset.get_grid()
else:
collection = dset
frame = collection.to_dataframe(fld_names=fld_names, selection=selection,
time_sel=time_sel, time_col=time_col,
datetime_col=datetime_col)
return frame
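# A minimal usage sketch (illustrative, not part of the module; it relies only
# on viscid.arrays2crds and viscid.wrap_field, which are also used below):
#
#     import numpy as np
#     import viscid
#     x, y = np.linspace(-1.0, 1.0, 4), np.linspace(-1.0, 1.0, 3)
#     crds = viscid.arrays2crds([x, y], crd_names=['x', 'y'])
#     fld = viscid.wrap_field(np.zeros((4, 3)), crds, name='rho', center='node')
#     df = to_dataframe(fld)
#     # df should have one row per node, with coordinate columns ('x', 'y'),
#     # a 'time' column, and a 'rho' column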
def from_dataframe(frame, crd_cols=None, time_col='time', datetime_col='datetime'):
"""Make either a DatasetTemporal or Grid from pandas dataframe
Args:
frame (pandas.DataFrame): frame to parse
crd_cols (List[Str], None): list of column names for coordinates
time_col (str): column name of times
datetime_col (str): column name of datetimes
Returns:
DatasetTemporal or Grid
Raises:
ValueError: if only 1 row given and crd_cols is None
"""
# deferred import so that viscid does not depend on pandas
import pandas
# discover times and possible basetime
try:
unique_times = frame[time_col].drop_duplicates()
if datetime_col in frame:
unique_datetimes = frame[datetime_col].drop_duplicates()
if len(unique_times) > 1:
dt_datetime = unique_datetimes.iloc[1] - unique_datetimes.iloc[0]
dt_time = unique_times.iloc[1] - unique_times.iloc[0]
t0_timedelta = unique_times.iloc[0] * (dt_datetime / dt_time)
else:
t0_timedelta = viscid.as_timedelta64(1e6 * unique_times.iloc[0], 'us')
basetime = unique_datetimes.iloc[0] - t0_timedelta
else:
basetime = None
frame0 = frame[frame[time_col] == unique_times.iloc[0]]
except KeyError:
unique_times = np.array([0.0])
basetime = None
frame0 = frame
# discover crd_cols if not given
if crd_cols is None:
frame1 = frame0.drop([time_col, datetime_col], axis=1, errors='ignore')
if len(frame1) <= 1:
raise ValueError("With only 1 row, crd_cols must be specified.")
# assume coordinate columns come first: take every column up to and
# including the first one whose value changes between rows 0 and 1
# (the fastest-varying coordinate)
for icol in range(frame1.shape[1]):
diff = frame1.iloc[1, icol] - frame1.iloc[0, icol]
if diff != np.zeros((1,), dtype=diff.dtype):
break
crd_cols = frame1.columns[:icol + 1]
# discover field shape and make coordinates
crd_arrs = [frame[col].drop_duplicates() for col in crd_cols]
shape = [len(arr) for arr in crd_arrs]
crds = viscid.arrays2crds(crd_arrs, crd_names=crd_cols)
fld_names = list(frame.columns)
for _col in [time_col, datetime_col] + list(crd_cols):
if _col in fld_names:
fld_names.remove(_col)
# wrap everything up into grids
grids = []
for time in unique_times:
grid = Grid()
grid.time = time
grid.basetime = basetime
try:
frame1 = frame[frame[time_col] == time]
except KeyError:
frame1 = frame
for name in fld_names:
arr = frame1[name].values.reshape(shape)
fld = viscid.wrap_field(arr, crds, name=name, center='node')
grid.add_field(fld)
grids.append(grid)
if len(grids) > 1:
ret = DatasetTemporal()
for grid in grids:
ret.add(grid)
ret.basetime = basetime
else:
ret = grids[0]
return ret
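# Round-trip sketch (illustrative): a frame produced by to_dataframe() above
# can be turned back into fields; a single time step yields a Grid, several
# time steps yield a DatasetTemporal.
#
#     df = to_dataframe(fld)            # 'fld' as in the sketch above
#     grid_or_dset = from_dataframe(df)
#     rho = grid_or_dset['rho']         # should match the original field values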
class DeferredChild(object):
def __init__(self, callback, callback_args, callback_kwargs, parent=None,
name='NoName', time=0.0):
self.callback = callback
self.callback_args = callback_args if callback_args else ()
self.callback_kwargs = callback_kwargs if callback_kwargs else {}
self.parents = []
if parent is not None:
self.parents.append(parent)
self.name = name
self.time = time
def resolve(self):
ret = self.callback(*self.callback_args, **self.callback_kwargs)
if self.parents:
# this is a little kludgy, but at the moment, prepare_child
# is only used to add the parent to the list of a child's parents
self.parents[0].prepare_child(ret)
return ret
def clear_cache(self):
pass
def remove_all_items(self):
pass
def print_tree(self, depth=-1, prefix=""):
print('{0}{1}'.format(prefix, self))
class Dataset(tree.Node):
"""Datasets contain grids or other datasets
Note:
Datasets should probably be created using a vfile's
`_make_dataset` to make sure the info dict is propagated
appropriately
It is the programmer's responsibility to ensure that objects added to a
dataset have __getitem__ and get_fields methods; this is not
enforced
"""
children = None  # Bucket of children, or a list of (time, child) in DatasetTemporal
active_child = None
topology_info = None
geometry_info = None
crds = None
def __init__(self, *args, **kwargs):
"""info is for information that is shared for a whole
tree, from vfile all the way down to fields
"""
super(Dataset, self).__init__(**kwargs)
if self.children is None:
self.children = Bucket(ordered=True)
self.active_child = None
for arg in args:
self.add(arg)
def add(self, child, set_active=True):
self.prepare_child(child)
self.children[child.name] = child
if set_active:
self.active_child = child
def add_deferred(self, key, callback, callback_args=None,
callback_kwargs=None, set_active=True):
child = DeferredChild(callback, callback_args=callback_args,
callback_kwargs=callback_kwargs,
parent=self, name=key)
self.add(child, set_active=set_active)
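# Sketch of deferred children (illustrative; 'load_run' and 'run1.dat' are
# hypothetical): the callback is not invoked until the child is resolved,
# e.g. through get_child() or __getitem__.
#
#     def load_run(path):
#         ...  # build and return a Grid / Dataset for the file at 'path'
#
#     dset = Dataset()
#     dset.add_deferred('run1', load_run, callback_args=('run1.dat',))
#     child = dset.get_child('run1')    # triggers load_run('run1.dat')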
def _clear_cache(self):
for child in self.children:
child.clear_cache()
def clear_cache(self):
"""Clear all childrens' caches"""
self._clear_cache()
def remove_all_items(self):
for child in self.children:
self.tear_down_child(child)
child.remove_all_items()
self.children = Bucket(ordered=True)
def activate(self, child_handle):
""" it may not look like it, but this will recursively look
in my active child for the handle because it uses getitem """
self.active_child = self.children[child_handle]
def activate_time(self, time):
""" this is basically 'activate' except it specifically picks out
temporal datasets, and does all children, not just the active child """
for child in self.children:
try:
child.activate_time(time)
except AttributeError:
pass
def nr_times(self, sel=slice(None), val_endpoint=True, interior=False,
tdunit='s', tol=100):
for child in self.children:
try:
return child.nr_times(sel=sel, val_endpoint=val_endpoint,
interior=interior, tdunit=tdunit, tol=tol)
except AttributeError:
pass
raise RuntimeError("I find no temporal datasets")
def iter_times(self, sel=slice(None), val_endpoint=True, interior=False,
tdunit='s', tol=100, resolved=True):
for child in self.iter_resolved_children():
try:
return child.iter_times(sel=sel, val_endpoint=val_endpoint,
interior=interior, tdunit=tdunit, tol=tol,
resolved=resolved)
except AttributeError:
pass
raise RuntimeError("I find no temporal datasets")
def tslc_range(self, sel=slice(None), tdunit='s'):
"""Find endpoints for a time slice selection
Note:
If the selection is slice-by-location, the values are not
adjusted to the nearest frame. For this functionality,
you will want to use :py:func:`get_times` and pull out the
first and last values.
"""
for child in self.children:
try:
return child.tslc_range(sel=sel, tdunit=tdunit)
except AttributeError:
pass
raise RuntimeError("I find no temporal datasets")
def get_times(self, sel=slice(None), val_endpoint=True, interior=False,
tdunit='s', tol=100):
return list(self.iter_times(sel=sel, val_endpoint=val_endpoint,
interior=interior, tdunit=tdunit, tol=tol,
resolved=False))
def get_time(self, sel=slice(None), val_endpoint=True, interior=False,
tdunit='s', tol=100):
try:
return next(self.iter_times(sel=sel, val_endpoint=val_endpoint,
interior=interior, tdunit=tdunit, tol=tol))
except StopIteration:
raise RuntimeError("Dataset has no time slices")
def to_dataframe(self, fld_names=None, selection=Ellipsis,
time_sel=slice(None), time_col='time',
datetime_col='datetime'):
"""Consolidate grid's field data into pandas dataframe
Args:
fld_names (sequence, None): grab specific fields by name,
or None to grab all fields
selection (selection): optional spatial selection
time_sel (selection): optional time selection
Returns:
pandas.DataFrame
"""
# deferred import so that viscid does not depend on pandas
import pandas
frames = [child.to_dataframe(fld_names=fld_names, selection=selection,
time_sel=time_sel, time_col=time_col,
datetime_col=datetime_col)
for child in self.children]
frame = pandas.concat(frames, ignore_index=True, sort=False)
# make sure crds are all at the beginning, since concat can reorder them
col0 = list(frames[0].columns)
frame = frame[col0 + list(set(frame.columns) - set(col0))]
return frame
def iter_fields(self, time=None, fld_names=None):
""" generator for fields in the active dataset,
this will recurse down to a grid """
child = self.active_child
if child is None:
logger.error("Could not get appropriate child...")
return None
else:
return child.iter_fields(time=time, fld_names=fld_names)
def iter_field_items(self, time=None, fld_names=None):
""" generator for (name, field) in the active dataset,
this will recurse down to a grid """
child = self.active_child
if child is None:
logger.error("Could not get appropriate child...")
return None
else:
return child.iter_field_items(time=time, fld_names=fld_names)
def field_dict(self, time=None, fld_names=None, **kwargs):
""" fields as dict of {name: field} """
child = self.active_child
if child is None:
logger.error("Could not get appropriate child...")
return None
else:
return child.field_dict(time=time, fld_names=fld_names)
def print_tree(self, depth=-1, prefix=""):
if prefix == "":
print(self)
prefix += tree_prefix
for child in self.children:
suffix = ""
if child is self.active_child:
suffix = " <-- active"
print("{0}{1}{2}".format(prefix, child, suffix))
if depth != 0:
child.print_tree(depth=depth - 1, prefix=prefix + tree_prefix)
# def get_non_dataset(self):
# """ recurse down datasets until active_grid is not a subclass
# of Dataset """
# if isinstance(self.activate_grid, Dataset):
# return self.active_grid.get_non_dataset()
# else:
# return self.active_grid
def get_field(self, fldname, time=None, slc=Ellipsis):
""" recurse down active children to get a field """
child = self.active_child
if child is None:
logger.error("Could not get appropriate child...")
return None
else:
return child.get_field(fldname, time=time, slc=slc)
def get_grid(self, time=None):
""" recurse down active children to get a grid """
child = self.active_child
if child is None:
logger.error("Could not get appropriate child...")
return None
else:
return child.resolve().get_grid(time=time)
def get_child(self, item):
""" get a child from this Dataset, """
return self.children[item].resolve()
def __getitem__(self, item):
""" if a child exists with handle, return it, else ask
the active child if it knows what you want """
if item in self.children:
return self.get_child(item)
elif self.active_child is not None:
return self.active_child[item]
else:
raise KeyError()
def __delitem__(self, item):
# FIXME: is it possible to de-resolve item to a DeferredChild?
child = self.get_child(item)
child.clear_cache()
self.children.remove_item(child)
def __len__(self):
return self.children.__len__()
def __setitem__(self, name, child):
# um... is this kosher??
child.name = name
self.add(child)
def __contains__(self, item):
# FIXME: is it possible to de-resolve item to a DeferredChild?
if item in self.children:
return True
# FIXME: this might cause a bug somewhere someday
if self.active_child is not None and item in self.active_child:
return True
return False
def __enter__(self):
return self
def __exit__(self, exc_type, value, traceback):
self.clear_cache()
return None
def __iter__(self):
return self.iter_resolved_children()
# def __next__(self):
# raise NotImplementedError()
class DatasetTemporal(Dataset):
"""
Note:
Datasets should probably be created using a vfile's
`_make_dataset` to make sure the info dict is propagated
appropriately
"""
_last_ind = 0
# _all_times = None
def __init__(self, *args, **kwargs):
# ok, i want more control over my children than a Bucket can give
# FIXME: this is kind of not cool since children is a public
# attribute yet it's a different type here
self.children = []
super(DatasetTemporal, self).__init__(*args, **kwargs)
def add(self, child, set_active=True):
if child is None:
raise RuntimeError("Cannot add a None child to a DatasetTemporal")
if child.time is None:
child.time = 0.0
logger.error("A child with no time? Something is strange...")
# this keeps the children in time order
self.prepare_child(child)
self.children.append((child.time, child))
self.children.sort(key=itemgetter(0))
# binary insertion sort... maybe more efficient?
# bisect.insort(self.children, (child.time, child))
if set_active:
self.active_child = child
def add_deferred(self, time, callback, callback_args=None,
callback_kwargs=None, set_active=True):
child = DeferredChild(callback, callback_args=callback_args,
callback_kwargs=callback_kwargs,
parent=self, time=time)
self.add(child, set_active=set_active)
def remove_all_items(self):
for child in self.children:
self.tear_down_child(child[1])
child[1].remove_all_items()
self.children = []
def clear_cache(self):
"""Clear all childrens' caches"""
for child in self.children:
child[1].clear_cache()
def activate(self, time):
self.active_child = self.get_child(time)
def activate_time(self, time):
""" this is basically 'activate' except it specifically picks out
temporal datasets """
self.activate(time)
#############################################################################
## here begins a slew of functions that make specifying a time / time slice
## super general
def _slice_time(self, sel=slice(None), val_endpoint=True, interior=False,
tdunit='s', tol=100):
"""
Args:
sel (str, slice, list): can be a single string containing
slices, or a slice, int, float, datetime object, or a
list of any of the above.
Returns:
list of slices (containing integers only) or ints
"""
times = np.array([child[0] for child in self.children])
try:
basetime = self.basetime
except viscid.NoBasetimeError:
basetime = None
std_sel = standardize_sel(sel)
idx_sel = std_sel2index(std_sel, times, val_endpoint=val_endpoint,
interior=interior, tdunit=tdunit, epoch=basetime)
return idx_sel
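# Illustrative time selections accepted by _slice_time() (the exact string
# syntax is defined by viscid.sliceutil; values here are made up):
#
#     dset._slice_time(0)                  # index of a single time step
#     dset._slice_time(slice(0, 10, 2))    # slice over time-step indices
#     dset._slice_time('10.0f')            # trailing 'f' -> slice by time value
#     dset._slice_time(np.datetime64('2010-01-01'))  # datetime; needs a basetime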
def _time_slice_to_iterator(self, slc):
"""
Args:
slc: a slice (containing ints only) or an int, or a list
of any of the above
Returns:
a flat iterator of self.children of all the slices chained
"""
inds = np.arange(len(self.children))[slc]
if not isinstance(inds, np.ndarray):
inds = np.asarray(inds).reshape(-1)
return (self.children[i] for i in inds)
def nr_times(self, sel=slice(None), val_endpoint=True, interior=False,
tdunit='s', tol=100):
slc = self._slice_time(sel=sel, val_endpoint=val_endpoint,
interior=interior, tdunit=tdunit, tol=tol)
child_iterator = self._time_slice_to_iterator(slc)
return len(list(child_iterator))
def iter_times(self, sel=slice(None), val_endpoint=True, interior=False,
tdunit='s', tol=100, resolved=True):
slc = self._slice_time(sel=sel, val_endpoint=val_endpoint,
interior=interior, tdunit=tdunit, tol=tol)
child_iterator = self._time_slice_to_iterator(slc)
for child in child_iterator:
# FIXME: this isn't general, but so far the only files we've
# read have contained only one Grid / AMRGrid. Without get_grid()
# here, the context manager will unload the file when done, but
# that's not what we wanted here, we wanted to just clear caches
if resolved:
what = child[1].resolve().get_grid()
else:
what = child[1]
with what as target:
yield target
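# Typical consumption of iter_times() (illustrative; 'rho' is a hypothetical
# field name): each iteration yields a resolved grid wrapped in a context
# manager so that per-step caches are cleared when the block exits.
#
#     for grid in dset.iter_times(slice(None, None, 2)):   # every other step
#         rho = grid['rho']
#         ...  # work with rho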
def get_times(self, sel=slice(None), val_endpoint=True, interior=False,
tdunit='s', tol=100):
return list(self.iter_times(sel=sel, val_endpoint=val_endpoint,
interior=interior, tdunit=tdunit, tol=tol,
resolved=False))
def get_time(self, sel=slice(None), val_endpoint=True, interior=False,
tdunit='s', tol=100):
return self.get_times(sel=sel, val_endpoint=val_endpoint,
interior=interior, tdunit=tdunit, tol=tol)[0]
def tslc_range(self, sel=slice(None), tdunit='s'):
"""Find endpoints for a time slice selection
Note:
If the selection is slice-by-location, the values are not
adjusted to the nearest frame. For this functionality,
you will want to use :py:func:`get_times` and pull out the
first and last values.
"""
times = np.array([child[0] for child in self.children])
try:
basetime = self.basetime
except viscid.NoBasetimeError:
basetime = None
return selection2values(times, sel, epoch=basetime, tdunit=tdunit)
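# Illustrative use of tslc_range() (values and string syntax are made up):
# it reports the requested endpoints as values, without snapping them to
# stored time steps; use get_times() for the actual frame times in the range.
#
#     t_start, t_stop = dset.tslc_range('5.0f:15.0f')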
## ok, that's enough for the time stuff
########################################
def to_dataframe(self, fld_names=None, selection=Ellipsis,
time_sel=slice(None), time_col='time',
datetime_col='datetime'):
"""Consolidate grid's field data into pandas dataframe
Args:
fld_names (sequence, None): grab specific fields by name,
or None to grab all fields
selection (selection): optional spatial selection
time_sel (selection): optional time selection
Returns:
pandas.DataFrame
"""
# deferred import so that viscid does not depend on pandas
import pandas
frames = [child.to_dataframe(fld_names=fld_names, selection=selection,
time_sel=time_sel, time_col=time_col,
datetime_col=datetime_col)
for child in self.iter_times(sel=time_sel)]
frame = pandas.concat(frames, ignore_index=True, sort=False)
# make sure crds are all at the beginning, since concat can reorder them
col0 = list(frames[0].columns)
frame = frame[col0 + list(set(frame.columns) - set(col0))]
return frame
def iter_fields(self, time=None, fld_names=None):
""" generator for fields in the active dataset,
this will recurse down to a grid """
if time is not None:
child = self.get_child(time)
else:
child = self.active_child
if child is None:
logger.error("Could not get appropriate child...")
return None
else:
return child.iter_fields(time=time, fld_names=fld_names)
def iter_field_items(self, time=None, fld_names=None):
""" generator for (name, field) in the active dataset,
this will recurse down to a grid """
if time is not None:
child = self.get_child(time)
else:
child = self.active_child
if child is None:
logger.error("Could not get appropriate child...")
return None
else:
return child.iter_field_items(time=time, fld_names=fld_names)
def field_dict(self, time=None, fld_names=None):
""" fields as dict of {name: field} """
if time is not None:
child = self.get_child(time)
else:
child = self.active_child
if child is None:
logger.error("Could not get appropriate child...")
return None
else:
return child.field_dict(fld_names=fld_names)
def print_tree(self, depth=-1, prefix=""):
if prefix == "":
print(self)
prefix += tree_prefix
for child in self.children:
suffix = ""
if child[1] is self.active_child:
suffix = " <-- active"
print("{0}{1} (t={2}){3}".format(prefix, child, child[0], suffix))
if depth != 0:
child[1].print_tree(depth=depth - 1, prefix=prefix + tree_prefix)
def get_field(self, fldname, time=None, slc=Ellipsis):
""" recurse down active children to get a field """
if time is not None:
child = self.get_child(time)
else:
child = self.active_child
if child is None:
logger.error("Could not get appropriate child...")
return None
else:
return child.get_field(fldname, time=time, slc=slc)
def get_grid(self, time=None):
""" recurse down active children to get a grid """
if time is not None:
child = self.get_child(time)
elif self.active_child is not None:
child = self.active_child.resolve()
else:
child = None
if child is None:
logger.error("Could not get appropriate child...")
return None
else:
return child.get_grid(time=time)
def get_child(self, item):
""" if item is an int and < len(children), it is an index in a list,
else I will find the cloest time to float(item) """
# print(">> get_child:", item)
# print(">> slice is:", self._slice_time(item))
# always just return the first slice's child... is this wrong?
child = self.children[self._slice_time(item)][1].resolve()
return child
def __contains__(self, item):
if isinstance(item, int) and 0 <= item < len(self.children):
return True
if isinstance(item, string_types) and item[-1] == 'f':
try:
val = float(item[:-1])
if val >= self.children[0][0] and val <= self.children[-1][0]:
return True
else:
return False
except ValueError:
pass
if self.active_child is None:
return False
return item in self.active_child
def iter_resolved_children(self):
return (child[1].resolve() for child in self.children)