Skip to content

Commit

Permalink
simpler column reindexing in DataFrame and DataMatrix, some cleanup f…
Browse files Browse the repository at this point in the history
…or release

git-svn-id: http://pandas.googlecode.com/svn/trunk@96 d5231056-7de3-11de-ac95-d976489f1ece
wesm committed Dec 25, 2009
1 parent b55f3a1 commit 77ad265
Showing 8 changed files with 250 additions and 218 deletions.
2 changes: 1 addition & 1 deletion pandas/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# pylint: disable-msg=W0614,W0401,W0611
# pylint: disable-msg=W0614,W0401,W0611,W0622

__docformat__ = 'restructuredtext'

170 changes: 100 additions & 70 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
@@ -42,9 +42,11 @@ class DataFrame(Picklable, Groupable):
----------
data : dict
Mapping of column name --> array or Series/TimeSeries objects
index : array-like
Specific index to use for the Frame, Series will be conformed to this
if you provide it.
index : array-like, optional
Specific index to use for the Frame, Series will be conformed
to this if you provide it. If not input, index will be
inferred from input Series
columns : array-like, optional
Notes
-----
@@ -56,12 +58,12 @@ class DataFrame(Picklable, Groupable):
--------
DataMatrix: more efficient version of DataFrame for most operations
Example usage
-------------
Example
-------
>>> d = {'col1' : ts1, 'col2' : ts2}
>>> df = DataFrame(data=d, index=someIndex)
"""
def __init__(self, data=None, index=None):
def __init__(self, data=None, index=None, columns=None):
self._series = {}
if data is not None and len(data) > 0:
if index is None:
@@ -75,7 +77,7 @@ def __init__(self, data=None, index=None):

for k, v in data.iteritems():
if isinstance(v, Series):
# Forces homogoneity and copies data
# Forces homogeneity and copies data
self._series[k] = v.reindex(self.index)
else:
# Copies data and checks length
@@ -169,8 +171,8 @@ def fromDict(cls, inputDict=None, castFloat=True, **kwds):

def toDict(self):
"""
Simpler pseudo-inverse operation of dictToDataFrame, NaN values will be
included in the resulting dict-tree.
Simpler pseudo-inverse operation of DataFrame.fromDict, NaN
values will be included in the resulting dict-tree.
Return
------
@@ -316,9 +318,9 @@ def __setitem__(self, key, value):

def __delitem__(self, key):
"""
Delete column from DataFrame (only deletes the reference)
Delete column from DataFrame
"""
self._series.pop(key, None)
del self._series[key]

def pop(self, item):
"""
@@ -611,16 +613,16 @@ def append(self, otherFrame):

def asfreq(self, freq, fillMethod=None):
"""
Convert all TimeSeries inside to specified frequency using DateOffset
objects. Optionally provide fill method to pad/backfill/interpolate
missing values.
Convert all TimeSeries inside to specified frequency using
DateOffset objects. Optionally provide fill method to pad or
backfill missing values.
Parameters
----------
freq : DateOffset object, or string in {'WEEKDAY', 'EOM'}
DateOffset object or subclass (e.g. monthEnd)
fillMethod : {'backfill', 'pad', 'interpolate', None}
fillMethod : {'backfill', 'pad', None}
Method to use for filling holes in new index
"""
if isinstance(freq, datetools.DateOffset):
@@ -886,38 +888,53 @@ def pivot(self, index=None, columns=None, values=None):

return _slow_pivot(self[index], self[columns], self[values])

def reindex(self, newIndex, fillMethod=None):
def reindex(self, index=None, columns=None, fillMethod=None):
"""
Reindex data inside, optionally filling according to some rule.
Parameters
----------
newIndex : array-like
index : array-like, optional
preferably an Index object (to avoid duplicating data)
fillMethod : {'backfill', 'pad', 'interpolate', None}
Method to use for filling holes in reindexed DataFrame
columns : array-like, optional
fillMethod : {'backfill', 'pad', None}
Method to use for filling data holes using the index
Returns
-------
y : same type as calling instance
"""
if self.index.equals(newIndex):
fillMethod = fillMethod.upper() if fillMethod else ''

if fillMethod not in ['BACKFILL', 'PAD', '']:
raise Exception("Don't recognize fillMethod: %s" % fillMethod)

frame = self

if index is not None:
frame = frame._reindex_index(index, fillMethod)

if columns is not None:
frame = frame._reindex_columns(columns)

return frame

def _reindex_index(self, index, method):
if self.index.equals(index):
return self.copy()

if len(newIndex) == 0:
if len(index) == 0:
return DataFrame(index=NULL_INDEX)

if not isinstance(newIndex, Index):
newIndex = Index(newIndex)
if not isinstance(index, Index):
index = Index(index)

if len(self.index) == 0:
return DataFrame(index=newIndex)
return DataFrame(index=index)

oldMap = self.index.indexMap
newMap = newIndex.indexMap

fillMethod = fillMethod.upper() if fillMethod else ''
if fillMethod not in ['BACKFILL', 'PAD', '']:
raise Exception("Don't recognize fillMethod: %s" % fillMethod)

fillVec, mask = tseries.getFillVec(self.index, newIndex, oldMap,
newMap, fillMethod)
fillVec, mask = tseries.getFillVec(self.index, index,
self.index.indexMap,
index.indexMap, method)

# Maybe this is a bit much? Wish I had unit tests...
typeHierarchy = [
@@ -938,14 +955,26 @@ def reindex(self, newIndex, fillMethod=None):
newSeries = {}
for col, series in self.iteritems():
series = series.view(np.ndarray)
for type, dest in typeHierarchy:
if issubclass(series.dtype.type, type):
for klass, dest in typeHierarchy:
if issubclass(series.dtype.type, klass):
new = series.take(fillVec).astype(dest)
new[-mask] = missingValue[dest]
newSeries[col] = new
break

return DataFrame(newSeries, index=newIndex)
return DataFrame(newSeries, index=index)

def _reindex_columns(self, columns):
if len(columns) == 0:
return DataFrame(index=self.index)

newFrame = self.filterItems(columns)

for col in columns:
if col not in newFrame:
newFrame[col] = NaN

return newFrame

@property
def T(self):
@@ -1000,7 +1029,7 @@ def shift(self, periods, offset=None, timeRule=None):
for col, series in self.iteritems()])
return DataFrame(data = newValues, index= newIndex)

def apply(self, func):
def apply(self, func, axis=0):
"""
Applies func to columns (Series) of this DataFrame and returns either
a DataFrame (if the function produces another series) or a Series
@@ -1011,6 +1040,7 @@ def apply(self, func):
----------
func : function
Function to apply to each column
axis : {0, 1}
0 applies func to each column, 1 applies func to each row
(by operating on the transposed frame)
Example
-------
@@ -1019,30 +1049,28 @@ def apply(self, func):
Note
----
Do NOT use functions that might toy with the index.
Functions altering the index are not supported (yet)
"""
if not len(self.cols()):
return self

results = {}
for col, series in self.iteritems():
result = func(series)
results[col] = result
if axis == 0:
target = self
elif axis == 1:
target = self.T

results = dict([(k, func(target[k])) for k in target.columns])

if hasattr(results.values()[0], '__iter__'):
return DataFrame(data=results, index=self.index)
else:
keyArray = np.asarray(sorted(set(results.keys())), dtype=object)
newIndex = Index(keyArray)

arr = np.array([results[idx] for idx in newIndex])
return Series(arr, index=newIndex)
return Series.fromDict(results)

def tapply(self, func):
"""
Apply func to the transposed DataFrame, results as per apply
"""
return self.T.apply(func)
return self.apply(func, axis=1)

def applymap(self, func):
"""
@@ -1323,8 +1351,8 @@ def plot(self, kind='line', **kwds):
Plot the DataFrame's series with the index on the x-axis using
matplotlib / pylab.
Params
------
Parameters
----------
kind : {'line', 'bar', 'hist'}
Default: line for TimeSeries, hist for Series
@@ -1414,10 +1442,7 @@ def sum(self, axis=0, asarray=False):
theCount = self.count(axis)
theSum[theCount == 0] = NaN
except Exception:
if axis == 0:
theSum = self.apply(np.sum)
else:
theSum = self.tapply(np.sum)
theSum = self.apply(np.sum, axis=axis)

if asarray:
return theSum
@@ -1428,6 +1453,27 @@ def sum(self, axis=0, asarray=False):
else:
raise Exception('Must have 0<= axis <= 1')

def cumsum(self, axis=0):
"""
Return cumulative sum over requested axis as DataFrame
Parameters
----------
axis : {0, 1}
0 to accumulate down each column, 1 to accumulate across each row
Returns
-------
y : DataFrame
"""
def get_cumsum(y):
y = np.array(y)
if not issubclass(y.dtype.type, np.int_):
y[np.isnan(y)] = 0
return y.cumsum()

return self.apply(get_cumsum, axis=axis)

def product(self, axis=0, asarray=False):
"""
Return array or Series of products over requested axis.
@@ -1664,22 +1710,6 @@ def skew(self, axis=0, asarray=False):
else:
raise Exception('Must have 0<= axis <= 1')

def _withColumns(self, newCols):
"""
Utility method, force values matrix to have particular columns
Can make this as cute as we like
"""
if len(newCols) == 0:
return DataFrame(index=self.index)

newFrame = self.filterItems(newCols)

for col in newCols:
if col not in newFrame:
newFrame[col] = NaN

return newFrame

def _pfixed(s, space, nanRep=None):
if isinstance(s, float):
fstring = '%-' + str(space-4) + 'g'
Loading

0 comments on commit 77ad265

Please sign in to comment.