Dealing with Pandas's nullable float dtypes
Pandas added support for nullable float32 and float64 datatypes in the past few years (Float32 and Float64, respectively) [1], but there are a lot of footguns, so it feels kind of bolted on. One example is that while arrays with this dtype can hold both np.nan and pd.NA (aka None) as values, Series.isna() only catches the latter, not the former. This is a known bug, but there's no fix yet, and it might take until Pandas 3.0 before one arrives.
You can jump straight to the solution.
Here's an example provided by @cmillani:
import numpy as np
import pandas as pd
df = pd.DataFrame({'a': [5, 0], 'b': [np.nan, 12]})
df = df.astype('float64')
df['c'] = df['a'] * np.inf
df.isna()
a b c
0 False True False
1 False False True
Note that casting to Float64 does change np.nans to pd.NA:
df = df.astype('Float64')
df
a b c
0 5.0 <NA> inf
1 0.0 12.0 <NA>
However, now df.isna() doesn't catch newly introduced np.nans:
df['c'] = df['a'] * np.inf
df.isna()
a b c
0 False True False
1 False False False
Fix
In the meantime, you can fix it for yourself using the following monkey patch:
import numpy as np
from pandas.core.arrays.floating import FloatingArray
# Keep a handle on pandas' own implementation. The guard makes the patch
# idempotent: re-running this cell must not overwrite `oldisna` with the
# already-patched function.
if not hasattr(FloatingArray, "oldisna"):
    FloatingArray.oldisna = FloatingArray.isna


def newisna(self: FloatingArray) -> np.ndarray:
    """Return a boolean array that flags both masked entries and NaNs.

    An element is reported as missing when its entry in ``self._mask`` is
    set or when the underlying value in ``self._data`` is ``np.nan``.

    Returns:
        A newly allocated boolean ``np.ndarray``; ``self._mask`` itself is
        not modified.
    """
    # `|` already allocates a fresh array, so no explicit copy is required.
    return self._mask | np.isnan(self._data)


FloatingArray.isna = newisna
df.isna()
a b c
0 False True False
1 False False True
You'll probably want to patch fillna, too:
from pandas.core.arrays.masked import BaseMaskedArrayT, validate_fillna_kwargs, is_array_like, missing
# Keep a handle on pandas' own implementation. The guard makes the patch
# idempotent: re-running this cell must not overwrite `oldfillna` with the
# already-patched function.
if not hasattr(FloatingArray, "oldfillna"):
    FloatingArray.oldfillna = FloatingArray.fillna


def newfillna(
    self: "BaseMaskedArrayT", value=None, method=None, limit=None
) -> "BaseMaskedArrayT":
    """Fill missing entries, treating ``np.nan`` in the data as missing too.

    An element counts as missing when its entry in ``self._mask`` is set or
    when the underlying value in ``self._data`` is ``np.nan``.

    Args:
        value: Scalar or array-like replacement. An array-like must have the
            same length as ``self``; only its entries at missing positions
            are used.
        method: Fill method, validated by ``validate_fillna_kwargs`` and
            resolved through ``missing.get_fill_func``.
        limit: Forwarded to the fill function when ``method`` is given.

    Returns:
        A new array of the same type as ``self``; ``self`` is not modified.

    Raises:
        ValueError: If ``value`` is array-like and its length differs from
            ``len(self)``.
    """
    # Annotations are forward-reference strings so that defining this
    # function does not need the TypeVar to exist at runtime.
    value, method = validate_fillna_kwargs(value, method)

    # `|` allocates a fresh array, so this mask can be mutated freely below
    # without an additional copy.
    mask = self._mask | np.isnan(self._data)

    if is_array_like(value):
        if len(value) != len(self):
            raise ValueError(
                f"Length of 'value' does not match. Got ({len(value)}) "
                f" expected {len(self)}"
            )
        value = value[mask]

    if not mask.any():
        # Nothing to fill: hand back an independent copy.
        return self.copy()

    if method is not None:
        fill = missing.get_fill_func(method, ndim=self.ndim)
        npvalues = self._data.copy().T
        new_mask = mask.T
        # The fill function updates both `npvalues` and `new_mask` in place.
        fill(npvalues, limit=limit, mask=new_mask)
        return type(self)(npvalues.T, new_mask.T)

    # Fill with value.
    new_values = self.copy()
    new_values[mask] = value
    return new_values


FloatingArray.fillna = newfillna
There might be more efficient ways to do this (though, as far as I can tell, fillna makes an extra copy no matter what).