# Source code for static_frame.core.store_filter

from __future__ import annotations

import numpy as np
import typing_extensions as tp

from static_frame.core.interface_meta import InterfaceMeta
from static_frame.core.util import COMPLEX_TYPES
from static_frame.core.util import DT64_MONTH
from static_frame.core.util import DT64_YEAR
from static_frame.core.util import DTYPE_BOOL
from static_frame.core.util import DTYPE_COMPLEX_KIND
from static_frame.core.util import DTYPE_FLOAT_KIND
from static_frame.core.util import DTYPE_INEXACT_KINDS
from static_frame.core.util import DTYPE_INT_KINDS
from static_frame.core.util import DTYPE_NAT_KINDS
from static_frame.core.util import DTYPE_OBJECT
from static_frame.core.util import DTYPE_OBJECT_KIND
from static_frame.core.util import DTYPE_STR_KINDS
from static_frame.core.util import EMPTY_SET
from static_frame.core.util import FLOAT_TYPES
from static_frame.core.util import NAT
from static_frame.core.util import NAT_STR
from static_frame.core.util import frozenset_filter

if tp.TYPE_CHECKING:
    TNDArrayAny = np.ndarray[tp.Any, tp.Any] #pragma: no cover
    # TDtypeAny = np.dtype[tp.Any] #pragma: no cover


# [docs]
class StoreFilter(metaclass=InterfaceMeta):
    '''
    Utility for defining and applying translation of values going to and from a data store, as needed for XLSX and other writers.

    The ``from_*`` attributes hold the string written in place of a special in-memory value (NaN, NaT, None, positive infinity, negative infinity). The ``to_*`` attributes hold the sets of strings that are decoded back into those values. The ``value_format_*`` attributes are optional format strings applied to float and complex values when encoding.
    '''

    __slots__ = (
            'from_nan',
            'from_nat',
            'from_none',
            'from_posinf',
            'from_neginf',
            'to_nan',
            'to_nat',
            'to_none',
            'to_posinf',
            'to_neginf',

            '_FLOAT_FUNC_TO_FROM',
            '_EQUAL_FUNC_TO_FROM',
            '_TYPE_TO_TO_SET',
            '_TYPE_TO_TO_TUPLE',

            'value_format_float_scientific',
            'value_format_float_positional',
            'value_format_complex_scientific',
            'value_format_complex_positional',

            '_value_format_active',
            )

    # from type to string (encoding into the data store)
    from_nan: tp.Optional[str]
    from_nat: tp.Optional[str]
    from_none: tp.Optional[str]
    from_posinf: tp.Optional[str]
    from_neginf: tp.Optional[str]

    # from string to type (decoding from the data store)
    to_nan: tp.FrozenSet[str]
    to_nat: tp.FrozenSet[str]
    to_none: tp.FrozenSet[str]
    to_posinf: tp.FrozenSet[str]
    to_neginf: tp.FrozenSet[str]

    # formatting for inexact types, from type to string
    value_format_float_scientific: tp.Optional[str]
    value_format_float_positional: tp.Optional[str]
    value_format_complex_scientific: tp.Optional[str]
    value_format_complex_positional: tp.Optional[str]

    # True when at least one of the value_format_* attributes is not None
    _value_format_active: bool

    # reference collections defined with values given above; cannot use TCallableAny here
    # pairs of (detection function, replacement string) used when encoding float and complex values
    _FLOAT_FUNC_TO_FROM: tp.Tuple[tp.Tuple[tp.Any, tp.Optional[str]], ...]
    # pairs of (detection function, replacement string) used when encoding object arrays
    _EQUAL_FUNC_TO_FROM: tp.Tuple[tp.Tuple[tp.Any, tp.Optional[str]], ...]
    # pairs of (decoded value, matching strings) used when decoding single elements
    _TYPE_TO_TO_SET: tp.Tuple[tp.Tuple[tp.Any, tp.FrozenSet[str]], ...]
    # the same pairs with the matching strings as tuples, used when decoding arrays
    _TYPE_TO_TO_TUPLE: tp.Tuple[tp.Tuple[tp.Any, tp.Tuple[str, ...]], ...]


# [docs]
    def __init__(self, *,
            # from type to str
            from_nan: tp.Optional[str] = '',
            from_nat: tp.Optional[str] = '',
            from_none: tp.Optional[str] = 'None',
            from_posinf: tp.Optional[str] = 'inf',
            from_neginf: tp.Optional[str] = '-inf',
            # str to type
            to_nan: tp.Collection[str] = frozenset(('', 'nan', 'NaN', 'NAN', 'NULL', '#N/A')),
            to_nat: tp.Collection[str] = frozenset(()), # do not assume there are NaTs.
            to_none: tp.Collection[str] = frozenset(('None',)),
            to_posinf: tp.Collection[str] = frozenset(('inf',)),
            to_neginf: tp.Collection[str] = frozenset(('-inf',)),
            # from float to str
            value_format_float_positional: tp.Optional[str] = None,
            value_format_float_scientific: tp.Optional[str] = None,
            value_format_complex_positional: tp.Optional[str] = None,
            value_format_complex_scientific: tp.Optional[str] = None,
            ) -> None:
        '''
        Args:
            from_nan: string stored in place of NaN; None skips this replacement.
            from_nat: string stored in place of NaT.
            from_none: string stored in place of None; None skips this replacement.
            from_posinf: string stored in place of positive infinity; None skips this replacement.
            from_neginf: string stored in place of negative infinity; None skips this replacement.
            to_nan: strings decoded to NaN; an empty collection matches nothing.
            to_nat: strings decoded to NaT; an empty collection matches nothing.
            to_none: strings decoded to None; an empty collection matches nothing.
            to_posinf: strings decoded to positive infinity; an empty collection matches nothing.
            to_neginf: strings decoded to negative infinity; an empty collection matches nothing.
            value_format_float_positional: format string (applied with ``str.format``) for float values not handled by the scientific format.
            value_format_float_scientific: format string (applied with ``str.format``) for float values whose ``str()`` contains an ``e``.
            value_format_complex_positional: as ``value_format_float_positional``, for complex values.
            value_format_complex_scientific: as ``value_format_float_scientific``, for complex values.
        '''
        # encodings: in-memory value to the string written to the store
        self.from_nan = from_nan
        self.from_nat = from_nat
        self.from_none = from_none
        self.from_posinf = from_posinf
        self.from_neginf = from_neginf

        # decodings: strings read from the store to in-memory values
        self.to_nan = frozenset_filter(to_nan)
        self.to_nat = frozenset_filter(to_nat)
        self.to_none = frozenset_filter(to_none)
        self.to_posinf = frozenset_filter(to_posinf)
        self.to_neginf = frozenset_filter(to_neginf)

        self.value_format_float_positional = value_format_float_positional
        self.value_format_float_scientific = value_format_float_scientific
        self.value_format_complex_positional = value_format_complex_positional
        self.value_format_complex_scientific = value_format_complex_scientific

        # pre-compute whether any formatting is requested, so encoding can skip it cheaply
        self._value_format_active = any(
                value_format is not None for value_format in (
                        value_format_float_positional,
                        value_format_float_scientific,
                        value_format_complex_positional,
                        value_format_complex_scientific,
                        )
                )

        # assumed faster to define these per instance than at the class level
        # None has to be handled separately
        self._FLOAT_FUNC_TO_FROM = (
                (np.isnan, from_nan),
                (np.isposinf, from_posinf),
                (np.isneginf, from_neginf),
                )

        # for object array processing
        self._EQUAL_FUNC_TO_FROM = (
                # NOTE: this using the same heuristic as util.isna_array, which may not be the best choice for non-standard objects
                (lambda a: np.not_equal(a, a), from_nan),
                (lambda a: np.equal(a, None), from_none), # type: ignore
                (lambda a: np.equal(a, np.inf), from_posinf),
                (lambda a: np.equal(a, -np.inf), from_neginf),
                )

        #-----------------------------------------------------------------------
        # these are used for converting from strings to types
        self._TYPE_TO_TO_SET = (
                (np.nan, self.to_nan),
                (NAT, self.to_nat),
                (None, self.to_none),
                (np.inf, self.to_posinf),
                (-np.inf, self.to_neginf),
                )

        # for using isin, cannot use a set, so pre-convert to tuples here
        self._TYPE_TO_TO_TUPLE = tuple(
                (decoded, tuple(strings))
                for decoded, strings in self._TYPE_TO_TO_SET
                )


    # --------------------------------------------------------------------------
    # converting from types (in memory) to data store

    def _format_inexact_element(self,
            value: tp.Any,
            kind: str,
            ) -> tp.Any:
        '''
        Apply the configured float or complex format string to a single value.

        Must let non-inexact types pass, as object arrays will have mixed types.

        Args:
            value: the element to format.
            kind: dtype kind of the array the element came from. For the object kind the element's own type decides whether it is treated as float or complex.

        Returns:
            The formatted string if a matching format string is configured; otherwise ``value`` unchanged. Values that are neither float nor complex, and values given with any other ``kind``, are returned unchanged.
        '''
        if kind == DTYPE_OBJECT_KIND:
            if isinstance(value, FLOAT_TYPES):
                is_complex = False
            elif isinstance(value, COMPLEX_TYPES):
                is_complex = True
            else:
                return value # not an inexact type: pass through
        elif kind == DTYPE_FLOAT_KIND:
            is_complex = False
        elif kind == DTYPE_COMPLEX_KIND:
            is_complex = True
        else:
            # a kind with no inexact values has nothing to format; pass through rather than fail on unbound flags
            return value

        if is_complex:
            format_scientific = self.value_format_complex_scientific
            format_positional = self.value_format_complex_positional
        else:
            format_scientific = self.value_format_float_scientific
            format_positional = self.value_format_float_positional

        # NOTE: similar move in Display.to_cell
        # must call built-in str() to get native realization per element
        if format_scientific is not None and 'e' in str(value):
            return format_scientific.format(value)
        if format_positional is not None:
            return format_positional.format(value)
        return value

    def _format_inexact_array(self,
            array: TNDArrayAny,
            array_object: tp.Optional[TNDArrayAny],
            ) -> TNDArrayAny:
        '''
        Apply ``_format_inexact_element`` to every element of ``array``, writing the results into a destination array.

        Args:
            array: source of the values to format.
            array_object: if we have already created an object array, use it as destination, mutating values in-place. ``array`` and ``array_object`` can be the same array. If None, a copy of ``array`` is created as the destination (converted with ``DTYPE_OBJECT`` when ``array`` is not already of object kind).

        Returns:
            The destination array.
        '''
        # NOTE: assume only called on object or inexact dtypes, and when at least one of the value_format attributes is non-None
        kind = array.dtype.kind

        destination = array_object
        if destination is None:
            destination = (
                    array.copy() if kind == DTYPE_OBJECT_KIND
                    else array.astype(DTYPE_OBJECT)
                    )

        format_element = self._format_inexact_element
        for position, element in np.ndenumerate(array):
            destination[position] = format_element(element, kind)
        return destination


# [docs]
    def from_type_filter_array(self,
            array: TNDArrayAny
            ) -> TNDArrayAny:
        '''Given an array, replace special values (NaN, None, infinities, NaT) with their configured string encodings.

        Args:
            array: the array to encode for writing to a data store.

        Returns:
            ``array`` itself when nothing needs to change; otherwise a new array. The argument is never mutated. Integer, string, and Boolean arrays are returned as is. Float, complex, and object arrays are converted with ``DTYPE_OBJECT`` when a replacement or value format applies. Arrays of the NaT kinds are always converted: year and month arrays to strings, all others with ``DTYPE_OBJECT``.

        Raises:
            NotImplementedError: if the dtype is not one of the kinds handled above.
        '''
        dtype = array.dtype
        kind = dtype.kind

        if kind in DTYPE_INT_KINDS or kind in DTYPE_STR_KINDS or dtype == DTYPE_BOOL:
            return array # no replacements possible

        kind_is_complex = kind == DTYPE_COMPLEX_KIND
        kind_is_object = kind == DTYPE_OBJECT_KIND

        if kind in DTYPE_INEXACT_KINDS or kind_is_object:
            func_value_replace_pairs = (
                    self._EQUAL_FUNC_TO_FROM if kind_is_object
                    else self._FLOAT_FUNC_TO_FROM)

            post = None # defer creating until we have a match
            for func, value_replace in func_value_replace_pairs:
                if value_replace is None:
                    continue # this replacement is disabled
                # cannot use these ufuncs on complex array
                if kind_is_complex and (func == np.isposinf or func == np.isneginf):
                    continue
                found = func(array)
                if found.any():
                    if post is None:
                        # need to store string replacements in object type
                        # astype always returns a copy by default
                        post = array.astype(DTYPE_OBJECT)
                    post[found] = value_replace

            array_final = post if post is not None else array
            if self._value_format_active:
                # when post exists it is both source and destination, so replacements made above are kept
                return self._format_inexact_array(array_final, post)
            return array_final

        if kind in DTYPE_NAT_KINDS:
            # find NaT on the original array, before any conversion
            is_nat = np.isnat(array)
            if dtype == DT64_YEAR or dtype == DT64_MONTH:
                post = array.astype(str) # nat will go to 'NaT'
            else:
                # we always force datetime64 to object, as most formats (i.e., XLSX) are not prepared to write them
                post = array.astype(DTYPE_OBJECT)

            # as with the other from_* values, None means no replacement; assigning None into a string array would store the text 'None'
            # NOTE(review): year/month results are fixed-width string arrays, so a from_nat longer than the item width would be truncated on assignment; confirm this is acceptable
            if self.from_nat is not None and is_nat.any():
                post[is_nat] = self.from_nat
            return post

        raise NotImplementedError(f'no handling for dtype {dtype}') #pragma: no cover



# [docs]
    def from_type_filter_element(self,
            value: tp.Any
            ) -> tp.Any:
        '''
        Encode a single value for the data store.

        None, NaN, infinities, and NaT are replaced by their configured ``from_*`` strings; year and month ``datetime64`` values are converted to strings; float and complex values are formatted when a value format is configured. Any other value is returned unchanged.
        '''
        # apply to all types
        if value is None and self.from_none is not None:
            return self.from_none

        is_float = isinstance(value, FLOAT_TYPES)
        is_complex = not is_float and isinstance(value, COMPLEX_TYPES)

        if is_float or is_complex:
            for detect, replacement in self._FLOAT_FUNC_TO_FROM:
                if replacement is None:
                    continue # this replacement is disabled
                # the infinity checks cannot be applied to complex values
                if is_complex and (detect == np.isposinf or detect == np.isneginf):
                    continue
                if detect(value):
                    return replacement

        if isinstance(value, np.datetime64):
            if np.isnat(value):
                value = self.from_nat
            elif value.dtype == DT64_YEAR or value.dtype == DT64_MONTH:
                value = str(value) # convert year, month to string

        if not self._value_format_active:
            return value

        if is_float:
            kind = DTYPE_FLOAT_KIND
        elif is_complex:
            kind = DTYPE_COMPLEX_KIND
        else:
            kind = DTYPE_OBJECT_KIND
        return self._format_inexact_element(value, kind)


    #---------------------------------------------------------------------------
    # converting from strings (in data store) to types


# [docs]
    def to_type_filter_array(self,
            array: TNDArrayAny
            ) -> TNDArrayAny:
        '''Given an array, replace strings with types.

        Only string and object arrays are examined; every other array is returned as is. When at least one element matches a ``to_*`` string, an object copy with the decoded values is returned; otherwise ``array`` itself is returned.
        '''
        dtype = array.dtype
        kind = dtype.kind

        # nothing to do with ints, floats, or bools
        if (kind in DTYPE_INT_KINDS
                or kind in DTYPE_INEXACT_KINDS
                or dtype == DTYPE_BOOL
                ):
            return array # no replacements possible

        # only string and object arrays can hold encoded strings
        if not (kind in DTYPE_STR_KINDS or dtype == DTYPE_OBJECT):
            return array

        # for string types, cannot use np.equal
        decoded = None
        for value_replace, encodings in self._TYPE_TO_TO_TUPLE:
            if not encodings:
                continue # nothing configured for this value
            mask = np.isin(array, encodings)
            if not mask.any():
                continue
            if decoded is None:
                decoded = array.astype(object) # get a copy to mutate
            decoded[mask] = value_replace

        return array if decoded is None else decoded



# [docs]
    def to_type_filter_element(self,
            value: tp.Any
            ) -> tp.Any:
        '''
        Given a value which may be an encoded string, decode into a type.

        Non-string values, and strings matching none of the ``to_*`` sets, are returned unchanged.
        '''
        if not isinstance(value, str):
            return value

        # the first set containing the string decides the decoded value
        for decoded, encodings in self._TYPE_TO_TO_SET:
            if value in encodings:
                return decoded
        return value



    # def to_type_filter_iterable(self, iterable: tp.Iterable[tp.Any]) -> tp.Iterator[tp.Any]:
    #     for value in iterable:
    #         yield self.to_type_filter_element(value)


# Shared instance using every default encoding and decoding.
STORE_FILTER_DEFAULT = StoreFilter()

# Shared instance with NaN, None, and infinity translation turned off in both directions.
# The NaT arguments (from_nat, to_nat) are not passed and keep their defaults.
STORE_FILTER_DISABLE = StoreFilter(
        # type to str: None skips the replacement
        from_nan=None,
        from_none=None,
        from_posinf=None,
        from_neginf=None,
        # str to type: EMPTY_SET is presumably an empty set, so nothing matches
        to_nan=EMPTY_SET,
        to_none=EMPTY_SET,
        to_posinf=EMPTY_SET,
        to_neginf=EMPTY_SET,
        )