Redux

`tablite.redux`

Attributes

Classes

Functions

`tablite.redux.filter_all(T, **kwargs)`

Returns a Table containing the rows where ALL kwargs match. Parameter `kwargs`: dictionary mapping column headers to values or boolean callables.

Examples:

t = Table()
t['a'] = [1,2,3,4]
t['b'] = [10,20,30,40]

def f(x):
    return x == 4
def g(x):
    return x < 20

t2 = t.any( **{"a":f, "b":g})
assert [r for r in t2.rows] == [[1, 10], [4, 40]]

t2 = t.any(a=f,b=g)
assert [r for r in t2.rows] == [[1, 10], [4, 40]]

def h(x):
    return x>=2

def i(x):
    return x<=30

t2 = t.all(a=h,b=i)
assert [r for r in t2.rows] == [[2,20], [3, 30]]

Source code in tablite/redux.py

def filter_all(T, **kwargs):
    """
    returns Table for rows where ALL kwargs match

    Args:
        T (Table subclass): table to filter; checked with
            ``sub_cls_check(T, BaseTable)``.
        **kwargs: column header -> value or boolean callable.
            A non-callable value is compared to the column with ``==``.
            A callable is applied to every element and its return value is
            interpreted by truthiness.

    Returns:
        The result of ``_compress_one(T, mask)`` where ``mask`` is True for
        the rows in which every criterion matched.

    Raises:
        ValueError: if a keyword does not name a column of ``T``.

    Examples (written against the ``Table.any`` / ``Table.all`` methods;
    presumably these delegate to ``filter_any`` / ``filter_all``):

        t = Table()
        t['a'] = [1,2,3,4]
        t['b'] = [10,20,30,40]

        def f(x):
            return x == 4
        def g(x):
            return x < 20

        t2 = t.any( **{"a":f, "b":g})
        assert [r for r in t2.rows] == [[1, 10], [4, 40]]

        t2 = t.any(a=f,b=g)
        assert [r for r in t2.rows] == [[1, 10], [4, 40]]

        def h(x):
            return x>=2

        def i(x):
            return x<=30

        t2 = t.all(a=h,b=i)
        assert [r for r in t2.rows] == [[2,20], [3, 30]]

    """
    sub_cls_check(T, BaseTable)

    # NOTE: **kwargs is always a dict, so no isinstance check is needed here.
    unknown = [k for k in kwargs if k not in T.columns]
    if unknown:
        raise ValueError(f"Unknown column(s): {unknown}")

    mask = np.full((len(T),), True)
    for k, v in kwargs.items():
        col = T[k]
        # Build the element-wise wrapper once per criterion, not once per page.
        vf = np.frompyfunc(v, 1, 1) if callable(v) else None
        for start, end, page in col.iter_by_page():
            data = page.get()
            if vf is not None:
                # frompyfunc yields an object array. Coerce it to bool so the
                # `&` below is a logical AND: on object arrays `&` is bitwise
                # and e.g. `True & 2` evaluates to 0, dropping a truthy match.
                hits = np.apply_along_axis(vf, 0, data).astype(bool)
            else:
                hits = data == v
            mask[start:end] = mask[start:end] & hits

    return _compress_one(T, mask)

`tablite.redux.drop(T, *args)`

drops all rows that contain args

PARAMETER	DESCRIPTION
`T`	TYPE: `Table`

Source code in tablite/redux.py

def drop(T, *args):
    """Filter out every row in which some column equals one of ``args``.

    Each column is scanned page by page; a row is flagged as soon as any of
    its values compares equal (``==``) to any of the given arguments. The
    flags are then inverted into a keep-mask that is handed to
    ``_compress_one``.

    Args:
        T (Table): table to filter; checked with
            ``sub_cls_check(T, BaseTable)``.
        *args: values whose presence in a row causes the row to be dropped.

    Returns:
        The result of ``_compress_one(T, keep_mask)``.
    """
    sub_cls_check(T, BaseTable)

    flagged = np.full((len(T),), False)
    for column_name in T.columns:
        for lo, hi, page in T[column_name].iter_by_page():
            values = page.get()
            for unwanted in args:
                flagged[lo:hi] = flagged[lo:hi] | (values == unwanted)

    # Rows that were never flagged are the ones to keep.
    return _compress_one(T, np.invert(flagged))

`tablite.redux.filter_any(T, **kwargs)`

Returns a Table containing the rows where ANY kwargs match. Parameter `kwargs`: dictionary mapping column headers to values or boolean callables.

Source code in tablite/redux.py

def filter_any(T, **kwargs):
    """
    returns Table for rows where ANY kwargs match

    Args:
        T (Table subclass): table to filter; checked with
            ``sub_cls_check(T, BaseTable)``.
        **kwargs: column header -> value or boolean callable.
            A non-callable value is compared to the column with ``==``.
            A callable is applied to every element and its return value is
            interpreted by truthiness.

    Returns:
        The result of ``_compress_one(T, mask)`` where ``mask`` is True for
        the rows in which at least one criterion matched.

    Note:
        Column names are not validated up front; an unknown header surfaces
        as whatever ``T[k]`` raises.
    """
    sub_cls_check(T, BaseTable)
    # NOTE: **kwargs is always a dict, so no isinstance check is needed here.

    mask = np.full((len(T),), False)
    for k, v in kwargs.items():
        col = T[k]
        # Build the element-wise wrapper once per criterion, not once per page.
        vf = np.frompyfunc(v, 1, 1) if callable(v) else None
        for start, end, page in col.iter_by_page():
            data = page.get()
            if vf is not None:
                # frompyfunc yields an object array. Coerce it to bool so the
                # `|` below is a logical OR: on object arrays `|` is the
                # Python operator and fails for results such as None or str.
                hits = np.apply_along_axis(vf, 0, data).astype(bool)
            else:
                hits = v == data
            mask[start:end] = mask[start:end] | hits

    return _compress_one(T, mask)

`tablite.redux.compress_both(T, mask, pbar: _tqdm)`

Source code in tablite/redux.py

def compress_both(T, mask, pbar: _tqdm):
    """Split ``T`` into two tables of the same type according to ``mask``.

    Args:
        T: source table; ``type(T)()`` is used to create both outputs.
        mask: boolean sequence indexed like the rows of ``T``.
        pbar: progress bar updated after every processed page, or None to
            skip progress reporting.

    Returns:
        tuple: ``(true, false)`` - the rows where ``mask`` is set and the
        rows where it is not, each with all columns of ``T``.
    """
    # NOTE FOR DEVELOPERS:
    # np.compress is so fast that the overhead of multiprocessing doesn't pay off.
    cls = type(T)
    true, false = cls(), cls()

    # Materialise the page boundaries once; they depend only on len(T).
    steps = list(Config.page_steps(len(T)))
    # Invert the mask once instead of re-inverting the whole array for every
    # page of every column.
    inverted = np.invert(mask)

    # NOTE(review): dividing by (steps - 1) makes the updates sum to more than
    # 10 when there are several steps; kept as in the original - confirm intent.
    pbar_div = len(T.columns) * len(steps) - 1
    pbar_step = (10 / pbar_div) if pbar_div != 0 else 0

    for name in T.columns:
        true.add_column(name)
        false.add_column(name)
        true_col = true[name]  # fetch the cols to avoid doing it in the loop below
        false_col = false[name]
        source_col = T[name]
        # prevent OOMError by slicing the getitem ops
        for start, end in steps:
            data = source_col[start:end]
            true_col.extend(np.compress(mask[start:end], data))
            false_col.extend(np.compress(inverted[start:end], data))
            if pbar is not None:
                pbar.update(pbar_step)
    return true, false

`tablite.redux.get_filter_bitmap(T, expressions, pbar: _tqdm)`

Source code in tablite/redux.py

def get_filter_bitmap(T, expressions, pbar: _tqdm):
    """Evaluate filter expressions against ``T`` and return a boolean bitmap.

    Args:
        T: table whose columns the expressions refer to.
        expressions: sequence of dicts, each with exactly three of the keys
            'column1', 'column2', 'criteria', 'value1', 'value2'. The left
            operand is 'column1' or 'value1', the right operand is 'column2'
            or 'value2', and 'criteria' must be a key of ``filter_ops``.
        pbar: progress bar updated after every evaluated page, or None.

    Returns:
        np.ndarray: bool array of shape ``(len(expressions), len(T))``; row
        ``i`` holds the outcome of expression ``i`` for every table row.

    Raises:
        TypeError: if an expression is not a dict.
        ValueError: if an expression does not have 3 items, has an unknown
            key, has a missing or unknown 'criteria', names an unknown
            column, gives both a column and a value for one side, or if the
            two operands of a page differ in length.
    """
    import operator  # local import: only needed for the dispatch table below

    allowed_keys = {"column1", "column2", "criteria", "value1", "value2"}
    for expression in expressions:
        if not isinstance(expression, dict):
            raise TypeError(f"invalid expression: {expression}")
        if not len(expression) == 3:
            raise ValueError(f"expected 3 items, got {expression}")
        unknown_keys = set(expression.keys()).difference(allowed_keys)
        if unknown_keys:
            raise ValueError(f"got unknown key: {unknown_keys}")

        # .get() so that an absent 'criteria' key reaches the ValueError
        # below instead of escaping as a bare KeyError.
        if expression.get("criteria", None) not in filter_ops:
            raise ValueError(f"criteria missing from {expression}")

        c1 = expression.get("column1", None)
        if c1 is not None and c1 not in T.columns:
            raise ValueError(f"no such column: {c1}")

        v1 = expression.get("value1", None)
        if v1 is not None and c1 is not None:
            raise ValueError("filter can only take 1 left expr element. Got 2.")

        c2 = expression.get("column2", None)
        if c2 is not None and c2 not in T.columns:
            raise ValueError(f"no such column: {c2}")

        v2 = expression.get("value2", None)
        if v2 is not None and c2 is not None:
            raise ValueError("filter can only take 1 right expression element. Got 2.")

    # Comparisons that can be evaluated on whole pages at once.
    vectorized = {
        ">": operator.gt,
        ">=": operator.ge,
        "==": operator.eq,
        "<": operator.lt,
        "<=": operator.le,
        "!=": operator.ne,
    }

    def _safe_test(f, a, b):
        # Element-wise fallback: a pair that cannot be compared counts as False.
        try:
            return f(a, b)
        except TypeError:
            return False

    # EVALUATION....
    # 1. setup a rectangular bitmap for evaluations
    steps = list(Config.page_steps(len(T)))
    bitmap = np.empty(shape=(len(expressions), len(T)), dtype=bool)
    pbar_div = len(expressions) * len(steps) - 1
    pbar_step = (10 / pbar_div) if pbar_div != 0 else 0
    # 2. evaluate every expression page by page
    #    (the expressions were fully validated in the loop above)
    for bit_index, expression in enumerate(expressions):
        c1 = expression.get("column1", None)
        c2 = expression.get("column2", None)
        expr = expression.get("criteria", None)
        v1 = expression.get("value1", None)
        v2 = expression.get("value2", None)

        # Fetch the columns once per expression rather than once per page.
        col_A = T[c1] if c1 is not None else None
        col_B = T[c2] if c2 is not None else None
        compare = vectorized.get(expr)

        for start, end in steps:
            if col_A is not None:
                dset_A = col_A[start:end]
            else:  # v1 is active:
                dset_A = np.array([v1] * (end - start))

            if col_B is not None:
                dset_B = col_B[start:end]
            else:  # v2 is active:
                dset_B = np.array([v2] * (end - start))

            if len(dset_A) != len(dset_B):
                raise ValueError(
                    f"Assymmetric dataset: {c1} has {len(dset_A)} values, whilst {c2} has {len(dset_B)} values."
                )
            # Evaluate
            try:
                if compare is not None:
                    result = compare(dset_A, dset_B)
                else:  # it's a python evaluations (slow)
                    f = filter_ops.get(expr)
                    assert callable(f)
                    result = list_to_np_array([f(a, b) for a, b in zip(dset_A, dset_B)])
            except TypeError:
                # The page holds values that cannot be compared in bulk;
                # retry element by element and treat failing pairs as False.
                f = filter_ops.get(expr)
                assert callable(f)
                result = list_to_np_array([_safe_test(f, a, b) for a, b in zip(dset_A, dset_B)])
            bitmap[bit_index, start:end] = result
            if pbar is not None:
                pbar.update(pbar_step)

    return bitmap

`tablite.redux.filter_non_primitive(T, expressions, filter_type='all', tqdm=_tqdm)`

OBSOLETE. Filters table.

PARAMETER	DESCRIPTION
`T`	Table. TYPE: `Table subclass`
`expressions`	str: filters based on an expression, such as: "all((A==B, C!=4, 200<D))" which is interpreted using python's compiler to: `def _f(A,B,C,D): return all((A==B, C!=4, 200<D))` list of dicts: (example): L = [ {'column1':'A', 'criteria': "==", 'column2': 'B'}, {'column1':'C', 'criteria': "!=", "value2": '4'}, {'value1': 200, 'criteria': "<", 'column2': 'D' } ] TYPE: `list or str`
`accepted`	'column1', 'column2', 'criteria', 'value1', 'value2' TYPE: `dictionary keys`
`filter_type`	Ignored if expressions is str. 'all' or 'any'. Defaults to "all". TYPE: `str` DEFAULT: `'all'`
`tqdm`	progressbar. Defaults to _tqdm. TYPE: `tqdm` DEFAULT: `tqdm`

RETURNS	DESCRIPTION
`2xTables`	trues, falses

Source code in tablite/redux.py

def filter_non_primitive(T, expressions, filter_type="all", tqdm=_tqdm):
    """
    OBSOLETE
    Splits a table into matching and non-matching rows.

    A warning is emitted on every call because filtering on non-primitive
    types is not recommended.

    Args:
        T (Table subclass): Table.
        expressions (list or str):
            str:
                an expression such as
                "all((A==B, C!=4, 200<D))"
                which is interpreted using python's compiler to:

                def _f(A,B,C,D):
                    return all((A==B, C!=4, 200<D))

            list of dicts, for example:

            L = [
                {'column1':'A', 'criteria': "==", 'column2': 'B'},
                {'column1':'C', 'criteria': "!=", "value2": '4'},
                {'value1': 200, 'criteria': "<", 'column2': 'D' }
            ]

            accepted dictionary keys:
            'column1', 'column2', 'criteria', 'value1', 'value2'

        filter_type (str, optional): 'all' or 'any'. Only used when
            expressions is a list. Defaults to "all".
        tqdm (tqdm, optional): progressbar factory. Defaults to _tqdm.

    Returns:
        2xTables: trues, falses. For an empty table two copies of T are
        returned.

    Raises:
        TypeError: if expressions is neither a str nor a list.
    """
    warnings.warn("Filter using non-primitive types is not recommended.")
    sub_cls_check(T, BaseTable)
    if not len(T):
        return T.copy(), T.copy()

    with tqdm(desc="filter", total=20) as pbar:
        # first half of the progress budget: build the row mask
        if isinstance(expressions, list):
            mask = _filter_using_list_of_dicts(T, expressions, filter_type, pbar)
        elif isinstance(expressions, str):
            mask = _filter_using_expression(T, expressions)
            pbar.update(10)
        else:
            raise TypeError

        # second half: materialise the two result tables
        trues_and_falses = compress_both(T, mask, pbar=pbar)
        # top the bar up to its total, whatever has been reported so far
        pbar.update(pbar.total - pbar.n)

    return trues_and_falses

`tablite.redux.filter(T, expressions, filter_type='all', tqdm=_tqdm)`

Filters table. Note: at the moment only tablite primitive types are supported.

PARAMETER	DESCRIPTION
`T`	Table. TYPE: `Table subclass`
`expressions`	str: filters based on an expression, such as: "all((A==B, C!=4, 200<D))" which is interpreted using python's compiler to: `def _f(A,B,C,D): return all((A==B, C!=4, 200<D))` list of dicts: (example): L = [ {'column1':'A', 'criteria': "==", 'column2': 'B'}, {'column1':'C', 'criteria': "!=", "value2": '4'}, {'value1': 200, 'criteria': "<", 'column2': 'D' } ] TYPE: `list or str`
`accepted`	'column1', 'column2', 'criteria', 'value1', 'value2' TYPE: `dictionary keys`
`filter_type`	Ignored if expressions is str. 'all' or 'any'. Defaults to "all". TYPE: `str` DEFAULT: `'all'`
`tqdm`	progressbar. Defaults to _tqdm. TYPE: `tqdm` DEFAULT: `tqdm`

RETURNS	DESCRIPTION
`2xTables`	trues, falses

Source code in tablite/redux.py

def filter(T, expressions, filter_type="all", tqdm=_tqdm):
    """Splits a table into matching and non-matching rows.
    Note: At the moment only tablite primitive types are supported

    Args:
        T (Table subclass): Table.
        expressions (list or str):
            str:
                an expression such as
                "all((A==B, C!=4, 200<D))"
                which is interpreted using python's compiler to:

                def _f(A,B,C,D):
                    return all((A==B, C!=4, 200<D))

            list of dicts, for example:

            L = [
                {'column1':'A', 'criteria': "==", 'column2': 'B'},
                {'column1':'C', 'criteria': "!=", "value2": '4'},
                {'value1': 200, 'criteria': "<", 'column2': 'D' }
            ]

            accepted dictionary keys:
            'column1', 'column2', 'criteria', 'value1', 'value2'

        filter_type (str, optional): 'all' or 'any'. Only used when
            expressions is a list. Defaults to "all".
        tqdm (tqdm, optional): progressbar factory. Defaults to _tqdm.

    Returns:
        2xTables: trues, falses. For an empty table two copies of T are
        returned.

    Raises:
        TypeError: if expressions is neither a str nor a list.
    """
    sub_cls_check(T, BaseTable)
    if not len(T):
        return T.copy(), T.copy()

    # A list of dicts is handed over entirely to the native implementation.
    if isinstance(expressions, list):
        return _filter_using_list_of_dicts_native(T, expressions, filter_type, tqdm)

    if not isinstance(expressions, str):
        raise TypeError

    with tqdm(desc="filter", total=20) as pbar:
        # TODO: make parser for expressions and use the nim implement
        mask = _filter_using_expression(T, expressions)
        pbar.update(10)
        # create new tables
        res = compress_both(T, mask, pbar=pbar)
        pbar.update(pbar.total - pbar.n)

    return res