Skip to content

Imputation

tablite.imputation

Classes

Functions

tablite.imputation.imputation(T, targets, missing=None, method='carry forward', sources=None, tqdm=_tqdm, pbar=None)

In statistics, imputation is the process of replacing missing data with substituted values.

See more: https://en.wikipedia.org/wiki/Imputation_(statistics)

PARAMETER DESCRIPTION
T

source table.

TYPE: Table

targets

column names to find and replace missing values

TYPE: str or list of strings

missing

values to be replaced.

TYPE: None or iterable DEFAULT: None

method

method to be used for replacement. Options:

'carry forward': takes the previous value, and carries forward into fields where values are missing. +: quick. Realistic on time series. -: Can produce strange outliers.

'mean': calculates the column mean (exclude missing) and copies the mean in as replacement. +: quick -: doesn't work on text. Causes data set to drift towards the mean.

'mode': calculates the column mode (excluding missing) and copies the mode in as replacement. +: quick -: most frequent value becomes over-represented in the sample

'nearest neighbour': calculates the normalised distance between items in the source columns, selects the nearest neighbour and copies its value as replacement. +: works for any datatype. -: computationally intensive (i.e. slow)

TYPE: str DEFAULT: 'carry forward'

sources

NEAREST NEIGHBOUR ONLY column names to be used during imputation. if None or empty, all columns will be used.

TYPE: list of strings DEFAULT: None

RETURNS DESCRIPTION
table

table with replaced values.

Source code in tablite/imputation.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def imputation(T, targets, missing=None, method="carry forward", sources=None, tqdm=_tqdm, pbar=None):
    """
    Replace missing values in the target columns of a table.

    In statistics, imputation is the process of replacing missing data with substituted values.

    See more: https://en.wikipedia.org/wiki/Imputation_(statistics)

    Args:
        T (Table): source table. Checked with `sub_cls_check` against `BaseTable`.

        targets (str or list of strings): column name(s) in which missing
            values are found and replaced. A single string is treated as a
            one-item list.

        missing (None or iterable): values to be replaced.
            If None, only `None` is treated as missing.

        method (str): method to be used for replacement. Options:

            'carry forward':
                takes the previous value, and carries forward into fields
                where values are missing.
                +: quick. Realistic on time series.
                -: Can produce strange outliers.

            'mean':
                calculates the column mean (excluding `missing`) and copies
                the mean in as replacement.
                +: quick
                -: doesn't work on text. Causes data set to drift towards the mean.

            'mode':
                calculates the column mode (excluding `missing`) and copies
                the mode in as replacement.
                +: quick
                -: most frequent value becomes over-represented in the sample

            'nearest neighbour':
                calculates the normalised distance between items in the source
                columns, selects the nearest neighbour and copies its value
                as replacement.
                +: works for any datatype.
                -: computationally intensive (i.e. slow)

        sources (list of strings): NEAREST NEIGHBOUR ONLY
            column names to be used during imputation.
            if None or empty, all columns will be used.

        tqdm: progress bar factory forwarded to the selected method.

        pbar: existing progress bar, forwarded to 'carry forward', 'mean'
            and 'mode'. It is not forwarded to 'nearest neighbour'.

    Returns:
        table: table with replaced values.

    Raises:
        TypeError: if `targets` (or `sources`, for nearest neighbour) is not a
            str or a list, or contains items that are not strings.
        ValueError: if a target or source name is not a column in `T`, or if
            `method` is not one of the known methods.
    """
    sub_cls_check(T, BaseTable)

    # A single column name is shorthand for a one-item list. The wrapping must
    # not depend on whether the name exists: the loop below reports unknown
    # names with a ValueError.
    if isinstance(targets, str):
        targets = [targets]
    if not isinstance(targets, list):
        raise TypeError("Expected targets as str or list of column names")
    for name in targets:
        if not isinstance(name, str):
            raise TypeError(f"expected str, not {type(name)}")
        if name not in T.columns:
            raise ValueError(f"target item {name} not a column name in T.columns:\n{T.columns}")

    # Normalise to a set so that the imputation methods can do fast
    # membership tests.
    missing = {None} if missing is None else set(missing)

    if method == "nearest neighbour":
        if sources in (None, []):
            sources = list(T.columns)
        if isinstance(sources, str):
            sources = [sources]
        if not isinstance(sources, list):
            raise TypeError("Expected source as list of column names")
        for name in sources:
            if not isinstance(name, str):
                raise TypeError(f"expected str, not {type(name)}")
            if name not in T.columns:
                raise ValueError(f"source item {name} not a column name in T.columns:\n{T.columns}")

    methods = ["nearest neighbour", "mean", "mode", "carry forward"]

    if method == "carry forward":
        return carry_forward(T, targets, missing, tqdm=tqdm, pbar=pbar)
    elif method in {"mean", "mode"}:
        return stats_method(T, targets, missing, method, tqdm=tqdm, pbar=pbar)
    elif method == "nearest neighbour":
        return nearest_neighbour(T, sources, missing, targets, tqdm=tqdm)
    else:
        raise ValueError(f"method {method} not recognised amonst known methods: {list(methods)})")

tablite.imputation.carry_forward(T, targets, missing, tqdm=_tqdm, pbar=None)

Source code in tablite/imputation.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def carry_forward(T, targets, missing, tqdm=_tqdm, pbar=None):
    """
    Replace missing values in the target columns with the last seen value.

    Args:
        T: source table.
        targets: names of the columns to impute.
        missing (set): values that count as missing.
        tqdm: progress bar factory, used only when `pbar` is None.
        pbar: existing progress bar; updated once per value in each
            target column.

    Returns:
        A copy of `T` in which every missing value in a target column is
        replaced by the closest preceding non-missing value of that column.
        Missing values that occur before any non-missing value become None.
    """
    assert isinstance(missing, set)

    if pbar is None:
        pbar = tqdm(
            total=len(targets) * len(T),
            desc="imputation.carry_forward",
            disable=Config.TQDM_DISABLE,
        )

    result = T.copy()
    for column_name in T.columns:
        if column_name not in targets:
            # untouched column: take it over as it is.
            result[column_name] = T[column_name]
            continue

        values = T[column_name][:]  # work on a copy of the column data
        previous = None  # nothing to carry until a real value is seen
        for position, value in enumerate(values):
            if value in missing:
                values[position] = previous
            else:
                previous = value
            pbar.update(1)
        result[column_name] = values

    return result

tablite.imputation.stats_method(T, targets, missing, method, tqdm=_tqdm, pbar=None)

Source code in tablite/imputation.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
def stats_method(T, targets, missing, method, tqdm=_tqdm, pbar=None):
    """
    Replace missing values in the target columns with a column statistic.

    Args:
        T: source table.
        targets: names of the columns to impute.
        missing (set): values that count as missing.
        method (str): key looked up in the result of `summary_statistics`
            (the caller passes "mean" or "mode").
        tqdm: progress bar factory, used only when `pbar` is None.
        pbar: existing progress bar; updated once per target column.

    Returns:
        A copy of `T` whose target columns have the missing values replaced
        by the selected statistic.
    """
    assert isinstance(missing, set)

    if pbar is None:
        pbar = tqdm(
            total=len(targets),
            desc=f"imputation.{method}",
            disable=Config.TQDM_DISABLE,
        )

    result = T.copy()
    for column_name in T.columns:
        if column_name not in targets:
            result[column_name] = T[column_name]  # no entropy, keep as is.
            continue

        # NOTE(review): this is the column object taken from T itself, and
        # `replace` is called on it below - confirm whether that mutates the
        # input table.
        column = T.columns[column_name]
        assert isinstance(column, Column)

        values, counts = column.histogram()

        # Zero the count of every missing value so that it does not
        # contribute to the statistics.
        for absent in missing:
            try:
                position = values.index(absent)
                counts[position] = 0
            except ValueError:
                pass  # this missing value does not occur in the column.

        replacement = summary_statistics(values, counts)[method]
        column.replace(mapping=dict.fromkeys(missing, replacement))
        result[column_name] = column
        pbar.update(1)

    return result

Modules