Skip to content

File reader utils

tablite.file_reader_utils

Attributes

tablite.file_reader_utils.ENCODING_GUESS_BYTES = 10000 module-attribute

tablite.file_reader_utils.header_readers = {'fods': excel_reader_headers, 'json': excel_reader_headers, 'simple': excel_reader_headers, 'rst': excel_reader_headers, 'mediawiki': excel_reader_headers, 'xlsx': excel_reader_headers, 'xlsm': excel_reader_headers, 'csv': text_reader_headers, 'tsv': text_reader_headers, 'txt': text_reader_headers, 'ods': ods_reader_headers} module-attribute

Classes

tablite.file_reader_utils.TextEscape(openings='({[', closures=']})', text_qualifier='"', delimiter=',', strip_leading_and_tailing_whitespace=False)

Bases: object

Enables parsing of CSV while respecting brackets and text qualifiers.

Example:

    text_escape = TextEscape()  # set up the instance.
    for line in somefile.readlines():
        list_of_words = text_escape(line)  # use the instance.
        ...

As an example, the Danes and Germans use " for inches and ' for feet, so we will see data that contains nail (75 x 4 mm, 3" x 3/12"), so for this case ( and ) are valid escapes, but " and ' aren't.

Source code in tablite/file_reader_utils.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
def __init__(
    self,
    openings="({[",
    closures="]})",
    text_qualifier='"',
    delimiter=",",
    strip_leading_and_tailing_whitespace=False,
):
    """
    As an example, the Danes and Germans use " for inches and ' for feet,
    so we will see data that contains nail (75 x 4 mm, 3" x 3/12"), so
    for this case ( and ) are valid escapes, but " and ' aren't.

    """
    if openings is None:
        openings = [None]
    elif isinstance(openings, str):
        self.openings = {c for c in openings}
    else:
        raise TypeError(f"expected str, got {type(openings)}")

    if closures is None:
        closures = [None]
    elif isinstance(closures, str):
        self.closures = {c for c in closures}
    else:
        raise TypeError(f"expected str, got {type(closures)}")

    if not isinstance(delimiter, str):
        raise TypeError(f"expected str, got {type(delimiter)}")
    self.delimiter = delimiter
    self._delimiter_length = len(delimiter)
    self.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace

    if text_qualifier is None:
        pass
    elif text_qualifier in openings + closures:
        raise ValueError("It's a bad idea to have qoute character appears in openings or closures.")
    else:
        self.qoute = text_qualifier

    if not text_qualifier:
        if not self.strip_leading_and_tailing_whitespace:
            self.c = self._call_1
        else:
            self.c = self._call_2
    else:
        self.c = self._call_3
Attributes
tablite.file_reader_utils.TextEscape.openings = {c for c in openings} instance-attribute
tablite.file_reader_utils.TextEscape.closures = {c for c in closures} instance-attribute
tablite.file_reader_utils.TextEscape.delimiter = delimiter instance-attribute
tablite.file_reader_utils.TextEscape.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace instance-attribute
tablite.file_reader_utils.TextEscape.qoute = text_qualifier instance-attribute
tablite.file_reader_utils.TextEscape.c = self._call_1 instance-attribute
Functions
tablite.file_reader_utils.TextEscape.__call__(s)
Source code in tablite/file_reader_utils.py
88
89
def __call__(self, s):
    return self.c(s)

Functions

tablite.file_reader_utils.split_by_sequence(text, sequence)

helper to split text according to a split sequence.

Source code in tablite/file_reader_utils.py
15
16
17
18
19
20
21
22
23
24
25
def split_by_sequence(text, sequence):
    """helper to split text according to a split sequence."""
    parts = []
    remainder = text
    for marker in sequence:
        position = remainder.find(marker)
        if position < 0:
            raise ValueError(f"'{marker}' not in row")
        parts.append(remainder[:position])
        remainder = remainder[position + len(marker):]
    parts.append(remainder)  # whatever trails the last marker
    return tuple(parts)

tablite.file_reader_utils.detect_seperator(text)

:param path: pathlib.Path objects :param encoding: file encoding. :return: 1 character.

Source code in tablite/file_reader_utils.py
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def detect_seperator(text):
    """
    :param path: pathlib.Path objects
    :param encoding: file encoding.
    :return: 1 character.
    """
    # The CSV sniffer's effective behaviour is to look for a non-text
    # character in the (usually header) line. Header text is letters plus
    # whitespace, which leaves ,;:| and \t as separator candidates — unless
    # the file is whitespace-separated, handled as a fallback below.
    if not text:
        return None
    candidates = {",", "\t", ";", ":", "|"} & set(text)
    if not candidates:
        # no classic separator present: fall back to whitespace
        if " " in text:
            return " "
        if "\n" in text:
            return "\n"
        raise ValueError("separator not detected")
    if len(candidates) == 1:
        return next(iter(candidates))
    # several candidates: pick the most frequent one
    ranking = sorted(((text.count(ch), ch) for ch in candidates), reverse=True)
    return ranking[0][1]

tablite.file_reader_utils.text_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount)

Source code in tablite/file_reader_utils.py
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
def text_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount):
    """Read a preview of a csv/tsv/txt file.

    :param path: pathlib.Path to the file.
    :param delimiter: field separator, or None to auto-detect.
    :param header_row_index: number of leading rows to skip before the header.
    :param text_qualifier: quote character, or None for no quoting.
    :param linecount: number of preview rows to read.
    :return: dict with "delimiter" (given or detected; None for 1-column files),
        path.name -> preview rows, and "is_empty" = True when no delimiter
        could be detected (caller returns an empty table instead of throwing).
    :raises ValueError: if the file cannot be read (original cause chained).
    """
    d = {}
    delimiters = {
        ".csv": ",",
        ".tsv": "\t",
        ".txt": None,
    }

    try:
        # guess the text encoding from the first ENCODING_GUESS_BYTES bytes
        with path.open("rb") as fi:
            rawdata = fi.read(ENCODING_GUESS_BYTES)
            encoding = chardet.detect(rawdata)["encoding"]

        if delimiter is None:
            with path.open("r", encoding=encoding, errors="ignore") as fi:
                lines = []
                for n, line in enumerate(fi, -header_row_index):
                    if n < 0:
                        continue  # skip the rows before the header row
                    line = line.rstrip("\n")
                    lines.append(line)
                    if n >= linecount:
                        break  # enough preview lines collected
                try:
                    d["delimiter"] = delimiter = detect_seperator("\n".join(lines))
                except ValueError as e:
                    if e.args == ("separator not detected", ):
                        d["delimiter"] = delimiter = None  # this will handle the case of 1 column, 1 row
                    else:
                        raise

        if delimiter is None:
            d["delimiter"] = delimiter = delimiters[path.suffix]  # pickup the default one
            d[path.name] = [lines]
            d["is_empty"] = True  # mark as empty to return an empty table instead of throwing
        else:
            kwargs = {}

            if text_qualifier is not None:
                kwargs["text_qualifier"] = text_qualifier
                kwargs["quoting"] = "QUOTE_MINIMAL"
            else:
                kwargs["quoting"] = "QUOTE_NONE"

            d[path.name] = _get_headers(
                str(path), py_to_nim_encoding(encoding), header_row_index=header_row_index,
                delimiter=delimiter,
                linecount=linecount,
                **kwargs
            )
        return d
    except Exception as e:
        # BUG FIX: chain the original exception so the root cause is not lost.
        raise ValueError(f"can't read {path.suffix}") from e

tablite.file_reader_utils.excel_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount)

Source code in tablite/file_reader_utils.py
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
def excel_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount):
    """Collect preview rows for every sheet of an xlsx/xlsm workbook.

    Returns a dict mapping each sheet name to its preview rows (values passed
    through DataTypes.to_json) plus a "delimiter" key that is always None.
    `delimiter` and `text_qualifier` are accepted only so the signature matches
    the other header readers; they are unused here.
    """
    d = {}
    book = openpyxl.open(str(path), read_only=True)

    try:
        all_sheets = book.sheetnames

        for sheet_name, sheet in ((name, book[name]) for name in all_sheets):
            fixup_worksheet(sheet)
            # max_row can be None (e.g. no dimension info) — treat as empty sheet
            if sheet.max_row is None:
                max_rows = 0
            else:
                max_rows = min(sheet.max_row, linecount + 1)
            container = [None] * max_rows  # pre-sized; filled by index below
            padding_ends = 0  # rightmost non-empty column seen across all rows
            max_column = sheet.max_column

            for i, row_data in enumerate(sheet.iter_rows(0, header_row_index + max_rows, values_only=True), start=-header_row_index):
                if i < 0:
                    # NOTE: for some reason `iter_rows` specifying a start row starts reading cells as binary, instead skip the rows that are before our first read row
                    continue

                row_data = [strip_escape(r) for r in row_data]

                # NOTE: text readers do not cast types and give back strings, neither should xlsx reader, can't find documentation if it's possible to ignore this via `iter_rows` instead of casting back to string
                container[i] = [DataTypes.to_json(v) for v in row_data]

                # walk the row from the right to find its last non-empty cell,
                # so trailing all-None columns can be trimmed afterwards
                for j, cell in enumerate(reversed(row_data)):
                    if cell is None:
                        continue

                    padding_ends = max(padding_ends, max_column - j)

                    break

            # trim every captured row to the widest non-empty extent found
            d[sheet_name] = [None if c is None else c[0:padding_ends] for c in container]
            d["delimiter"] = None
    finally:
        book.close()

    return d

tablite.file_reader_utils.ods_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount)

Source code in tablite/file_reader_utils.py
243
244
245
246
247
248
249
250
251
252
253
254
def ods_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount):
    """Collect preview rows for every sheet of an ods workbook.

    `delimiter` and `text_qualifier` exist only for signature parity with the
    other header readers; spreadsheets carry no delimiter, hence None.
    """
    result = {
        "delimiter": None
    }
    book = pyexcel.get_book_dict(file_name=str(path))
    stop = header_row_index + linecount

    for name, rows in book.items():
        result[name] = [
            [DataTypes.to_json(cell) for cell in row]
            for row in rows[header_row_index:stop]
        ]

    return result

tablite.file_reader_utils.get_headers(path, delimiter=None, header_row_index=0, text_qualifier=None, linecount=10)

File format definitions: csv — comma separated values; tsv — tab separated values; csvz — a zip file that contains one or many csv files; tsvz — a zip file that contains one or many tsv files; xls — a spreadsheet file format created by MS-Excel 97-2003; xlsx — MS-Excel Extensions to the Office Open XML SpreadsheetML file format; xlsm — an MS-Excel macro-enabled workbook file; ods — open document spreadsheet; fods — flat open document spreadsheet; json — JavaScript Object Notation; html — an HTML table of the data structure; simple — simple presentation; rst — reStructuredText presentation of the data; mediawiki — MediaWiki table.

Source code in tablite/file_reader_utils.py
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
def get_headers(path, delimiter=None, header_row_index=0, text_qualifier=None, linecount=10):
    """Return a preview (headers + first rows) of a tabular file.

    file format	definition
    csv	    comma separated values
    tsv	    tab separated values
    csvz	a zip file that contains one or many csv files
    tsvz	a zip file that contains one or many tsv files
    xls	    a spreadsheet file format created by MS-Excel 97-2003
    xlsx	MS-Excel Extensions to the Office Open XML SpreadsheetML File Format.
    xlsm	an MS-Excel Macro-Enabled Workbook file
    ods	    open document spreadsheet
    fods	flat open document spreadsheet
    json	JavaScript Object Notation
    html	html table of the data structure
    simple	simple presentation
    rst	    reStructuredText presentation of the data
    mediawiki	media wiki table

    :param path: str or pathlib.Path of the file.
    :param delimiter: field separator, or None to auto-detect.
    :param header_row_index: number of leading rows to skip before the header.
    :param text_qualifier: quote character, or None.
    :param linecount: number of preview rows.
    :raises TypeError: bad argument types or unsupported file format.
    :raises FileNotFoundError: if path does not exist.
    """
    if isinstance(path, str):
        path = Path(path)
    if not isinstance(path, Path):
        raise TypeError("expected pathlib path.")
    if not path.exists():
        raise FileNotFoundError(str(path))
    if delimiter is not None:
        if not isinstance(delimiter, str):
            raise TypeError(f"expected str or None, not {type(delimiter)}")

    kwargs = {
        "path": path,
        "delimiter": delimiter,
        "header_row_index": header_row_index,
        "text_qualifier": text_qualifier,
        "linecount": linecount
    }

    # GENERALIZATION: lower-case the suffix so "FILE.CSV" resolves too.
    reader = header_readers.get(path.suffix[1:].lower(), None)

    if reader is None:
        raise TypeError(f"file format for headers not supported: {path.suffix}")

    result = reader(**kwargs)

    return result

tablite.file_reader_utils.get_encoding(path, nbytes=ENCODING_GUESS_BYTES)

Source code in tablite/file_reader_utils.py
319
320
321
322
323
324
325
326
def get_encoding(path, nbytes=ENCODING_GUESS_BYTES):
    """Guess the text encoding of *path* from its first *nbytes* bytes."""
    sample_size = min(nbytes, path.stat().st_size)
    with path.open("rb") as handle:
        sample = handle.read(sample_size)
    guess = chardet.detect(sample)["encoding"]
    # ascii is a strict subset of utf-8: upgrading the guess keeps any
    # non-ascii characters beyond the sampled prefix decodable.
    if guess == "ascii":
        return "utf-8"
    return guess

tablite.file_reader_utils.get_delimiter(path, encoding)

Source code in tablite/file_reader_utils.py
329
330
331
332
333
334
335
336
337
338
339
340
def get_delimiter(path, encoding):
    """Detect the column separator from the first few lines of *path*."""
    sample = []
    with path.open("r", encoding=encoding, errors="ignore") as handle:
        for index, raw_line in enumerate(handle):
            sample.append(raw_line.rstrip("\n"))
            if index > 10:
                break  # a dozen lines is plenty for detection
    delimiter = detect_seperator("\n".join(sample))
    if delimiter is None:
        raise ValueError("Delimiter could not be determined")
    return delimiter