
Import utils

tablite.import_utils

Attributes

tablite.import_utils.file_readers = {'fods': excel_reader, 'json': excel_reader, 'html': from_html, 'hdf5': from_hdf5, 'simple': excel_reader, 'rst': excel_reader, 'mediawiki': excel_reader, 'xlsx': excel_reader, 'xls': excel_reader, 'xlsm': excel_reader, 'csv': text_reader, 'tsv': text_reader, 'txt': text_reader, 'ods': ods_reader} module-attribute

tablite.import_utils.valid_readers = ','.join(list(file_readers.keys())) module-attribute
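
file_readers drives reader dispatch by file suffix, and valid_readers is the printable list of supported formats. A minimal dispatch sketch; select_reader is a hypothetical helper, not part of the module:

from pathlib import Path

from tablite.import_utils import file_readers, valid_readers

def select_reader(path):
    # look up the reader for a file by its suffix, e.g.
    # 'csv' -> text_reader, 'xlsx' -> excel_reader, 'ods' -> ods_reader
    suffix = Path(path).suffix[1:].lower()
    reader = file_readers.get(suffix)
    if reader is None:
        raise ValueError(f"{suffix!r} is not one of: {valid_readers}")
    return reader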

Classes

tablite.import_utils.TRconfig(source, destination, start, end, guess_datatypes, delimiter, text_qualifier, text_escape_openings, text_escape_closures, strip_leading_and_tailing_whitespace, encoding, newline_offsets, fields)

Bases: object

Source code in tablite/import_utils.py
def __init__(
    self,
    source,
    destination,
    start,
    end,
    guess_datatypes,
    delimiter,
    text_qualifier,
    text_escape_openings,
    text_escape_closures,
    strip_leading_and_tailing_whitespace,
    encoding,
    newline_offsets,
    fields
) -> None:
    self.source = source
    self.destination = destination
    self.start = start
    self.end = end
    self.guess_datatypes = guess_datatypes
    self.delimiter = delimiter
    self.text_qualifier = text_qualifier
    self.text_escape_openings = text_escape_openings
    self.text_escape_closures = text_escape_closures
    self.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace
    self.encoding = encoding
    self.newline_offsets = newline_offsets
    self.fields = fields
    type_check(start, int)
    type_check(end, int)
    type_check(delimiter, str)
    type_check(text_qualifier, (str, type(None)))
    type_check(text_escape_openings, str)
    type_check(text_escape_closures, str)
    type_check(encoding, str)
    type_check(strip_leading_and_tailing_whitespace, bool)
    type_check(newline_offsets, list)
    type_check(fields, dict)
Attributes
tablite.import_utils.TRconfig.source = source instance-attribute
tablite.import_utils.TRconfig.destination = destination instance-attribute
tablite.import_utils.TRconfig.start = start instance-attribute
tablite.import_utils.TRconfig.end = end instance-attribute
tablite.import_utils.TRconfig.guess_datatypes = guess_datatypes instance-attribute
tablite.import_utils.TRconfig.delimiter = delimiter instance-attribute
tablite.import_utils.TRconfig.text_qualifier = text_qualifier instance-attribute
tablite.import_utils.TRconfig.text_escape_openings = text_escape_openings instance-attribute
tablite.import_utils.TRconfig.text_escape_closures = text_escape_closures instance-attribute
tablite.import_utils.TRconfig.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace instance-attribute
tablite.import_utils.TRconfig.encoding = encoding instance-attribute
tablite.import_utils.TRconfig.newline_offsets = newline_offsets instance-attribute
tablite.import_utils.TRconfig.fields = fields instance-attribute
Functions
tablite.import_utils.TRconfig.copy()
Source code in tablite/import_utils.py
def copy(self):
    return TRconfig(**self.dict())
tablite.import_utils.TRconfig.dict()
Source code in tablite/import_utils.py
def dict(self):
    return {k: v for k, v in self.__dict__.items() if not (k.startswith("_") or callable(v))}
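
dict() returns exactly the constructor keyword arguments, filtering out private attributes and bound methods, which is what lets copy() round-trip through TRconfig(**self.dict()). A minimal sketch with illustrative values:

from tablite.import_utils import TRconfig

cfg = TRconfig(
    source="data.csv", destination=["page_0.npy"], start=0, end=10,
    guess_datatypes=False, delimiter=",", text_qualifier=None,
    text_escape_openings="", text_escape_closures="",
    strip_leading_and_tailing_whitespace=False, encoding="utf-8",
    newline_offsets=[0], fields={0: 0},
)
clone = cfg.copy()                  # equivalent to TRconfig(**cfg.dict())
assert clone.dict() == cfg.dict()   # same constructor arguments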

Functions

tablite.import_utils.from_pandas(T, df)

Creates Table using pd.to_dict('list')

similar to:

>>> import pandas as pd
>>> df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]})
>>> df
   a  b
0  1  4
1  2  5
2  3  6
>>> df.to_dict('list')
{'a': [1, 2, 3], 'b': [4, 5, 6]}

>>> t = Table.from_dict(df.to_dict('list'))
>>> t.show()
+===+===+===+
| # | a | b |
|row|int|int|
+---+---+---+
| 0 |  1|  4|
| 1 |  2|  5|
| 2 |  3|  6|
+===+===+===+

Source code in tablite/import_utils.py
def from_pandas(T, df):
    """
    Creates Table using pd.to_dict('list')

    similar to:
    >>> import pandas as pd
    >>> df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]})
    >>> df
        a  b
        0  1  4
        1  2  5
        2  3  6
    >>> df.to_dict('list')
    {'a': [1, 2, 3], 'b': [4, 5, 6]}

    >>> t = Table.from_dict(df.to_dict('list'))
    >>> t.show()
        +===+===+===+
        | # | a | b |
        |row|int|int|
        +---+---+---+
        | 0 |  1|  4|
        | 1 |  2|  5|
        | 2 |  3|  6|
        +===+===+===+
    """
    if not issubclass(T, BaseTable):
        raise TypeError("Expected subclass of Table")

    return T(columns=df.to_dict("list"))  # noqa
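
A minimal usage sketch, assuming tablite's Table as the target class:

import pandas as pd
from tablite import Table
from tablite.import_utils import from_pandas

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
t = from_pandas(Table, df)   # equivalent to Table(columns=df.to_dict("list"))
t.show()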

tablite.import_utils.from_hdf5(T, path, tqdm=_tqdm, pbar=None)

imports an exported hdf5 table.

Note that some loss of type information is to be expected in columns of mixed type:

>>> t.show(dtype=True)
+===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+
| # | A |  B  |  C  | D  |  E  |  F  |         G         |    H     |   I    |       J       | K |            L            |  M  | O |
|row|int|mixed|float|str |mixed| bool|      datetime     |   date   |  time  |   timedelta   |str|           int           |float|int|
+---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+
| 0 | -1|None | -1.1|    |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b  |-100000000000000000000000|  inf| 11|
| 1 |  1|    1|  1.1|1000|1    | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|嗨 | 100000000000000000000000| -inf|-11|
+===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+
>>> t.to_hdf5(filename)
>>> t2 = Table.from_hdf5(filename)
>>> t2.show(dtype=True)
+===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+
| # | A |  B  |  C  |  D  |  E  |  F  |         G         |         H         |   I    |       J       | K |            L            |  M  | O |
|row|int|mixed|float|mixed|mixed| bool|      datetime     |      datetime     |  time  |      str      |str|           int           |float|int|
+---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+
| 0 | -1|None | -1.1|None |None |False|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|1 day, 0:00:00 |b  |-100000000000000000000000|  inf| 11|
| 1 |  1|    1|  1.1| 1000|    1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|嗨 | 100000000000000000000000| -inf|-11|
+===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+

Source code in tablite/import_utils.py
def from_hdf5(T, path, tqdm=_tqdm, pbar=None):
    """
    imports an exported hdf5 table.

    Note that some loss of type information is to be expected in columns of mixed type:
    >>> t.show(dtype=True)
    +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+
    | # | A |  B  |  C  | D  |  E  |  F  |         G         |    H     |   I    |       J       | K |            L            |  M  | O |
    |row|int|mixed|float|str |mixed| bool|      datetime     |   date   |  time  |   timedelta   |str|           int           |float|int|
    +---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+
    | 0 | -1|None | -1.1|    |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b  |-100000000000000000000000|  inf| 11|
    | 1 |  1|    1|  1.1|1000|1    | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|嗨 | 100000000000000000000000| -inf|-11|
    +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+
    >>> t.to_hdf5(filename)
    >>> t2 = Table.from_hdf5(filename)
    >>> t2.show(dtype=True)
    +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+
    | # | A |  B  |  C  |  D  |  E  |  F  |         G         |         H         |   I    |       J       | K |            L            |  M  | O |
    |row|int|mixed|float|mixed|mixed| bool|      datetime     |      datetime     |  time  |      str      |str|           int           |float|int|
    +---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+
    | 0 | -1|None | -1.1|None |None |False|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|1 day, 0:00:00 |b  |-100000000000000000000000|  inf| 11|
    | 1 |  1|    1|  1.1| 1000|    1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|嗨 | 100000000000000000000000| -inf|-11|
    +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+
    """
    if not issubclass(T, BaseTable):
        raise TypeError("Expected subclass of Table")
    import h5py

    type_check(path, Path)
    t = T()
    with h5py.File(path, "r") as h5:
        for col_name in h5.keys():
            dset = h5[col_name]
            arr = np.array(dset[:])
            if arr.dtype == object:
                arr = np.array(DataTypes.guess([v.decode("utf-8") for v in arr]))
            t[col_name] = arr
    return t
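
A minimal round-trip sketch, assuming tablite's Table and an installed h5py; the filename is illustrative:

from pathlib import Path
from tablite import Table
from tablite.import_utils import from_hdf5

t = Table(columns={"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
filename = Path("demo.h5")
t.to_hdf5(filename)              # export first
t2 = from_hdf5(Table, filename)  # type_check requires a pathlib.Path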

tablite.import_utils.from_json(T, jsn)

Imports tables exported using .to_json

Source code in tablite/import_utils.py
def from_json(T, jsn):
    """
    Imports tables exported using .to_json
    """
    if not issubclass(T, BaseTable):
        raise TypeError("Expected subclass of Table")
    import json

    type_check(jsn, str)
    d = json.loads(jsn)
    return T(columns=d["columns"])
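
A minimal round-trip sketch; from_json expects the JSON string produced by .to_json, with the column data under a "columns" key:

from tablite import Table
from tablite.import_utils import from_json

t = Table(columns={"a": [1, 2, 3]})
jsn = t.to_json()            # a str, not a dict
t2 = from_json(Table, jsn)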

tablite.import_utils.from_html(T, path, tqdm=_tqdm, pbar=None)

Source code in tablite/import_utils.py
def from_html(T, path, tqdm=_tqdm, pbar=None):
    if not issubclass(T, BaseTable):
        raise TypeError("Expected subclass of Table")
    type_check(path, Path)

    if pbar is None:
        total = path.stat().st_size
        pbar = tqdm(total=total, desc="from_html", disable=Config.TQDM_DISABLE)

    row_start, row_end = "<tr>", "</tr>"
    value_start, value_end = "<th>", "</th>"
    chunk = ""
    t = None  # will be T()
    start, end = 0, 0
    data = {}
    with path.open("r") as fi:
        while True:
            start = chunk.find(row_start, start)  # row tag start
            end = chunk.find(row_end, end)  # row tag end
            if start == -1 or end == -1:
                new = fi.read(100_000)
                pbar.update(len(new))
                if new == "":
                    break
                chunk += new
                continue
            # extract the row from the chunk
            row = chunk[start + len(row_start) : end]
            fields = [v.rstrip(value_end) for v in row.split(value_start)]

            # consume the row and reset the search offsets before branching,
            # otherwise the header row is re-matched and appended as data
            chunk = chunk[end + len(row_end) :]
            start, end = 0, 0

            if not data:
                headers = fields[:]
                data = {f: [] for f in headers}
                continue

            for field, header in zip(fields, headers):
                data[header].append(field)

            if len(data[headers[0]]) == Config.PAGE_SIZE:
                if t is None:
                    t = T(columns=data)
                else:
                    for k, v in data.items():
                        t[k].extend(DataTypes.guess(v))
                data = {f: [] for f in headers}

    if t is None:
        t = T(columns=data)  # fewer rows than Config.PAGE_SIZE: no table was created in the loop
    else:
        for k, v in data.items():
            t[k].extend(DataTypes.guess(v))
    return t
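
The parser streams the file in 100 kB chunks, treats <tr>…</tr> as row delimiters and <th>…</th> as value delimiters, and takes the first row as headers; the <th>-only value markers suggest it targets tablite's own .to_html exports. A minimal sketch with a hypothetical file:

from pathlib import Path
from tablite import Table
from tablite.import_utils import from_html

p = Path("demo.html")
p.write_text("<tr><th>a</th><th>b</th></tr>"
             "<tr><th>1</th><th>2</th></tr>"
             "<tr><th>3</th><th>4</th></tr>")
t = from_html(Table, p)   # headers from the first row, values type-guessed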

tablite.import_utils.excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty='NONE', start=0, limit=sys.maxsize, tqdm=_tqdm, **kwargs)

returns Table from excel

**kwargs are excess arguments that are ignored.

Source code in tablite/import_utils.py
def excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty="NONE", start=0, limit=sys.maxsize, tqdm=_tqdm, **kwargs):
    """
    returns Table from excel

    **kwargs are excess arguments that are ignored.
    """
    if not issubclass(T, BaseTable):
        raise TypeError("Expected subclass of Table")

    book = openpyxl.load_workbook(path, read_only=True, data_only=True)

    if sheet is None:  # help the user.
        """
            If no sheet specified, assume first sheet.

            Reasoning:
                Pandas ODS reader does that, so this preserves parity and it might be expected by users.
                If we don't know the sheet name but only have a single sheet,
                    we would need to take extra steps to find out its name.
                We already make assumptions in case of column selection,
                    when columns are None, we import all of them.
        """
        sheet = book.sheetnames[0]
    elif sheet not in book.sheetnames:
        raise ValueError(f"sheet not found: {sheet}")

    if not (isinstance(start, int) and start >= 0):
        raise ValueError("expected start as an integer >=0")
    if not (isinstance(limit, int) and limit > 0):
        raise ValueError("expected limit as integer > 0")

    worksheet = book[sheet]
    fixup_worksheet(worksheet)

    try:
        it_header = worksheet.iter_rows(min_row=header_row_index + 1)
        while True:
            # get the first row to know our headers or the number of columns
            row = [strip_escape(c.value) for c in next(it_header)]
            break
        fields = [str(c) if c is not None else "" for c in row] # excel is offset by 1
    except StopIteration:
        # excel was empty, return empty table
        return T()

    if not first_row_has_headers:
        # since the first row did not contain headers, we use the column count to populate header names
        fields = [str(i) for i in range(len(fields))]

    if columns is None:
        # no columns were specified by the user, so we import all of them
        columns = []

        for f in fields:
            # fixup the duplicate column names
            columns.append(unique_name(f, columns))

        field_dict = {k: i for i, k in enumerate(columns)}
    else:
        field_dict = {}

        for k, i in ((k, fields.index(k)) for k in columns):
            # fixup the duplicate column names
            field_dict[unique_name(k, field_dict.keys())] = i

    # calculate our data rows iterator offset
    it_offset = start + (1 if first_row_has_headers else 0) + header_row_index + 1

    # attempt to fetch number of rows in the sheet
    total_rows = worksheet.max_row
    real_tqdm = True

    if total_rows is None:
        # max_row can be None in some cases, so we don't know how large the dataset is
        total_rows = it_offset + limit
        real_tqdm = False

    # create the actual data rows iterator
    it_rows = worksheet.iter_rows(min_row=it_offset, max_row=min(it_offset+limit, total_rows))
    it_used_indices = list(field_dict.values())

    # filter columns that we're not going to use
    it_rows_filtered = ([strip_escape(row[idx].value) for idx in it_used_indices] for row in it_rows)

    # create page directory
    workdir = Path(Config.workdir) / Config.pid
    pagesdir = workdir/"pages"
    pagesdir.mkdir(exist_ok=True, parents=True)

    field_names = list(field_dict.keys())
    column_count = len(field_names)

    page_fhs = None

    # prepopulate the table with columns
    table = T()
    for name in field_names:
        table[name] = Column(table.path)

    pbar_fname = path.name
    if len(pbar_fname) > 20:
        pbar_fname = pbar_fname[0:10] + "..." + pbar_fname[-7:]

    if real_tqdm:
        # we can create a true tqdm progress bar, make one
        tqdm_iter = tqdm(it_rows_filtered, total=total_rows, desc=f"importing excel: {pbar_fname}")
    else:
        """
            openpyxl was unable to precalculate the size of the excel file,
            and forcing a recalculation would require parsing the entire file.
            Drop the total from the progress bar in that case; an alternative
            would be Σ=1/x, but it doesn't look good, so just show iterations
            per second instead.
        """
        tqdm_iter = tqdm(it_rows_filtered, desc=f"importing excel: {pbar_fname}")

    tqdm_iter = iter(tqdm_iter)

    idx = 0

    while True:
        try:
            row = next(tqdm_iter)
        except StopIteration:
            break  # we may not know the size of the excel file beforehand, so loop until StopIteration

        if skip_empty == "ALL" and all(v is None for v in row):
            continue
        elif skip_empty == "ANY" and any(v is None for v in row):
            continue

        if idx % Config.PAGE_SIZE == 0:
            if page_fhs is not None:
                # we reached the max page file size, fix the pages
                [_fix_xls_page(table, c, fh) for c, fh in zip(field_names, page_fhs)]

            page_fhs = [None] * column_count

            for cidx in range(column_count):
                # allocate new pages
                pg_path = pagesdir / f"{next(Page.ids)}.npy"
                page_fhs[cidx] = open(pg_path, "wb")

        for fh, value in zip(page_fhs, row):
            """
                since excel values are already cast into the appropriate types, we do two passes per page

                we create our temporary custom format:
                packed type|packed byte count|packed bytes|...

                available types:
                    * q - int64
                    * d - float64
                    * s - string
                    * b - boolean
                    * n - none
                    * p - pickled (date, time, datetime)
            """
            dtype = type(value)

            if dtype == int:
                ptype, bytes_ = b'q', struct.pack('q', value) # pack int as int64
            elif dtype == float:
                ptype, bytes_ = b'd', struct.pack('d', value) # pack float as float64
            elif dtype == str:
                ptype, bytes_ = b's', value.encode("utf-8")   # pack string
            elif dtype == bool:
                ptype, bytes_ = b'b', b'1' if value else b'0' # pack boolean
            elif value is None:
                ptype, bytes_ = b'n', b''                     # pack none
            elif dtype in [date, time, datetime]:
                ptype, bytes_ = b'p', pkl.dumps(value)        # pack object types via pickle
            else:
                raise NotImplementedError()

            byte_count = struct.pack('I', len(bytes_))        # pack the payload size; it is unlikely to exceed uint32

            # dump object to file
            fh.write(ptype)
            fh.write(byte_count)
            fh.write(bytes_)

        idx = idx + 1

    if page_fhs is not None:
        # we reached end of the loop, fix the pages
        [_fix_xls_page(table, c, fh) for c, fh in zip(field_names, page_fhs)]

    return table
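
The page files written above use the temporary packed format described in the comments: one type byte, a uint32 payload size, then the payload. A hedged decoding sketch to illustrate the layout; read_packed_page is a hypothetical helper (in the library the page file handles are handed to _fix_xls_page):

import pickle as pkl
import struct

def read_packed_page(path):
    # decode records of the form: type byte | uint32 payload size | payload
    values = []
    with open(path, "rb") as fh:
        while (ptype := fh.read(1)):
            size = struct.unpack("I", fh.read(4))[0]
            payload = fh.read(size)
            if ptype == b"q":
                values.append(struct.unpack("q", payload)[0])  # int64
            elif ptype == b"d":
                values.append(struct.unpack("d", payload)[0])  # float64
            elif ptype == b"s":
                values.append(payload.decode("utf-8"))         # string
            elif ptype == b"b":
                values.append(payload == b"1")                 # boolean
            elif ptype == b"n":
                values.append(None)                            # none
            elif ptype == b"p":
                values.append(pkl.loads(payload))              # pickled date/time/datetime
    return values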

tablite.import_utils.ods_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty='NONE', start=0, limit=sys.maxsize, **kwargs)

returns Table from .ODS

Source code in tablite/import_utils.py
def ods_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty="NONE", start=0, limit=sys.maxsize, **kwargs):
    """
    returns Table from .ODS
    """
    if not issubclass(T, BaseTable):
        raise TypeError("Expected subclass of Table")

    if sheet is None:
        data = read_excel(str(path), header=None) # selects first sheet
    else:
        data = read_excel(str(path), sheet_name=sheet, header=None)

    data[isna(data)] = None  # convert any empty cells to None
    data = data.to_numpy().tolist() # convert pandas to list

    if skip_empty == "ALL" or skip_empty == "ANY":
        """ filter out all rows based on predicate that come after header row """
        fn_filter = any if skip_empty == "ALL" else all # this is intentional
        data = [
            row
            for ridx, row in enumerate(data)
            if ridx < header_row_index + (1 if first_row_has_headers else 0) or fn_filter(not (v is None or isinstance(v, str) and len(v) == 0) for v in row)
        ]

    data = np.array(data, dtype=np.object_) # cast back to numpy array for slicing but don't try to convert datatypes

    if not (isinstance(start, int) and start >= 0):
        raise ValueError("expected start as an integer >=0")
    if not (isinstance(limit, int) and limit > 0):
        raise ValueError("expected limit as integer > 0")

    t = T()

    used_columns_names = set()
    for ix, value in enumerate(data[header_row_index]):
        if first_row_has_headers:
            header, start_row_pos = "" if value is None else str(value), (1 + header_row_index)
        else:
            header, start_row_pos = f"_{ix + 1}", (0 + header_row_index)

        if columns is not None:
            if header not in columns:
                continue

        unique_column_name = unique_name(str(header), used_columns_names)
        used_columns_names.add(unique_column_name)

        column_values = data[start_row_pos : start_row_pos + limit, ix]

        t[unique_column_name] = column_values
    return t
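
A minimal usage sketch with illustrative file and sheet names. Note the skip_empty semantics: "ALL" drops rows where all cells are empty, "ANY" drops rows where any cell is empty, hence the inverted any/all predicate in the filter above:

from tablite import Table
from tablite.import_utils import ods_reader

t = ods_reader(Table, "data.ods", sheet="Sheet1", skip_empty="ALL", limit=1000)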

tablite.import_utils.text_reader_task(source, destination, start, end, guess_datatypes, delimiter, text_qualifier, text_escape_openings, text_escape_closures, strip_leading_and_tailing_whitespace, encoding, newline_offsets, fields)

PARALLEL TASK FUNCTION: reads column names + path[start:end] into binary pages.

source: csv or txt file
destination: list of page filenames.
start: int: start of page.
end: int: end of page.
guess_datatypes: bool: if True, datatypes will be inferred by DataTypes.guess
delimiter: ',' ';' or '|'
text_qualifier: str: commonly "
text_escape_openings: str: default: "({[
text_escape_closures: str: default: ]})"
strip_leading_and_tailing_whitespace: bool
encoding: chardet encoding ('utf-8', 'ascii', ..., 'ISO-2022-CN')

Source code in tablite/import_utils.py
def text_reader_task(
    source,
    destination,
    start,
    end,
    guess_datatypes,
    delimiter,
    text_qualifier,
    text_escape_openings,
    text_escape_closures,
    strip_leading_and_tailing_whitespace,
    encoding,
    newline_offsets,
    fields
):
    """PARALLEL TASK FUNCTION
    reads columnsname + path[start:limit] into hdf5.

    source: csv or txt file
    destination: filename for page.
    start: int: start of page.
    end: int: end of page.
    guess_datatypes: bool: if True datatypes will be inferred by datatypes.Datatypes.guess
    delimiter: ',' ';' or '|'
    text_qualifier: str: commonly \"
    text_escape_openings: str: default: "({[
    text_escape_closures: str: default: ]})"
    strip_leading_and_tailing_whitespace: bool
    encoding: chardet encoding ('utf-8, 'ascii', ..., 'ISO-22022-CN')
    """
    if isinstance(source, str):
        source = Path(source)
    type_check(source, Path)
    if not source.exists():
        raise FileNotFoundError(f"File not found: {source}")
    type_check(destination, list)

    # declare CSV dialect.
    delim = delimiter

    class Dialect(csv.Dialect):
        delimiter = delim
        quotechar = '"' if text_qualifier is None else text_qualifier
        escapechar = '\\'
        doublequote = True
        quoting = csv.QUOTE_MINIMAL
        skipinitialspace = False if strip_leading_and_tailing_whitespace is None else strip_leading_and_tailing_whitespace
        lineterminator = "\n"

    with source.open("r", encoding=encoding, errors="ignore") as fi:  # --READ
        fi.seek(newline_offsets[start])
        reader = csv.reader(fi, dialect=Dialect)

        # if file handles become a problem on windows, open each destination on demand and append, instead of keeping all handles open at once
        page_file_handlers = [open(f, mode="wb") for f in destination]

        # identify longest str
        longest_str = [1 for _ in range(len(destination))]
        for row in (next(reader) for _ in range(end - start)):
            for idx, c in ((fields[idx], c) for idx, c in filter(lambda t: t[0] in fields, enumerate(row))):
                longest_str[idx] = max(longest_str[idx], len(c))

        column_formats = [f"<U{i}" for i in longest_str]
        for idx, cf in enumerate(column_formats):
            _create_numpy_header(cf, (end - start, ), page_file_handlers[idx])

        # write page arrays to files
        fi.seek(newline_offsets[start])
        for row in (next(reader) for _ in range(end - start)):
            for idx, c in ((fields[idx], c) for idx, c in filter(lambda t: t[0] in fields, enumerate(row))):
                cbytes = np.asarray(c, dtype=column_formats[idx]).tobytes()
                page_file_handlers[idx].write(cbytes)

        [phf.close() for phf in page_file_handlers]
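
TRconfig mirrors this signature field for field, so a page task can be described once and dispatched with **cfg.dict(). A runnable sketch with illustrative values; normally the importer orchestrates this and precomputes the newline offsets:

from pathlib import Path

from tablite.import_utils import TRconfig, text_reader_task

p = Path("demo.csv")
p.write_text("a,b\n1,2\n3,4\n")
offsets = [0, 4, 8]   # byte offset where each line starts

cfg = TRconfig(
    source=str(p), destination=["a_page.npy", "b_page.npy"],
    start=1, end=3,   # data rows only; row 0 is the header
    guess_datatypes=False, delimiter=",", text_qualifier=None,
    text_escape_openings="", text_escape_closures="",
    strip_leading_and_tailing_whitespace=False, encoding="utf-8",
    newline_offsets=offsets, fields={0: 0, 1: 1},
)
text_reader_task(**cfg.dict())   # writes one numpy page per column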

tablite.import_utils.text_reader(T, path, columns, first_row_has_headers, header_row_index, encoding, start, limit, newline, guess_datatypes, text_qualifier, strip_leading_and_tailing_whitespace, skip_empty, delimiter, text_escape_openings, text_escape_closures, tqdm=_tqdm, **kwargs)

Source code in tablite/import_utils.py
def text_reader(
    T,
    path,
    columns,
    first_row_has_headers,
    header_row_index,
    encoding,
    start,
    limit,
    newline,
    guess_datatypes,
    text_qualifier,
    strip_leading_and_tailing_whitespace,
    skip_empty,
    delimiter,
    text_escape_openings,
    text_escape_closures,
    tqdm=_tqdm,
    **kwargs,
):
    if encoding is None:
        encoding = get_encoding(path, nbytes=ENCODING_GUESS_BYTES)

    enc = py_to_nim_encoding(encoding)
    pid = Config.workdir / Config.pid
    kwargs = {}

    if first_row_has_headers is not None:
        kwargs["first_row_has_headers"] = first_row_has_headers
    if header_row_index is not None:
        kwargs["header_row_index"] = header_row_index
    if columns is not None:
        kwargs["columns"] = columns
    if start is not None:
        kwargs["start"] = start
    if limit is not None and limit != sys.maxsize:
        kwargs["limit"] = limit
    if guess_datatypes is not None:
        kwargs["guess_datatypes"] = guess_datatypes
    if newline is not None:
        kwargs["newline"] = newline
    if delimiter is not None:
        kwargs["delimiter"] = delimiter
    if text_qualifier is not None:
        kwargs["text_qualifier"] = text_qualifier
        kwargs["quoting"] = "QUOTE_MINIMAL"
    else:
        kwargs["quoting"] = "QUOTE_NONE"
    if strip_leading_and_tailing_whitespace is not None:
        kwargs["strip_leading_and_tailing_whitespace"] = strip_leading_and_tailing_whitespace

    if skip_empty is None:
        kwargs["skip_empty"] = "NONE"
    else:
        kwargs["skip_empty"] = skip_empty

    return nimlite.text_reader(
        T, pid, path, enc,
        **kwargs,
        tqdm=tqdm
    )
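
text_reader forwards only the parameters that were actually provided, leaving the rest to the nim implementation's defaults, and derives quoting from text_qualifier. A hedged sketch of a direct call with an illustrative file; most callers reach this through the file_readers dispatch table:

import sys
from pathlib import Path
from tablite import Table
from tablite.import_utils import text_reader

t = text_reader(
    Table, Path("data.csv"), columns=None,
    first_row_has_headers=True, header_row_index=0,
    encoding=None,               # None triggers detection via get_encoding
    start=0, limit=sys.maxsize,  # maxsize means "no limit" and is not forwarded
    newline="\n", guess_datatypes=True, text_qualifier='"',
    strip_leading_and_tailing_whitespace=True, skip_empty="NONE",
    delimiter=",", text_escape_openings="(", text_escape_closures=")",
)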

Modules