Benchmarks¶
These benchmarks seek to establish the performance of tablite as a user sees it.
Overview

Input/Output | Various column functions | Base functions | Core functions |
---|---|---|---|
Save / Load .tpz format<br>Save tables to various formats<br>Import data from various formats | Setitem / getitem<br>iter<br>equal, not equal<br>copy<br>t += t<br>t *= t<br>contains<br>remove all<br>replace<br>index<br>unique<br>histogram<br>statistics<br>count | Setitem / getitem<br>iter / rows<br>equal, not equal<br>load<br>save<br>copy<br>stack<br>types<br>display_dict<br>show<br>to_dict<br>as_json_serializable<br>index | expression<br>filter<br>sort_index<br>reindex<br>drop_duplicates<br>sort<br>is_sorted<br>any<br>all<br>drop<br>replace<br>groupby<br>pivot<br>joins<br>lookup<br>replace missing values<br>transpose<br>pivot_transpose<br>diff |
In [2]:
```python
import psutil, os, gc, shutil, tempfile
from pathlib import Path
from time import perf_counter, time
from tablite import Table
from tablite.datasets import synthetic_order_data
from tablite.config import Config

Config.TQDM_DISABLE = True
```
Input / Output¶
Create tables from synthetic data.¶
In [3]:
```python
process = psutil.Process(os.getpid())

def make_tables(sizes=[1, 2, 5, 10, 20, 50]):
    # The last tables are too big for RAM (~24 GB), so I create subtables of 1M rows and append them.
    t = synthetic_order_data(Config.PAGE_SIZE)
    real, flat = t.nbytes()
    print(f"Table {len(t):,} rows is {real/1e6:,.0f} Mb on disk")
    tables = [t]  # 1M rows.
    last = 1
    t2 = t.copy()
    for i in sizes[1:]:
        t2 = t2.copy()
        for _ in range(i - last):
            t2 += synthetic_order_data(Config.PAGE_SIZE)  # these are all unique
        last = i
        real, flat = t2.nbytes()
        tables.append(t2)
        print(f"Table {len(t2):,} rows is {real/1e6:,.0f} Mb on disk")
    return tables

tables = make_tables()
```
Table 1,000,000 rows is 256 Mb on disk
Table 2,000,000 rows is 512 Mb on disk
Table 5,000,000 rows is 1,280 Mb on disk
Table 10,000,000 rows is 2,560 Mb on disk
Table 20,000,000 rows is 5,120 Mb on disk
Table 50,000,000 rows is 12,800 Mb on disk
The values in the tables above are all unique!
In [4]:
```python
tables[-1]
```
Out[4]:
~ | # | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1401495318295 | 2021-10-06T00:00:00 | 50814 | 1 | 19375 | C3-4 | HGQ | 21° | XYZ | 1.2446472682017344 | 21.367107051830455 |
1 | 2 | 932023137218 | 2021-08-26T00:00:00 | 50077 | 1 | 8568 | C5-5 | FZU | 0° | 0.5529448534751613 | 2.6980406874392537 | |
2 | 3 | 1256960225081 | 2021-12-21T00:00:00 | 50197 | 0 | 29074 | C2-3 | GTK | 6° | XYZ | 1.997397545590656 | 17.513164305723787 |
3 | 4 | 1401277781743 | 2021-08-23T00:00:00 | 50818 | 0 | 24969 | C4-3 | BYP | 6° | XYZ | 0.04749712553828957 | 7.388171617130485 |
4 | 5 | 942666767426 | 2021-07-31T00:00:00 | 50307 | 1 | 13074 | C5-2 | CCC | 21° | ABC | 1.021921502761288 | 5.21324123446987 |
5 | 6 | 1218613185127 | 2021-12-01T00:00:00 | 50484 | 1 | 17249 | C5-4 | WGT | 21° | 0.20387642584345567 | 12.190974436133764 | |
6 | 7 | 607042434398 | 2021-11-29T00:00:00 | 50578 | 0 | 11564 | C2-3 | LUL | 0° | XYZ | 2.236783515848044 | 4.340628097363572 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49,999,993 | 999994 | 660269377547 | 2021-09-17T00:00:00 | 50154 | 0 | 9706 | C4-3 | AHQ | 21° | XYZ | 0.08321664584312585 | 6.567802977527905 |
49,999,994 | 999995 | 570979864695 | 2021-08-01T00:00:00 | 50149 | 1 | 25006 | C1-2 | FWH | 6° | 1.0476392366226641 | 9.507105444627065 | |
49,999,995 | 999996 | 355195607825 | 2021-07-29T00:00:00 | 50007 | 0 | 26992 | C4-3 | GVG | 21° | 2.204408165609414 | 11.27064439742849 | |
49,999,996 | 999997 | 2076224057728 | 2021-10-16T00:00:00 | 50950 | 1 | 13339 | C5-4 | NKS | 0° | 2.159311049813549 | 4.215756200465961 | |
49,999,997 | 999998 | 657724789135 | 2021-12-21T00:00:00 | 50069 | 1 | 14747 | C2-4 | LYG | None | 1.6480964019169868 | 3.0944204836258273 | |
49,999,998 | 999999 | 977531243884 | 2021-12-02T00:00:00 | 50644 | 1 | 29345 | C2-5 | DRH | 6° | 2.309114216927531 | 10.827068672071468 | |
49,999,999 | 1000000 | 1229071392065 | 2021-08-23T00:00:00 | 50706 | 1 | 19732 | C4-5 | AGB | 6° | 0.48887140559369163 | 0.8580085696389939 |
Save / Load .tpz format¶
With the default compression settings (roughly 20% of the uncompressed file size; the comparison of compression options further down shows the speed trade-off).
In [5]:
```python
def save_load_benchmarks(tables):
    tmp = Path(tempfile.gettempdir()) / "junk"
    tmp.mkdir(exist_ok=True)
    results = Table()
    results.add_columns('rows', 'save (sec)', 'load (sec)')
    for t in tables:
        fn = tmp / f'{len(t)}.tpz'
        start = perf_counter()
        t.save(fn)
        end = perf_counter()
        save = round(end-start, 3)
        assert fn.exists()

        start = perf_counter()
        t2 = Table.load(fn)
        end = perf_counter()
        load = round(end-start, 3)
        print(f"saving {len(t):,} rows ({fn.stat().st_size/1e6:,.0f} Mb) took {save:,.3f} seconds. loading took {load:,.3f} seconds")
        del t2
        fn.unlink()
        results.add_rows(len(t), save, load)

    r = results
    r['save r/sec'] = [int(a/b) if b != 0 else "nil" for a, b in zip(r['rows'], r['save (sec)'])]
    r['load r/sec'] = [int(a/b) if b != 0 else "nil" for a, b in zip(r['rows'], r['load (sec)'])]
    return results
```
In [6]:
```python
slb = save_load_benchmarks(tables)
```
saving 1,000,000 rows (49 Mb) took 2.148 seconds. loading took 0.922 seconds
saving 2,000,000 rows (98 Mb) took 4.267 seconds. loading took 1.820 seconds
saving 5,000,000 rows (246 Mb) took 10.618 seconds. loading took 4.482 seconds
saving 10,000,000 rows (492 Mb) took 21.291 seconds. loading took 8.944 seconds
saving 20,000,000 rows (984 Mb) took 42.603 seconds. loading took 17.821 seconds
saving 50,000,000 rows (2,461 Mb) took 106.644 seconds. loading took 44.600 seconds
In [7]:
```python
slb
```
Out[7]:
# | rows | save (sec) | load (sec) | save r/sec | load r/sec |
---|---|---|---|---|---|
0 | 1000000 | 2.148 | 0.922 | 465549 | 1084598 |
1 | 2000000 | 4.267 | 1.82 | 468713 | 1098901 |
2 | 5000000 | 10.618 | 4.482 | 470898 | 1115573 |
3 | 10000000 | 21.291 | 8.944 | 469682 | 1118067 |
4 | 20000000 | 42.603 | 17.821 | 469450 | 1122271 |
5 | 50000000 | 106.644 | 44.6 | 468849 | 1121076 |
With various compression options
In [8]:
```python
def save_compression_benchmarks(t):
    tmp = Path(tempfile.gettempdir()) / "junk"
    tmp.mkdir(exist_ok=True)

    import zipfile  # https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile
    methods = [(None, zipfile.ZIP_STORED, "zip stored"), (None, zipfile.ZIP_LZMA, "zip lzma")]
    methods += [(i, zipfile.ZIP_DEFLATED, "zip deflated") for i in range(0, 10)]
    methods += [(i, zipfile.ZIP_BZIP2, "zip bzip2") for i in range(1, 10)]

    results = Table()
    results.add_columns('file size (Mb)', 'method', 'write (sec)', 'read (sec)')

    for level, method, name in methods:
        fn = tmp / f'{len(t)}.tpz'
        start = perf_counter()
        t.save(fn, compression_method=method, compression_level=level)
        end = perf_counter()
        write = round(end-start, 3)
        assert fn.exists()
        size = int(fn.stat().st_size/1e6)
        # print(f"{name}(level={level}): {len(t):,} rows ({size} Mb) took {write:,.3f} seconds to save", end='')
        start = perf_counter()
        t2 = Table.load(fn)
        end = perf_counter()
        read = round(end-start, 3)
        # print(f" and {end-start:,.3} seconds to load")
        print(".", end='')
        del t2
        fn.unlink()
        results.add_rows(size, f"{name}(level={level})", write, read)

    r = results
    r.sort({'write (sec)': True})
    r['write (rps)'] = [int(1_000_000/b) for b in r['write (sec)']]
    r['read (rps)'] = [int(1_000_000/b) for b in r['read (sec)']]
    return results
```
In [9]:
```python
scb = save_compression_benchmarks(tables[0])
```
.....................
In [10]:
```python
scb[0:20]
```
Out[10]:
# | file size (Mb) | method | write (sec) | read (sec) | write (rps) | read (rps) |
---|---|---|---|---|---|---|
0 | 256 | zip stored(level=None) | 0.396 | 0.475 | 2525252 | 2105263 |
1 | 29 | zip lzma(level=None) | 95.137 | 2.228 | 10511 | 448833 |
2 | 256 | zip deflated(level=0) | 0.535 | 0.595 | 1869158 | 1680672 |
3 | 49 | zip deflated(level=1) | 2.15 | 0.922 | 465116 | 1084598 |
4 | 47 | zip deflated(level=2) | 2.264 | 0.912 | 441696 | 1096491 |
5 | 43 | zip deflated(level=3) | 3.049 | 0.83 | 327976 | 1204819 |
6 | 44 | zip deflated(level=4) | 2.92 | 0.862 | 342465 | 1160092 |
7 | 42 | zip deflated(level=5) | 4.034 | 0.869 | 247892 | 1150747 |
8 | 40 | zip deflated(level=6) | 8.558 | 0.8 | 116849 | 1250000 |
9 | 39 | zip deflated(level=7) | 13.695 | 0.778 | 73019 | 1285347 |
10 | 38 | zip deflated(level=8) | 56.972 | 0.792 | 17552 | 1262626 |
11 | 38 | zip deflated(level=9) | 122.623 | 0.791 | 8155 | 1264222 |
12 | 29 | zip bzip2(level=1) | 15.121 | 4.065 | 66133 | 246002 |
13 | 29 | zip bzip2(level=2) | 16.047 | 4.214 | 62316 | 237304 |
14 | 29 | zip bzip2(level=3) | 16.858 | 4.409 | 59319 | 226808 |
15 | 29 | zip bzip2(level=4) | 17.648 | 5.141 | 56663 | 194514 |
16 | 29 | zip bzip2(level=5) | 18.674 | 6.009 | 53550 | 166417 |
17 | 29 | zip bzip2(level=6) | 19.405 | 6.628 | 51533 | 150875 |
18 | 29 | zip bzip2(level=7) | 19.954 | 6.714 | 50115 | 148942 |
19 | 29 | zip bzip2(level=8) | 20.595 | 6.961 | 48555 | 143657 |
Conclusions

- Fastest: zip stored (no compression) writes at ~2.5M rows/sec and reads at ~2.1M rows/sec, but produces the largest file (256 Mb).
- Smallest: zip lzma and zip bzip2 shrink the file to ~29 Mb, but writes are far slower (lzma ~95 seconds, bzip2 15-21 seconds) and reads slow down too.
- Zip deflated at level 1 (the settings that match the plain save/load run above) is a reasonable middle ground: 49 Mb at ~465k rows/sec write and ~1.1M rows/sec read.
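The choice is passed straight to `Table.save`, as in the benchmark code above. A minimal sketch (assuming `t` is a `Table` and `fn` is a `pathlib.Path`; the keyword names are the ones used in `save_compression_benchmarks`):

```python
import zipfile

# fastest write/read, but the largest file: store the pages without compression
t.save(fn, compression_method=zipfile.ZIP_STORED, compression_level=None)

# deflated at level 1: ~20% of the uncompressed size in the run above,
# at roughly 5x the write time of "stored"
t.save(fn, compression_method=zipfile.ZIP_DEFLATED, compression_level=1)
```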
Save / load tables to / from various formats¶
The handlers for saving / export are listed below (a short round-trip sketch follows the list):
- to_sql
- to_json
- to_xlsx
- to_ods
- to_csv
- to_tsv
- to_text
- to_html
- to_hdf5
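As a round-trip sketch using `to_csv` together with `Table.from_file` (the file name is arbitrary; the keyword names match the benchmark code below):

```python
from pathlib import Path
import tempfile

target = Path(tempfile.gettempdir()) / "orders.csv"

t.to_csv(path=target)               # export the table as CSV
t2 = Table.from_file(path=target)   # import it again; datatypes are guessed by default
```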
In [11]:
```python
def to_sql_benchmark(t, rows=1_000_000):
    t2 = t[:rows]
    write_start = time()
    _ = t2.to_sql(name='1')
    write_end = time()
    write = round(write_end-write_start, 3)
    return (t.to_sql.__name__, write, 0, len(t2), "", "")
```
In [12]:
```python
def to_json_benchmark(t, rows=1_000_000):
    t2 = t[:rows]
    tmp = Path(tempfile.gettempdir()) / "junk"
    tmp.mkdir(exist_ok=True)
    path = tmp / "1.json"

    write_start = time()
    bytestr = t2.to_json()
    with path.open('w') as fo:
        fo.write(bytestr)
    write_end = time()
    write = round(write_end-write_start, 3)

    read_start = time()
    with path.open('r') as fi:
        _ = Table.from_json(fi.read())  # <-- JSON
    read_end = time()
    read = round(read_end-read_start, 3)
    return (t.to_json.__name__, write, read, len(t2), int(path.stat().st_size/1e6), "")
```
In [13]:
```python
def f(t, args):
    # generic benchmark: time an export method (c1) on a slice of t, then the matching import callable (c2).
    rows, c1, c1_kw, c2, c2_kw = args
    t2 = t[:rows]
    call = getattr(t2, c1)
    assert callable(call)

    write_start = time()
    call(**c1_kw)
    write_end = time()
    write = round(write_end-write_start, 3)

    for _ in range(10):
        gc.collect()

    read_start = time()
    if callable(c2):
        c2(**c2_kw)
    read_end = time()
    read = round(read_end-read_start, 3)

    fn = c2_kw['path']
    assert fn.exists()
    fs = int(fn.stat().st_size/1e6)
    config = {k: v for k, v in c2_kw.items() if k != 'path'}
    return (c1, write, read, len(t2), fs, str(config))
```
In [14]:
```python
def import_export_benchmarks(tables):
    Config.PROCESSING_MODE = Config.FALSE
    t = sorted(tables, key=lambda x: len(x), reverse=True)[0]

    tmp = Path(tempfile.gettempdir()) / "junk"
    tmp.mkdir(exist_ok=True)

    args = [
        (   100_000, "to_xlsx", {'path': tmp/'1.xlsx'}, Table.from_file, {"path": tmp/'1.xlsx', "sheet": "pyexcel_sheet1"}),
        (    50_000, "to_ods",  {'path': tmp/'1.ods'},  Table.from_file, {"path": tmp/'1.ods', "sheet": "pyexcel_sheet1"}),  # 50k rows, otherwise MemoryError.
        ( 1_000_000, "to_csv",  {'path': tmp/'1.csv'},  Table.from_file, {"path": tmp/'1.csv'}),
        ( 1_000_000, "to_csv",  {'path': tmp/'1.csv'},  Table.from_file, {"path": tmp/'1.csv', "guess_datatypes": False}),
        (10_000_000, "to_csv",  {'path': tmp/'1.csv'},  Table.from_file, {"path": tmp/'1.csv', "guess_datatypes": False}),
        ( 1_000_000, "to_tsv",  {'path': tmp/'1.tsv'},  Table.from_file, {"path": tmp/'1.tsv'}),
        ( 1_000_000, "to_text", {'path': tmp/'1.txt'},  Table.from_file, {"path": tmp/'1.txt'}),
        ( 1_000_000, "to_html", {'path': tmp/'1.html'}, Table.from_file, {"path": tmp/'1.html'}),
        ( 1_000_000, "to_hdf5", {'path': tmp/'1.hdf5'}, Table.from_file, {"path": tmp/'1.hdf5'}),
    ]

    results = Table()
    results.add_columns('method', 'write (s)', 'read (s)', 'rows', 'size (Mb)', 'config')
    results.add_rows( to_sql_benchmark(t) )
    results.add_rows( to_json_benchmark(t) )

    for arg in args:
        if len(t) < arg[0]:
            continue
        print(".", end='')
        try:
            results.add_rows( f(t, arg) )
        except MemoryError:
            results.add_rows( arg[1], "Memory Error", "NIL", arg[0], "NIL", "N/A")

    r = results
    r['read r/sec'] = [int(a/b) if b != 0 else "nil" for a, b in zip(r['rows'], r['read (s)'])]
    r['write r/sec'] = [int(a/b) if b != 0 else "nil" for a, b in zip(r['rows'], r['write (s)'])]
    shutil.rmtree(tmp)
    return results
```
In [15]:
```python
ieb = import_export_benchmarks(tables)
```
.........writing 12,000,000 records to /tmp/junk/1.hdf5... done
In [16]:
```python
ieb
```
Out[16]:
# | method | write (s) | read (s) | rows | size (Mb) | config | read r/sec | write r/sec |
---|---|---|---|---|---|---|---|---|
0 | to_sql | 12.345 | 0 | 1000000 | | | nil | 81004 |
1 | to_json | 10.814 | 4.406 | 1000000 | 142 | | 226963 | 92472 |
2 | to_xlsx | 10.569 | 21.572 | 100000 | 9 | {'sheet': 'pyexcel_sheet1'} | 4635 | 9461 |
3 | to_ods | 29.175 | 29.487 | 50000 | 3 | {'sheet': 'pyexcel_sheet1'} | 1695 | 1713 |
4 | to_csv | 14.315 | 15.731 | 1000000 | 108 | {} | 63568 | 69856 |
5 | to_csv | 14.438 | 8.169 | 1000000 | 108 | {'guess_datatypes': False} | 122414 | 69261 |
6 | to_csv | 140.645 | 99.45 | 10000000 | 1080 | {'guess_datatypes': False} | 100553 | 71100 |
7 | to_tsv | 13.834 | 15.763 | 1000000 | 108 | {} | 63439 | 72285 |
8 | to_text | 13.937 | 15.682 | 1000000 | 108 | {} | 63767 | 71751 |
9 | to_html | 12.578 | 0.53 | 1000000 | 228 | {} | 1886792 | 79503 |
10 | to_hdf5 | 5.01 | 12.345 | 1000000 | 316 | {} | 81004 | 199600 |
Conclusions

Best:

- to/from JSON wins the full round trip with ~230k rows/sec on read.
- to/from CSV/TSV/TEXT come second; with `guess_datatypes=False` they read at ~100-120k rows/sec.

Worst:

- to/from ods blew the memory budget and hence had to be reduced to 50k rows. It also had the slowest read rate, at ~1,700 rows/sec.
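If read speed matters more than automatic type detection, the fastest text-based path from the table above can be requested explicitly. A sketch (assuming `p` points at a CSV previously written with `to_csv`; presumably values then come back as text rather than typed values):

```python
# ~100-120k rows/sec in the runs above, versus ~64k rows/sec with datatype guessing
t = Table.from_file(path=p, guess_datatypes=False)
```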
Various column functions¶
- Setitem / getitem
- iter
- equal, not equal
- copy
- t += t
- t *= t
- contains
- remove all
- replace
- index
- unique
- histogram
- statistics
- count
In [17]:
```python
def contains_benchmark(table):
    results = Table()
    results.add_columns("column", "time (s)")
    for name, col in table.columns.items():
        n = len(col)
        start, stop, step = int(n*0.02), int(n*0.98), int(n/100)
        selection = col[start:stop:step]

        total_time = 0.0
        for v in selection:
            start_time = perf_counter()
            v in col  # <--- test!
            end_time = perf_counter()
            total_time += (end_time - start_time)
        avg_time = total_time / len(selection)
        results.add_rows(name, round(avg_time, 3))
    return results
```
In [18]:
```python
has_it = contains_benchmark(tables[-1])
has_it
```
Out[18]:
# | column | time (s) |
---|---|---|
0 | # | 0.001 |
1 | 1 | 0.043 |
2 | 2 | 0.032 |
3 | 3 | 0.001 |
4 | 4 | 0.001 |
5 | 5 | 0.001 |
6 | 6 | 0.006 |
7 | 7 | 0.003 |
8 | 8 | 0.006 |
9 | 9 | 0.007 |
10 | 10 | 0.043 |
11 | 11 | 0.655 |
Various table functions¶
Slicing¶
Slicing operations are used in many places.
In [19]:
```python
def slicing_benchmark(table):
    n = len(table)
    start, stop, step = int(0.02*n), int(0.98*n), int(n / 20)  # from 2% to 98% in 20 large steps
    start_time = perf_counter()
    snip = table[start:stop:step]
    end_time = perf_counter()
    print(f"reading {len(table):,} rows to find {len(snip):,} rows took {end_time-start_time:.3f} sec")
    return snip
```
In [20]:
```python
slice_it = slicing_benchmark(tables[-1])
```
reading 50,000,000 rows to find 20 rows took 1.435 sec
In [22]:
```python
def column_selection_benchmark(tables):
    results = Table()
    results.add_columns('rows')
    results.add_columns(*[f"n cols={i}" for i, _ in enumerate(tables[0].columns, start=1)])
    for table in tables:
        rr = [len(table)]
        for ix, name in enumerate(table.columns):
            cols = list(table.columns)[:ix+1]
            start_time = perf_counter()
            table[cols]
            end_time = perf_counter()
            rr.append(f"{end_time-start_time:.5f}")
        results.add_rows(rr)
    return results
```
In [23]:
```python
csb = column_selection_benchmark(tables)
print("times below are in seconds")
csb
```
times below are in seconds
Out[23]:
# | rows | n cols=1 | n cols=2 | n cols=3 | n cols=4 | n cols=5 | n cols=6 | n cols=7 | n cols=8 | n cols=9 | n cols=10 | n cols=11 | n cols=12 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1000000 | 0.00001 | 0.00006 | 0.00004 | 0.00004 | 0.00004 | 0.00004 | 0.00004 | 0.00004 | 0.00004 | 0.00004 | 0.00004 | 0.00004 |
1 | 2000000 | 0.00001 | 0.00008 | 0.00003 | 0.00003 | 0.00003 | 0.00003 | 0.00003 | 0.00003 | 0.00003 | 0.00003 | 0.00004 | 0.00004 |
2 | 5000000 | 0.00001 | 0.00005 | 0.00004 | 0.00004 | 0.00004 | 0.00004 | 0.00004 | 0.00004 | 0.00004 | 0.00004 | 0.00004 | 0.00004 |
3 | 10000000 | 0.00002 | 0.00005 | 0.00004 | 0.00004 | 0.00004 | 0.00004 | 0.00007 | 0.00005 | 0.00005 | 0.00005 | 0.00005 | 0.00005 |
4 | 20000000 | 0.00003 | 0.00006 | 0.00005 | 0.00005 | 0.00005 | 0.00005 | 0.00006 | 0.00006 | 0.00006 | 0.00006 | 0.00006 | 0.00006 |
5 | 50000000 | 0.00009 | 0.00011 | 0.00010 | 0.00009 | 0.00009 | 0.00009 | 0.00009 | 0.00009 | 0.00009 | 0.00009 | 0.00010 | 0.00009 |
In [33]:
```python
def iterrows_benchmark(table):
    results = Table()
    results.add_columns('n columns', 'time (s)')
    columns = ['1']
    for column in list(table.columns):
        columns.append(column)
        snip = table[columns, slice(500_000, 1_500_000)]
        start_time = perf_counter()
        counts = 0
        for row in snip.rows:
            counts += 1
        end_time = perf_counter()
        results.add_rows(len(columns), round(end_time-start_time, 3))
    return results
```
In [34]:
```python
iterb = iterrows_benchmark(tables[-1])
iterb
```
Out[34]:
# | n columns | time (s) |
---|---|---|
0 | 2 | 9.951 |
1 | 3 | 9.816 |
2 | 4 | 9.859 |
3 | 5 | 9.93 |
4 | 6 | 9.985 |
5 | 7 | 9.942 |
6 | 8 | 9.958 |
7 | 9 | 9.867 |
8 | 10 | 9.96 |
9 | 11 | 9.932 |
10 | 12 | 9.83 |
11 | 13 | 9.861 |
In [35]:
```python
import matplotlib.pyplot as plt

plt.plot(iterb['n columns'], iterb['time (s)'])
plt.show()
```
In [28]:
```python
tables[-1].types()
```
Out[28]:
{'#': {int: 50000000}, '1': {int: 50000000}, '2': {str: 50000000}, '3': {int: 50000000}, '4': {int: 50000000}, '5': {int: 50000000}, '6': {str: 50000000}, '7': {str: 50000000}, '8': {str: 50000000}, '9': {str: 50000000}, '10': {float: 50000000}, '11': {str: 50000000}}
In [29]:
```python
def dtypes_benchmark(tables):
    dtypes_results = Table()
    dtypes_results.add_columns("rows", "time (s)")
    for table in tables:
        start_time = perf_counter()
        dt = table.types()
        end_time = perf_counter()
        assert isinstance(dt, dict) and len(dt) != 0
        dtypes_results.add_rows(len(table), round(end_time-start_time, 3))
    return dtypes_results
```
In [30]:
```python
dtype_b = dtypes_benchmark(tables)
dtype_b
```
Out[30]:
# | rows | time (s) |
---|---|---|
0 | 1000000 | 0.0 |
1 | 2000000 | 0.0 |
2 | 5000000 | 0.0 |
3 | 10000000 | 0.0 |
4 | 20000000 | 0.0 |
5 | 50000000 | 0.001 |
Table.any¶
In [31]:
```python
def any_benchmark(tables):
    results = Table()
    results.add_columns("rows", *list(tables[0].columns))
    for table in tables:
        tmp = [len(table)]
        for column in list(table.columns):
            v = table[column][0]
            start_time = perf_counter()
            _ = table.any(**{column: v})
            end_time = perf_counter()
            tmp.append(round(end_time-start_time, 3))
        results.add_rows(tmp)
    return results
```
In [32]:
```python
anyb = any_benchmark(tables)
anyb
```
Out[32]:
~ | rows | # | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1000000 | 0.133 | 0.133 | 0.178 | 0.133 | 0.292 | 0.147 | 0.169 | 0.143 | 0.227 | 0.259 | 0.146 | 0.17 |
1 | 2000000 | 0.268 | 0.263 | 0.343 | 0.265 | 0.567 | 0.294 | 0.335 | 0.275 | 0.464 | 0.523 | 0.289 | 0.323 |
2 | 5000000 | 0.669 | 0.653 | 0.914 | 0.669 | 1.436 | 0.723 | 0.838 | 0.694 | 1.174 | 1.335 | 0.678 | 0.818 |
3 | 10000000 | 1.314 | 1.35 | 1.745 | 1.336 | 2.902 | 1.49 | 1.683 | 1.414 | 2.354 | 2.618 | 1.343 | 1.536 |
4 | 20000000 | 2.556 | 2.534 | 3.337 | 2.602 | 5.645 | 2.827 | 3.225 | 2.646 | 4.514 | 5.08 | 2.693 | 3.083 |
5 | 50000000 | 6.571 | 6.423 | 8.455 | 6.699 | 14.484 | 7.989 | 7.798 | 6.259 | 10.989 | 12.48 | 6.732 | 7.767 |
Table.all¶
In [36]:
```python
def all_benchmark(tables):
    results = Table()
    results.add_columns("rows", *list(tables[0].columns))
    for table in tables:
        tmp = [len(table)]
        for column in list(table.columns):
            v = table[column][0]
            start_time = perf_counter()
            _ = table.all(**{column: v})
            end_time = perf_counter()
            tmp.append(round(end_time-start_time, 3))
        results.add_rows(tmp)
    return results
```
In [37]:
```python
allb = all_benchmark(tables)
allb
```
Out[37]:
~ | rows | # | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1000000 | 0.12 | 0.121 | 0.162 | 0.122 | 0.264 | 0.138 | 0.155 | 0.127 | 0.209 | 0.237 | 0.133 | 0.151 |
1 | 2000000 | 0.237 | 0.235 | 0.311 | 0.238 | 0.52 | 0.266 | 0.297 | 0.341 | 0.451 | 0.53 | 0.261 | 0.285 |
2 | 5000000 | 0.675 | 0.698 | 0.952 | 0.594 | 1.605 | 0.659 | 0.812 | 0.719 | 1.224 | 1.353 | 0.664 | 0.914 |
3 | 10000000 | 1.314 | 1.332 | 1.707 | 1.332 | 3.091 | 1.463 | 1.781 | 1.366 | 2.358 | 2.638 | 1.409 | 1.714 |
4 | 20000000 | 2.576 | 2.313 | 3.11 | 2.396 | 5.207 | 2.573 | 2.921 | 2.403 | 4.041 | 4.658 | 2.463 | 2.808 |
5 | 50000000 | 5.896 | 5.82 | 7.73 | 5.956 | 12.909 | 7.45 | 7.27 | 5.981 | 10.183 | 11.576 | 6.372 | 7.414 |
Table.filter¶
In [ ]:
Table.unique¶
In [38]:
```python
def unique_benchmark(tables):
    results = Table()
    results.add_columns("rows", *list(tables[0].columns))
    for table in tables:
        length = len(table)
        tmp = [len(table)]
        for column in list(table.columns):
            start_time = perf_counter()
            try:
                L = table[column].unique()
                dt = perf_counter() - start_time
            except MemoryError:
                dt = -1
            tmp.append(round(dt, 3))
            assert 0 < len(L) <= length
        results.add_rows(tmp)
    return results
```
In [39]:
```python
ubm = unique_benchmark(tables)
ubm
```
Out[39]:
~ | rows | # | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1000000 | 0.022 | 0.081 | 0.248 | 0.044 | 0.016 | 0.061 | 0.115 | 0.136 | 0.096 | 0.085 | 0.094 | 0.447 |
1 | 2000000 | 0.176 | 0.271 | 0.505 | 0.087 | 0.031 | 0.124 | 0.229 | 0.279 | 0.198 | 0.17 | 0.305 | 1.471 |
2 | 5000000 | 0.198 | 0.499 | 1.263 | 0.218 | 0.076 | 0.311 | 0.57 | 0.685 | 0.474 | 0.425 | 0.595 | 2.744 |
3 | 10000000 | 0.502 | 1.123 | 2.535 | 0.433 | 0.155 | 0.615 | 1.128 | 1.375 | 0.96 | 0.85 | 1.316 | 5.826 |
4 | 20000000 | 0.956 | 2.336 | 5.035 | 0.883 | 0.319 | 1.229 | 2.268 | 2.748 | 1.913 | 1.746 | 2.733 | 11.883 |
5 | 50000000 | 2.395 | 6.019 | 12.499 | 2.178 | 0.764 | 3.073 | 5.608 | 6.819 | 4.828 | 4.279 | 7.097 | 30.511 |
Table.index¶
Single column index first:
In [40]:
```python
def index_benchmark(tables):
    results = Table()
    results.add_columns("rows", *list(tables[0].columns))
    for table in tables:
        tmp = [len(table)]
        for column in list(table.columns):
            start_time = perf_counter()
            try:
                _ = table.index(column)
                dt = perf_counter() - start_time
            except MemoryError:
                dt = -1
            tmp.append(round(dt, 3))
        results.add_rows(tmp)
    return results
```
In [41]:
```python
ibm = index_benchmark(tables)
ibm
```
Out[41]:
~ | rows | # | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1000000 | 1.949 | 1.793 | 1.432 | 1.106 | 1.051 | 1.23 | 1.338 | 1.493 | 1.411 | 1.303 | 1.999 | 2.325 |
1 | 2000000 | 2.883 | 3.517 | 2.856 | 2.217 | 2.124 | 2.462 | 2.676 | 2.986 | 2.709 | 2.606 | 4.049 | 4.461 |
2 | 5000000 | 6.382 | 9.049 | 7.096 | 5.628 | 5.353 | 6.312 | 6.649 | 7.521 | 6.71 | 6.459 | 10.27 | 10.747 |
3 | 10000000 | 12.553 | 18.506 | 13.95 | 11.335 | 10.724 | 12.509 | 13.33 | 15.051 | 13.502 | 12.899 | 19.769 | 21.999 |
4 | 20000000 | 24.717 | 37.896 | 28.568 | 22.666 | 21.472 | 26.327 | 27.157 | 30.064 | 27.332 | 25.822 | 38.311 | 43.399 |
5 | 50000000 | 63.016 | 97.077 | 72.007 | 55.609 | 54.099 | 61.797 | 68.236 | 75.07 | 69.022 | 66.152 | 99.183 | 109.969 |
Multi-column index next:
In [42]:
```python
def multi_column_index_benchmark(tables):
    selection = ["4", "7", "8", "9"]
    results = Table()
    results.add_columns("rows", *range(1, len(selection)+1))
    for table in tables:
        tmp = [len(table)]
        for index in range(1, 5):
            start_time = perf_counter()
            try:
                _ = table.index(*selection[:index])
                dt = perf_counter() - start_time
            except MemoryError:
                dt = -1
            tmp.append(round(dt, 3))
            print('.', end='')
        results.add_rows(tmp)
    return results
```
In [43]:
```python
mcib = multi_column_index_benchmark(tables)
mcib
```
........................
Out[43]:
# | rows | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
0 | 1000000 | 1.058 | 2.133 | 3.215 | 4.052 |
1 | 2000000 | 2.12 | 4.278 | 6.546 | 8.328 |
2 | 5000000 | 5.303 | 10.89 | 16.693 | 20.793 |
3 | 10000000 | 10.581 | 22.407 | 33.462 | 41.91 |
4 | 20000000 | 21.064 | 45.954 | 67.781 | 84.828 |
5 | 50000000 | 52.347 | 109.551 | 166.6 | 211.053 |
drop duplicates¶
In [44]:
```python
def drop_duplicates_benchmark(tables):
    results = Table()
    results.add_columns("rows", *list(tables[0].columns))
    for table in tables:
        result = [len(table)]
        cols = []
        for name in list(table.columns):
            cols.append(name)
            start_time = perf_counter()
            try:
                _ = table.drop_duplicates(*cols)
                dt = perf_counter() - start_time
            except MemoryError:
                dt = -1
            result.append(round(dt, 3))
            print('.', end='')
        results.add_rows(result)
    return results
```
In [45]:
```python
ddb = drop_duplicates_benchmark(tables)
ddb
```
........................................................................
Out[45]:
~ | rows | # | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1000000 | 1.761 | 2.358 | 3.313 | 3.901 | 4.615 | 4.961 | 5.835 | 6.534 | 7.454 | 8.108 | 8.803 | 9.682 |
1 | 2000000 | 3.011 | 4.93 | 6.934 | 7.97 | 9.264 | 10.268 | 12.006 | 13.517 | 14.92 | 16.631 | 17.932 | 19.493 |
2 | 5000000 | 6.827 | 13.853 | 18.637 | 21.237 | 24.548 | 27.11 | 31.157 | 35.026 | 38.992 | 43.531 | 46.022 | 50.433 |
3 | 10000000 | 13.238 | 31.746 | 41.141 | 46.917 | 53.172 | 58.241 | 67.992 | 74.651 | 82.74 | 91.458 | 97.666 | 104.82 |
4 | 20000000 | 25.932 | 77.75 | 100.34 | 109.314 | 123.514 | 131.874 | 148.432 | 163.57 | 179.121 | 196.047 | 208.686 | 228.059 |
5 | 50000000 | 64.237 | 312.222 | 364.886 | 388.249 | 429.724 | 466.685 | 494.418 | 535.367 | 581.666 | 607.306 | 634.343 | 683.858 |