Merge CSVs

from glob import glob
import os

import polars as pl
pl.Config(tbl_rows=10)

<polars.config.Config at 0x2ddccc57650>

1 glob CSVs in a folder

csv_dir = "./data/csv/"
csv_path_glob = os.path.join(csv_dir, '*.csv')
csv_list = glob(csv_path_glob)

for idx, csv in enumerate(csv_list):
    print('csv ', idx, ' : ', csv)

csv  0  :  ./data/csv\cn_zone_1_04_updated.csv
csv  1  :  ./data/csv\cn_zone_1_05_updated.csv
csv  2  :  ./data/csv\cn_zone_1_06_updated.csv
csv  3  :  ./data/csv\cn_zone_1_07_updated.csv
csv  4  :  ./data/csv\cn_zone_1_08_updated.csv

glob

2 merge CSVs into a df

%%time

df_merged = pl.read_csv(csv_path_glob)

# q = (
#     pl.scan_csv(csv_path_glob)
# )
# df_merged = q.collect()

df_merged

1: Use globbing patterns to read multiple files into a single DF.

CPU times: total: 109 ms
Wall time: 450 ms

shape: (436, 22)

string_idx	string_name	string_name_x	string_name_y	string_box_idx	CB_name	Phase	string_num	string_box_v1_x	string_box_v1_y	string_box_v2_x	string_box_v2_y	string_box_v3_x	string_box_v3_y	string_box_v4_x	string_box_v4_y	num_modules	module_layout	dist_btw_v0_v1	dist_btw_v1_v2	num_module_along_x	num_module_along_y
i64	str	f64	f64	i64	i64	i64	i64	f64	f64	f64	f64	f64	f64	f64	f64	i64	str	f64	f64	i64	i64
0	"1-4-NB05-10"	4371.812002	2242.231026	0	0	0	0	4354.626954	2242.689021	4354.626954	2241.77303	4388.997049	2241.77303	4388.997049	2242.689021	14	"L"	0.916	34.37	1	14
0	"1-4-NB05-10"	4371.812002	2243.227919	1	0	0	0	4354.626954	2243.685915	4354.626954	2242.769924	4388.997049	2242.769924	4388.997049	2243.685915	14	"L"	0.916	34.37	1	14
1	"1-4-NB05-9"	4371.812002	2244.224813	0	0	0	0	4354.626954	2244.682809	4354.626954	2243.766818	4388.997049	2243.766818	4388.997049	2244.682809	14	"L"	0.916	34.37	1	14
1	"1-4-NB05-9"	4371.812002	2245.221707	1	0	0	0	4354.626954	2245.679702	4354.626954	2244.763711	4388.997049	2244.763711	4388.997049	2245.679702	14	"L"	0.916	34.37	1	14
2	"1-4-NB05-12"	4406.79557	2242.091945	0	0	0	0	4389.610523	2242.549941	4389.610523	2241.63395	4423.980618	2241.63395	4423.980618	2242.549941	14	"L"	0.916	34.37	1	14
…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…
22	"1-8-NB02-3"	4478.810262	2464.772732	1	0	0	0	4491.035789	2464.304492	4491.035789	2465.244649	4466.584735	2465.244649	4466.584735	2464.29714	10	"L"	0.94	24.451	1	10
22	"1-8-NB02-3"	4478.810262	2465.766136	2	0	0	0	4491.035789	2465.297896	4491.035789	2466.238052	4466.584735	2466.238052	4466.584735	2465.290544	10	"L"	0.94	24.451	1	10
23	"1-8-NB02-4"	4503.839598	2463.755058	0	0	0	0	4516.065125	2463.286818	4516.065125	2464.226974	4491.614072	2464.226974	4491.614072	2463.279466	10	"L"	0.94	24.451	1	10
23	"1-8-NB02-4"	4503.839598	2464.748462	1	0	0	0	4516.065125	2464.280222	4516.065125	2465.220378	4491.614072	2465.220378	4491.614072	2464.27287	10	"L"	0.94	24.451	1	10
23	"1-8-NB02-4"	4503.839598	2465.741866	2	0	0	0	4516.065125	2465.273626	4516.065125	2466.213782	4491.614072	2466.213782	4491.614072	2465.266274	10	"L"	0.94	24.451	1	10

Dealing with multple files

Multiple

polars.read_csv

polars.scan_csv

3 write the df as a merged CSV

csv_merged_path = os.path.join("./data/csv_merged/", "cn_zone_1_04-08_merged.csv")
df_merged.write_csv(csv_merged_path)
print('...merged csv written.')
df_merged

...merged csv written.

shape: (436, 22)

string_idx	string_name	string_name_x	string_name_y	string_box_idx	CB_name	Phase	string_num	string_box_v1_x	string_box_v1_y	string_box_v2_x	string_box_v2_y	string_box_v3_x	string_box_v3_y	string_box_v4_x	string_box_v4_y	num_modules	module_layout	dist_btw_v0_v1	dist_btw_v1_v2	num_module_along_x	num_module_along_y
i64	str	f64	f64	i64	i64	i64	i64	f64	f64	f64	f64	f64	f64	f64	f64	i64	str	f64	f64	i64	i64
0	"1-4-NB05-10"	4371.812002	2242.231026	0	0	0	0	4354.626954	2242.689021	4354.626954	2241.77303	4388.997049	2241.77303	4388.997049	2242.689021	14	"L"	0.916	34.37	1	14
0	"1-4-NB05-10"	4371.812002	2243.227919	1	0	0	0	4354.626954	2243.685915	4354.626954	2242.769924	4388.997049	2242.769924	4388.997049	2243.685915	14	"L"	0.916	34.37	1	14
1	"1-4-NB05-9"	4371.812002	2244.224813	0	0	0	0	4354.626954	2244.682809	4354.626954	2243.766818	4388.997049	2243.766818	4388.997049	2244.682809	14	"L"	0.916	34.37	1	14
1	"1-4-NB05-9"	4371.812002	2245.221707	1	0	0	0	4354.626954	2245.679702	4354.626954	2244.763711	4388.997049	2244.763711	4388.997049	2245.679702	14	"L"	0.916	34.37	1	14
2	"1-4-NB05-12"	4406.79557	2242.091945	0	0	0	0	4389.610523	2242.549941	4389.610523	2241.63395	4423.980618	2241.63395	4423.980618	2242.549941	14	"L"	0.916	34.37	1	14
…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…
22	"1-8-NB02-3"	4478.810262	2464.772732	1	0	0	0	4491.035789	2464.304492	4491.035789	2465.244649	4466.584735	2465.244649	4466.584735	2464.29714	10	"L"	0.94	24.451	1	10
22	"1-8-NB02-3"	4478.810262	2465.766136	2	0	0	0	4491.035789	2465.297896	4491.035789	2466.238052	4466.584735	2466.238052	4466.584735	2465.290544	10	"L"	0.94	24.451	1	10
23	"1-8-NB02-4"	4503.839598	2463.755058	0	0	0	0	4516.065125	2463.286818	4516.065125	2464.226974	4491.614072	2464.226974	4491.614072	2463.279466	10	"L"	0.94	24.451	1	10
23	"1-8-NB02-4"	4503.839598	2464.748462	1	0	0	0	4516.065125	2464.280222	4516.065125	2465.220378	4491.614072	2465.220378	4491.614072	2464.27287	10	"L"	0.94	24.451	1	10
23	"1-8-NB02-4"	4503.839598	2465.741866	2	0	0	0	4516.065125	2465.273626	4516.065125	2466.213782	4491.614072	2466.213782	4491.614072	2465.266274	10	"L"	0.94	24.451	1	10

polars.DataFrame.write_csv

4 (lazy-)filter a DF by a list of items in a column

%%time

strings_to_filter = [
    '1-4-NB05-10',
    '1-4-NB05-9',
    '1-8-NB02-4',
]

query = (
    pl.scan_csv(csv_path_glob)
      .filter(pl.col('string_name')
      .is_in(strings_to_filter))
)

df_filtered = query.collect()

df_filtered

2: Build a query(s) in lazy mode
3: execute the query(s) when its collected

lazy/eager API

Lazy/eager API