Merge CSVs

from glob import glob
import os

import polars as pl
pl.Config(tbl_rows=10)
<polars.config.Config at 0x110576610>

1 glob CSVs in a folder

csv_dir = "./data/csv/"
csv_path_glob = os.path.join(csv_dir, '*.csv')
csv_list = glob(csv_path_glob)

for idx, csv in enumerate(csv_list):
    print('csv ', idx, ' : ', csv)
csv  0  :  ./data/csv/cn_zone_1_08_updated.csv
csv  1  :  ./data/csv/cn_zone_1_04_updated.csv
csv  2  :  ./data/csv/cn_zone_1_07_updated.csv
csv  3  :  ./data/csv/cn_zone_1_05_updated.csv
csv  4  :  ./data/csv/cn_zone_1_06_updated.csv

2 merge CSVs into a df

%%time

df_merged = pl.read_csv(csv_path_glob)

# q = (
#     pl.scan_csv(csv_path_glob)
# )
# df_merged = q.collect()

df_merged
1
Use globbing patterns to read multiple files into a single DF.
CPU times: user 3.07 ms, sys: 15.6 ms, total: 18.7 ms
Wall time: 22.3 ms
shape: (436, 22)
string_idx string_name string_name_x string_name_y string_box_idx CB_name Phase string_num string_box_v1_x string_box_v1_y string_box_v2_x string_box_v2_y string_box_v3_x string_box_v3_y string_box_v4_x string_box_v4_y num_modules module_layout dist_btw_v0_v1 dist_btw_v1_v2 num_module_along_x num_module_along_y
i64 str f64 f64 i64 i64 i64 i64 f64 f64 f64 f64 f64 f64 f64 f64 i64 str f64 f64 i64 i64
0 "1-4-NB05-10" 4371.812002 2242.231026 0 0 0 0 4354.626954 2242.689021 4354.626954 2241.77303 4388.997049 2241.77303 4388.997049 2242.689021 14 "L" 0.916 34.37 1 14
0 "1-4-NB05-10" 4371.812002 2243.227919 1 0 0 0 4354.626954 2243.685915 4354.626954 2242.769924 4388.997049 2242.769924 4388.997049 2243.685915 14 "L" 0.916 34.37 1 14
1 "1-4-NB05-9" 4371.812002 2244.224813 0 0 0 0 4354.626954 2244.682809 4354.626954 2243.766818 4388.997049 2243.766818 4388.997049 2244.682809 14 "L" 0.916 34.37 1 14
1 "1-4-NB05-9" 4371.812002 2245.221707 1 0 0 0 4354.626954 2245.679702 4354.626954 2244.763711 4388.997049 2244.763711 4388.997049 2245.679702 14 "L" 0.916 34.37 1 14
2 "1-4-NB05-12" 4406.79557 2242.091945 0 0 0 0 4389.610523 2242.549941 4389.610523 2241.63395 4423.980618 2241.63395 4423.980618 2242.549941 14 "L" 0.916 34.37 1 14
22 "1-8-NB02-3" 4478.810262 2464.772732 1 0 0 0 4491.035789 2464.304492 4491.035789 2465.244649 4466.584735 2465.244649 4466.584735 2464.29714 10 "L" 0.94 24.451 1 10
22 "1-8-NB02-3" 4478.810262 2465.766136 2 0 0 0 4491.035789 2465.297896 4491.035789 2466.238052 4466.584735 2466.238052 4466.584735 2465.290544 10 "L" 0.94 24.451 1 10
23 "1-8-NB02-4" 4503.839598 2463.755058 0 0 0 0 4516.065125 2463.286818 4516.065125 2464.226974 4491.614072 2464.226974 4491.614072 2463.279466 10 "L" 0.94 24.451 1 10
23 "1-8-NB02-4" 4503.839598 2464.748462 1 0 0 0 4516.065125 2464.280222 4516.065125 2465.220378 4491.614072 2465.220378 4491.614072 2464.27287 10 "L" 0.94 24.451 1 10
23 "1-8-NB02-4" 4503.839598 2465.741866 2 0 0 0 4516.065125 2465.273626 4516.065125 2466.213782 4491.614072 2466.213782 4491.614072 2465.266274 10 "L" 0.94 24.451 1 10

3 write the df as a merged CSV

csv_merged_path = os.path.join("./data/csv_merged/", "cn_zone_1_04-08_merged.csv")
df_merged.write_csv(csv_merged_path)
print('...merged csv written.')
df_merged
...merged csv written.
shape: (436, 22)
string_idx string_name string_name_x string_name_y string_box_idx CB_name Phase string_num string_box_v1_x string_box_v1_y string_box_v2_x string_box_v2_y string_box_v3_x string_box_v3_y string_box_v4_x string_box_v4_y num_modules module_layout dist_btw_v0_v1 dist_btw_v1_v2 num_module_along_x num_module_along_y
i64 str f64 f64 i64 i64 i64 i64 f64 f64 f64 f64 f64 f64 f64 f64 i64 str f64 f64 i64 i64
0 "1-4-NB05-10" 4371.812002 2242.231026 0 0 0 0 4354.626954 2242.689021 4354.626954 2241.77303 4388.997049 2241.77303 4388.997049 2242.689021 14 "L" 0.916 34.37 1 14
0 "1-4-NB05-10" 4371.812002 2243.227919 1 0 0 0 4354.626954 2243.685915 4354.626954 2242.769924 4388.997049 2242.769924 4388.997049 2243.685915 14 "L" 0.916 34.37 1 14
1 "1-4-NB05-9" 4371.812002 2244.224813 0 0 0 0 4354.626954 2244.682809 4354.626954 2243.766818 4388.997049 2243.766818 4388.997049 2244.682809 14 "L" 0.916 34.37 1 14
1 "1-4-NB05-9" 4371.812002 2245.221707 1 0 0 0 4354.626954 2245.679702 4354.626954 2244.763711 4388.997049 2244.763711 4388.997049 2245.679702 14 "L" 0.916 34.37 1 14
2 "1-4-NB05-12" 4406.79557 2242.091945 0 0 0 0 4389.610523 2242.549941 4389.610523 2241.63395 4423.980618 2241.63395 4423.980618 2242.549941 14 "L" 0.916 34.37 1 14
22 "1-8-NB02-3" 4478.810262 2464.772732 1 0 0 0 4491.035789 2464.304492 4491.035789 2465.244649 4466.584735 2465.244649 4466.584735 2464.29714 10 "L" 0.94 24.451 1 10
22 "1-8-NB02-3" 4478.810262 2465.766136 2 0 0 0 4491.035789 2465.297896 4491.035789 2466.238052 4466.584735 2466.238052 4466.584735 2465.290544 10 "L" 0.94 24.451 1 10
23 "1-8-NB02-4" 4503.839598 2463.755058 0 0 0 0 4516.065125 2463.286818 4516.065125 2464.226974 4491.614072 2464.226974 4491.614072 2463.279466 10 "L" 0.94 24.451 1 10
23 "1-8-NB02-4" 4503.839598 2464.748462 1 0 0 0 4516.065125 2464.280222 4516.065125 2465.220378 4491.614072 2465.220378 4491.614072 2464.27287 10 "L" 0.94 24.451 1 10
23 "1-8-NB02-4" 4503.839598 2465.741866 2 0 0 0 4516.065125 2465.273626 4516.065125 2466.213782 4491.614072 2466.213782 4491.614072 2465.266274 10 "L" 0.94 24.451 1 10

4 (lazy-)filter a DF by a list of items in a column

%%time

strings_to_filter = [
    '1-4-NB05-10',
    '1-4-NB05-9',
    '1-8-NB02-4',
]

query = (
    pl.scan_csv(csv_path_glob)
      .filter(pl.col('string_name')
      .is_in(strings_to_filter))
)

df_filtered = query.collect()

df_filtered
2
Build a query(s) in lazy mode
3
execute the query(s) when its collected
Back to top