Objectif

Quel est le mode de compression par défaut des fichiers Parquet ?

Quelles sont les autres possibilités ?

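Résultat : avec DuckDB, l'écriture sans option de compression produit 132M, soit exactement la taille obtenue avec snappy ; la compression par défaut est donc snappy. Les autres possibilités testées ici sont uncompressed, gzip et zstd.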

DuckDB

# Source Parquet extract that every export in this benchmark starts from.
odbc_dir <- "C:/Users/sebastien.li-thiao-t/Documents/FPIPJ_ltt/Agora odbc"
source_parquetname <- file.path(
  odbc_dir,
  "agora_SL5_DOSSIERS_2023S1_20230915.parquet"
)

# Force a full garbage collection before starting; the memory report is
# printed.
gc(full = TRUE)
##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1220673 65.2    2293490 122.5  2293490 122.5
## Vcells 2034860 15.6    8388608  64.0  3159434  24.2

# Connection reused by every COPY statement in this script.
con <- dbConnect(duckdb::duckdb())

# Baseline export: no COMPRESSION option is given, so the output shows what
# DuckDB picks on its own.
dest <- "ParquetCompression_duckdb_default"
unlink(dest, recursive = TRUE) # remove the output of a previous run
default_copy_sql <- glue::glue("COPY '{source_parquetname}' TO '{dest}' (FORMAT PARQUET, PER_THREAD_OUTPUT true)")
dbExecute(con, default_copy_sql)
## [1] 1218556

# Total size of the files written directly under `dest`.
summarise(dir_info(dest), sum(fs_bytes(size)))
## # A tibble: 1 × 1
##   `sum(fs_bytes(size))`
##             <fs::bytes>
## 1                  132M
# For each codec: export the source with that codec (timed), copy the result
# back out to a scratch directory (timed), then print the export's total size.
codecs <- c("uncompressed", "snappy", "gzip", "zstd")
for (algo in codecs) {
  dest <- paste0("ParquetCompression_duckdb_", algo)

  # Statements are built up front so only the COPY itself is timed.
  write_sql <- glue::glue("COPY '{source_parquetname}' TO '{dest}' (FORMAT PARQUET, COMPRESSION {algo}, PER_THREAD_OUTPUT true)")
  # NOTE: the "read" statement also writes its result to 'temp' (with no
  # COMPRESSION option), so the reported duration covers read + re-write.
  read_sql <- glue::glue("COPY '{dest}/**/*.parquet' TO 'temp' (FORMAT PARQUET, PER_THREAD_OUTPUT true)")

  # Start from an empty destination so earlier runs do not inflate the size.
  unlink(dest, recursive = TRUE)
  tic(paste(algo, "write duration"))
  dbExecute(con, write_sql)
  toc()

  # The scratch directory is recreated for every codec.
  unlink("temp", recursive = TRUE)
  tic(paste(algo, "read duration"))
  dbExecute(con, read_sql)
  toc()

  # Human-readable total of the file sizes listed under `dest`.
  total_size <- sum(fs_bytes(dir_info(dest)$size))
  cat(format(total_size), "\n")
}
## uncompressed write duration: 7.06 sec elapsed
## uncompressed read duration: 2.55 sec elapsed
## 606M 
## snappy write duration: 6.32 sec elapsed
## snappy read duration: 2.42 sec elapsed
## 132M 
## gzip write duration: 19.16 sec elapsed
## gzip read duration: 2.61 sec elapsed
## 71.5M 
## zstd write duration: 7.36 sec elapsed
## zstd read duration: 3.03 sec elapsed
## 72.3M

Clean-up

# Remove every directory this benchmark created (one per codec, the
# default-compression baseline, and the scratch directory), then close the
# DuckDB connection.
benchmark_dirs <- c(
  paste0(
    "ParquetCompression_duckdb_",
    c("uncompressed", "snappy", "gzip", "zstd", "default")
  ),
  "temp"
)
unlink(benchmark_dirs, recursive = TRUE)
dbDisconnect(con, shutdown = TRUE)