Rdatatable · ben-schwen · Mar 15, 2026 · Mar 16, 2026 · Mar 16, 2026 · Mar 16, 2026
@@ -209,20 +209,34 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
     }
 
     gzsig = FALSE
-    if ((w <- endsWithAny(file, c(".gz", ".bgz",".bz2"))) || (gzsig <- is_gzip(file_signature)) || is_bzip(file_signature)) {
-      if (!requireNamespace("R.utils", quietly = TRUE))
-        stopf("To read %s files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.", if (w<=2L || gzsig) "gz" else "bz2") # nocov
-      # not worth doing a behavior test here, so just use getRversion().
-      if (packageVersion("R.utils") < "2.13.0" && base::getRversion() >= "4.5.0")
-        stopf("Reading compressed files in fread requires R.utils version 2.13.0 or higher. Please upgrade R.utils.") # nocov
-      FUN = if (w<=2L || gzsig) gzfile else bzfile
+    zstdsig = FALSE
+    xzsig = FALSE
+    if ((w <- endsWithAny(file, c(".gz", ".bgz", ".bz2", ".zst", ".xz"))) || (gzsig <- is_gzip(file_signature)) || is_bzip(file_signature) ||
+        (zstdsig <- is_zstd(file_signature)) || (xzsig <- is_xz(file_signature))) {
       decompFile = tempfile(tmpdir=tmpdir)
       on.exit(unlink(decompFile), add=TRUE)
-      tryCatch({
-        R.utils::decompressFile(file, decompFile, ext=NULL, FUN=FUN, remove=FALSE)   # ext is not used by decompressFile when destname is supplied, but isn't optional
-      }, error = function(e) {
-        stopf("R.utils::decompressFile failed to decompress file '%s':\n  %s\n. This can happen when the disk is full in the temporary directory ('%s'). See ?fread for the tmpdir argument.", file, conditionMessage(e), tmpdir)
-      })
+      if (w==4L || zstdsig) {
+        if (!haszstd())
+          stopf("To read .zst files, fread() requires zstd library support. data.table was compiled without zstd. Please reinstall data.table after installing the zstd development library (libzstd-dev on Debian/Ubuntu, libzstd-devel on Fedora/EPEL, zstd on Homebrew).") # nocov
+        tryCatch({
+          .Call(Cdt_zstd_decompress, file, decompFile)
+        }, error = function(e) {
+          stopf("zstd decompression of file '%s' failed:\n  %s\nThis can happen when the disk is full in the temporary directory ('%s'). See ?fread for the tmpdir argument.", file, conditionMessage(e), tmpdir)
+        })
+      } else {
+        fmt = if (w<=2L || gzsig) "gz" else if (w==5L || xzsig) "xz" else "bz2"
+        if (!requireNamespace("R.utils", quietly = TRUE))
+          stopf("To read %s files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.", fmt) # nocov
+        # not worth doing a behavior test here, so just use getRversion().
+        if (packageVersion("R.utils") < "2.13.0" && base::getRversion() >= "4.5.0")
+          stopf("Reading compressed files in fread requires R.utils version 2.13.0 or higher. Please upgrade R.utils.") # nocov
+        FUN = if (w<=2L || gzsig) gzfile else if (w==5L || xzsig) xzfile else bzfile
+        tryCatch({
+          R.utils::decompressFile(file, decompFile, ext=NULL, FUN=FUN, remove=FALSE)   # ext is not used by decompressFile when destname is supplied, but isn't optional
+        }, error = function(e) {
+          stopf("R.utils::decompressFile failed to decompress file '%s':\n  %s\n. This can happen when the disk is full in the temporary directory ('%s'). See ?fread for the tmpdir argument.", file, conditionMessage(e), tmpdir)
+        })
+      }
       file = decompFile   # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download
     }
     file = enc2native(file) # CfreadR cannot handle UTF-8 if that is not the native encoding, see #3078.
@@ -486,7 +500,9 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
 known_signatures = list(
   zip = as.raw(c(0x50, 0x4b, 0x03, 0x04)), # charToRaw("PK\x03\x04")
   gzip = as.raw(c(0x1F, 0x8B)),
-  bzip = as.raw(c(0x42, 0x5A, 0x68))
+  bzip = as.raw(c(0x42, 0x5A, 0x68)),
+  zstd = as.raw(c(0x28, 0xB5, 0x2F, 0xFD)), # zstd frame magic (little-endian 0xFD2FB528)
+  xz   = as.raw(c(0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00)) # https://tukaani.org/xz/xz-file-format.txt
 )
 
 # https://en.wikipedia.org/wiki/ZIP_(file_format)#File_headers
@@ -507,6 +523,16 @@ is_bzip = function(file_signature) {
     isTRUE(file_signature[4L] %in% charToRaw('123456789')) # for #6304
 }
 
+# https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#zstandard-frames
+is_zstd = function(file_signature) {
+  identical(file_signature[1:4], known_signatures$zstd)
+}
+
+# https://tukaani.org/xz/xz-file-format.txt section 2.1.1.1
+is_xz = function(file_signature) {
+  identical(file_signature[1:6], known_signatures$xz)
+}
+
 # simplified but faster version of `factor()` for internal use.
 as_factor = function(x) {
   lev = forderv(x, retGrp = TRUE, na.last = NA)

@@ -8,8 +8,8 @@ fwrite = function(x, file="", append=FALSE, quote="auto",
            dateTimeAs = c("ISO","squash","epoch","write.csv"),
            buffMB=8L, nThread=getDTthreads(verbose),
            showProgress=getOption("datatable.showProgress", interactive()),
-           compress = c("auto", "none", "gzip"),
-           compressLevel = 6L,
+           compress = c("auto", "none", "gzip", "zstd"),
+           compressLevel = NULL,
            yaml = FALSE,
            bom = FALSE,
            verbose=getOption("datatable.verbose", FALSE),
@@ -26,7 +26,6 @@ fwrite = function(x, file="", append=FALSE, quote="auto",
   scipen = if (is.numeric(scipen)) as.integer(scipen) else 0L
   buffMB = as.integer(buffMB)
   nThread = as.integer(nThread)
-  compressLevel = as.integer(compressLevel)
   # write.csv default is 'double' so fwrite follows suit. write.table's default is 'escape'
   # validate arguments
   if (is.matrix(x)) { # coerce to data.table if input object is matrix
@@ -48,8 +47,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto",
     `dec and sep must be distinct whenever both might be needed` = (!NROW(x) || NCOL(x) <= 1L || dec != sep),  # sep2!=dec and sep2!=sep checked at C level when we know if list columns are present
     is.character(eol) && length(eol)==1L,
     length(qmethod) == 1L && qmethod %chin% c("double", "escape"),
-    length(compress) == 1L && compress %chin% c("auto", "none", "gzip"),
-    length(compressLevel) == 1L && 0L <= compressLevel && compressLevel <= 9L,
+    length(compress) == 1L && compress %chin% c("auto", "none", "gzip", "zstd"),
     isTRUEorFALSE(col.names), isTRUEorFALSE(append), isTRUEorFALSE(row.names),
     isTRUEorFALSE(verbose), isTRUEorFALSE(showProgress), isTRUEorFALSE(logical01),
     isTRUEorFALSE(bom), isTRUEorFALSE(forceDecimal),
@@ -60,6 +58,11 @@ fwrite = function(x, file="", append=FALSE, quote="auto",
   )
 
   is_gzip = compress == "gzip" || (compress == "auto" && endsWithAny(file, ".gz"))
+  is_zstd = compress == "zstd" || (compress == "auto" && endsWithAny(file, ".zst"))
+  if (is.null(compressLevel)) compressLevel = if (is_zstd) 3L else 6L
+  compressLevel = as.integer(compressLevel)
+  if (is_zstd) { if (compressLevel < 1L || compressLevel > 22L) stopf("compressLevel must be between 1 and 22 for zstd compression.") }
+  else         { if (compressLevel < 0L || compressLevel >  9L) stopf("compressLevel must be between 0 and 9 for gzip compression.") }
 
   file = path.expand(file)  # "~/foo/bar"
   if (append && (file=="" || file.exists(file))) {
@@ -123,8 +126,9 @@ fwrite = function(x, file="", append=FALSE, quote="auto",
   }
   .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append,
         row.names, col.names, logical01, scipen, dateTimeAs, buffMB, nThread,
-        showProgress, is_gzip, compressLevel, bom, yaml, verbose, encoding, forceDecimal)
+        showProgress, is_gzip, is_zstd, compressLevel, bom, yaml, verbose, encoding, forceDecimal)
   invisible()
 }
 
 haszlib = function() .Call(Cdt_has_zlib)
+haszstd = function() .Call(Cdt_has_zstd)
@@ -175,12 +175,38 @@ sed -e "s|@PKG_CFLAGS@|$PKG_CFLAGS|" -e "s|@PKG_LIBS@|$PKG_LIBS|" src/Makevars.i
 
 # optional dependency on zlib
 if [ "$NOZLIB" = "1" ]; then
-  echo "*** Compilation without compression support in fwrite"
+  echo "*** Compilation without gzip compression support in fwrite"
   sed -e "s|@zlib_cflags@|-DNOZLIB|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars
   sed -e "s|@zlib_libs@||" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars
 else
   sed -e "s|@zlib_cflags@|${cflag}|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars
   sed -e "s|@zlib_libs@|${lib}|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars
 fi
 
+# optional dependency on zstd
+NOZSTD=1
+pkg-config --exists libzstd
+if [ $? -ne 0 ]; then
+  echo "*** pkg-config cannot find libzstd; zstd compression will be disabled."
+  echo "*** To enable zstd support, install the zstd development library:"
+  echo "***   deb: libzstd-dev (Debian, Ubuntu, ...)"
+  echo "***   rpm: libzstd-devel (Fedora, EPEL, ...)"
+  echo "***   brew: zstd (macOS)"
+else
+  NOZSTD=0
+  zstd_lib=`pkg-config --libs libzstd`
+  zstd_cflag=`pkg-config --cflags libzstd`
+  zstd_version=`pkg-config --modversion libzstd`
+  echo "zstd ${zstd_version} is available ok"
+fi
+
+if [ "$NOZSTD" = "1" ]; then
+  echo "*** Compilation without zstd compression support in fwrite/fread"
+  sed -e "s|@zstd_cflags@|-DNOZSTD|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars
+  sed -e "s|@zstd_libs@||" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars
+else
+  sed -e "s|@zstd_cflags@|${zstd_cflag}|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars
+  sed -e "s|@zstd_libs@|${zstd_lib}|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars
+fi
+
 exit 0
@@ -85,6 +85,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   which.last = data.table:::which.last
   `-.IDate` = data.table:::`-.IDate`
   haszlib = data.table:::haszlib
+  haszstd = data.table:::haszstd
 
   # Also, for functions that are masked by other packages, we need to map the data.table one. Or else,
   # the other package's function would be picked up. As above, we only need to do this because we desire
@@ -9767,6 +9768,29 @@ if (haszlib()) {
   unlink(f)
 }
 
+# fwrite/fread zstd compress # mirror tests from above
+if (!haszstd()) {
+  test(1658.601, fwrite(data.table(a=1), file=tempfile(), compress="zstd"), error="header files were not found at the time data.table was compiled")
+} else {
+  test(1658.61, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="zstd"), output='a,b\n1,1\n2,2\n3,3')  # compress ignored on console
+  DT = data.table(a=rep(1:2,each=100), b=rep(1:4,each=25))
+  test(1658.621, fwrite(DT, file=f1<-tempfile(fileext=".zst"), verbose=TRUE), NULL,
+                 output="args.nrow=200 args.ncol=2.*maxLineLen=5[12].*Writing 200 rows in 1 batches of 200 rows.*nth=1")
+  test(1658.622, fwrite(DT, file=f2<-tempfile()), NULL)
+  test(1658.623, file.info(f1)$size < file.info(f2)$size)
+  test(1658.63, fread(f1), DT)
+  fwrite(DT, file=f3<-tempfile(), compress="zstd")
+  fwrite(DT, file=f4<-tempfile(), compress="zstd", compressLevel=1)
+  fwrite(DT, file=f5<-tempfile(), compress="zstd", compressLevel=22)
+  fwrite(DT, file=f6<-tempfile(), compress="zstd", col.names=FALSE)
+  test(1658.641, file.info(f3)$size, file.info(f1)$size)
+  test(1658.642, file.info(f4)$size >= file.info(f1)$size)
+  test(1658.643, file.info(f1)$size >= file.info(f5)$size)
+  test(1658.644, fread(f6, col.names=c("a","b")), DT)
+  test(1658.645, fread(f3), DT)
+  unlink(c(f1,f2,f3,f4,f5,f6))
+}
+
 # complex column support for fwrite, part of #3690
 DT = data.table(a=1:3, z=0:2 - (2:0)*1i)
 test(1658.55, fwrite(DT), output='a,z\n1,0-2i\n2,1-1i\n3,2+0i')
@@ -18481,6 +18505,15 @@ if (test_R.utils) {
   R.utils::bzip2(tmp, tmp2 <- tempfile(), remove=FALSE) # _not_ with .bz2 extension
   test(2273.4, fread(tmp2), DT)
   file.remove(tmp2)
+  con = xzfile(tmp2 <- tempfile(fileext=".xz"), open="wt")
+  write.table(DT, con, sep = ",", row.names = FALSE, col.names = TRUE, quote = FALSE)
+  close(con)
+  test(2273.5, fread(tmp2), DT)
+  con = xzfile(tmp3 <- tempfile(), open="wt")
+  write.table(DT, con, sep = ",", row.names = FALSE, col.names = TRUE, quote = FALSE)
+  close(con)
+  test(2273.6, fread(tmp3), DT)
+  file.remove(c(tmp2, tmp3))
 }
 file.remove(tmp)
 

@@ -16,8 +16,8 @@ fwrite(x, file = "", append = FALSE, quote = "auto",
   dateTimeAs = c("ISO","squash","epoch","write.csv"),
   buffMB = 8L, nThread = getDTthreads(verbose),
   showProgress = getOption("datatable.showProgress", interactive()),
-  compress = c("auto", "none", "gzip"),
-  compressLevel = 6L,
+  compress = c("auto", "none", "gzip", "zstd"),
+  compressLevel = NULL,
   yaml = FALSE,
   bom = FALSE,
   verbose = getOption("datatable.verbose", FALSE),
@@ -58,7 +58,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto",
   \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.}
   \item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. }
   \item{compress}{If \code{compress = "auto"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "auto"}.}
-  \item{compressLevel}{Level of compression between 1 and 9, 6 by default. See \url{https://www.gnu.org/software/gzip/manual/html_node/Invoking-gzip.html} for details.}
+  \item{compressLevel}{Level of compression. For gzip, an integer between 0 and 9 (default 6). For zstd, an integer between 1 and 22 (default 3). When \code{NULL} (default), the format default is used. See \url{https://www.gnu.org/software/gzip/manual/html_node/Invoking-gzip.html} or \url{https://facebook.github.io/zstd/zstd_manual.html} for details.}
   \item{yaml}{If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. See \code{Details}. }
   \item{bom}{If \code{TRUE} a BOM (Byte Order Mark) sequence (EF BB BF) is added at the beginning of the file; format 'UTF-8 with BOM'.}
   \item{verbose}{Be chatty and report timings?}

@@ -1,5 +1,5 @@
-PKG_CFLAGS = $(C_VISIBILITY) @PKG_CFLAGS@ @zlib_cflags@
-PKG_LIBS = $(C_VISIBILITY) @PKG_LIBS@ @zlib_libs@
+PKG_CFLAGS = $(C_VISIBILITY) @PKG_CFLAGS@ @zlib_cflags@ @zstd_cflags@
+PKG_LIBS = $(C_VISIBILITY) @PKG_LIBS@ @zlib_libs@ @zstd_libs@
 # See WRE $1.2.1.1. But retain user supplied PKG_* too, #4664.
 # WRE states ($1.6) that += isn't portable and that we aren't allowed to use it.
 # Otherwise we could use the much simpler PKG_LIBS += @openmp_cflags@ -lz.

@@ -414,7 +414,7 @@ SEXP chmatchdup_R(SEXP, SEXP, SEXP);
 SEXP chin_R(SEXP, SEXP);
 SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
 SEXP spillConnectionToFile(SEXP, SEXP, SEXP);
-SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
+SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
 SEXP rbindlist(SEXP, SEXP, SEXP, SEXP, SEXP);
 SEXP setlistelt(SEXP, SEXP, SEXP);
 SEXP setS4elt(SEXP, SEXP, SEXP);
@@ -469,6 +469,9 @@ SEXP allNAR(SEXP);
 SEXP test_dt_win_snprintf(void);
 SEXP dt_zlib_version(void);
 SEXP dt_has_zlib(void);
+SEXP dt_zstd_version(void);
+SEXP dt_has_zstd(void);
+SEXP dt_zstd_decompress(SEXP, SEXP);
 SEXP startsWithAny(SEXP, SEXP, SEXP);
 SEXP convertDate(SEXP, SEXP);
 SEXP fastmean(SEXP);