Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 39 additions & 13 deletions R/fread.R
Original file line number Diff line number Diff line change
Expand Up @@ -209,20 +209,34 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
}

gzsig = FALSE
if ((w <- endsWithAny(file, c(".gz", ".bgz",".bz2"))) || (gzsig <- is_gzip(file_signature)) || is_bzip(file_signature)) {
if (!requireNamespace("R.utils", quietly = TRUE))
stopf("To read %s files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.", if (w<=2L || gzsig) "gz" else "bz2") # nocov
# not worth doing a behavior test here, so just use getRversion().
if (packageVersion("R.utils") < "2.13.0" && base::getRversion() >= "4.5.0")
stopf("Reading compressed files in fread requires R.utils version 2.13.0 or higher. Please upgrade R.utils.") # nocov
FUN = if (w<=2L || gzsig) gzfile else bzfile
zstdsig = FALSE
xzsig = FALSE
if ((w <- endsWithAny(file, c(".gz", ".bgz", ".bz2", ".zst", ".xz"))) || (gzsig <- is_gzip(file_signature)) || is_bzip(file_signature) ||
(zstdsig <- is_zstd(file_signature)) || (xzsig <- is_xz(file_signature))) {
decompFile = tempfile(tmpdir=tmpdir)
on.exit(unlink(decompFile), add=TRUE)
tryCatch({
R.utils::decompressFile(file, decompFile, ext=NULL, FUN=FUN, remove=FALSE) # ext is not used by decompressFile when destname is supplied, but isn't optional
}, error = function(e) {
stopf("R.utils::decompressFile failed to decompress file '%s':\n %s\n. This can happen when the disk is full in the temporary directory ('%s'). See ?fread for the tmpdir argument.", file, conditionMessage(e), tmpdir)
})
if (w==4L || zstdsig) {
if (!haszstd())
stopf("To read .zst files, fread() requires zstd library support. data.table was compiled without zstd. Please reinstall data.table after installing the zstd development library (libzstd-dev on Debian/Ubuntu, libzstd-devel on Fedora/EPEL, zstd on Homebrew).") # nocov
tryCatch({
.Call(Cdt_zstd_decompress, file, decompFile)
}, error = function(e) {
stopf("zstd decompression of file '%s' failed:\n %s\nThis can happen when the disk is full in the temporary directory ('%s'). See ?fread for the tmpdir argument.", file, conditionMessage(e), tmpdir)
})
} else {
fmt = if (w<=2L || gzsig) "gz" else if (w==5L || xzsig) "xz" else "bz2"
if (!requireNamespace("R.utils", quietly = TRUE))
stopf("To read %s files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.", fmt) # nocov
# not worth doing a behavior test here, so just use getRversion().
if (packageVersion("R.utils") < "2.13.0" && base::getRversion() >= "4.5.0")
stopf("Reading compressed files in fread requires R.utils version 2.13.0 or higher. Please upgrade R.utils.") # nocov
FUN = if (w<=2L || gzsig) gzfile else if (w==5L || xzsig) xzfile else bzfile
tryCatch({
R.utils::decompressFile(file, decompFile, ext=NULL, FUN=FUN, remove=FALSE) # ext is not used by decompressFile when destname is supplied, but isn't optional
}, error = function(e) {
stopf("R.utils::decompressFile failed to decompress file '%s':\n %s\n. This can happen when the disk is full in the temporary directory ('%s'). See ?fread for the tmpdir argument.", file, conditionMessage(e), tmpdir)
})
}
file = decompFile # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download
}
file = enc2native(file) # CfreadR cannot handle UTF-8 if that is not the native encoding, see #3078.
Expand Down Expand Up @@ -486,7 +500,9 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
known_signatures = list(
zip = as.raw(c(0x50, 0x4b, 0x03, 0x04)), # charToRaw("PK\x03\x04")
gzip = as.raw(c(0x1F, 0x8B)),
bzip = as.raw(c(0x42, 0x5A, 0x68))
bzip = as.raw(c(0x42, 0x5A, 0x68)),
zstd = as.raw(c(0x28, 0xB5, 0x2F, 0xFD)), # zstd frame magic (little-endian 0xFD2FB528)
xz = as.raw(c(0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00)) # https://tukaani.org/xz/xz-file-format.txt
)

# https://en.wikipedia.org/wiki/ZIP_(file_format)#File_headers
Expand All @@ -507,6 +523,16 @@ is_bzip = function(file_signature) {
isTRUE(file_signature[4L] %in% charToRaw('123456789')) # for #6304
}

# https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#zstandard-frames
is_zstd = function(file_signature) {
identical(file_signature[1:4], known_signatures$zstd)
}

# https://tukaani.org/xz/xz-file-format.txt section 2.1.1.1
is_xz = function(file_signature) {
identical(file_signature[1:6], known_signatures$xz)
}

# simplified but faster version of `factor()` for internal use.
as_factor = function(x) {
lev = forderv(x, retGrp = TRUE, na.last = NA)
Expand Down
16 changes: 10 additions & 6 deletions R/fwrite.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ fwrite = function(x, file="", append=FALSE, quote="auto",
dateTimeAs = c("ISO","squash","epoch","write.csv"),
buffMB=8L, nThread=getDTthreads(verbose),
showProgress=getOption("datatable.showProgress", interactive()),
compress = c("auto", "none", "gzip"),
compressLevel = 6L,
compress = c("auto", "none", "gzip", "zstd"),
compressLevel = NULL,
yaml = FALSE,
bom = FALSE,
verbose=getOption("datatable.verbose", FALSE),
Expand All @@ -26,7 +26,6 @@ fwrite = function(x, file="", append=FALSE, quote="auto",
scipen = if (is.numeric(scipen)) as.integer(scipen) else 0L
buffMB = as.integer(buffMB)
nThread = as.integer(nThread)
compressLevel = as.integer(compressLevel)
# write.csv default is 'double' so fwrite follows suit. write.table's default is 'escape'
# validate arguments
if (is.matrix(x)) { # coerce to data.table if input object is matrix
Expand All @@ -48,8 +47,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto",
`dec and sep must be distinct whenever both might be needed` = (!NROW(x) || NCOL(x) <= 1L || dec != sep), # sep2!=dec and sep2!=sep checked at C level when we know if list columns are present
is.character(eol) && length(eol)==1L,
length(qmethod) == 1L && qmethod %chin% c("double", "escape"),
length(compress) == 1L && compress %chin% c("auto", "none", "gzip"),
length(compressLevel) == 1L && 0L <= compressLevel && compressLevel <= 9L,
length(compress) == 1L && compress %chin% c("auto", "none", "gzip", "zstd"),
isTRUEorFALSE(col.names), isTRUEorFALSE(append), isTRUEorFALSE(row.names),
isTRUEorFALSE(verbose), isTRUEorFALSE(showProgress), isTRUEorFALSE(logical01),
isTRUEorFALSE(bom), isTRUEorFALSE(forceDecimal),
Expand All @@ -60,6 +58,11 @@ fwrite = function(x, file="", append=FALSE, quote="auto",
)

is_gzip = compress == "gzip" || (compress == "auto" && endsWithAny(file, ".gz"))
is_zstd = compress == "zstd" || (compress == "auto" && endsWithAny(file, ".zst"))
if (is.null(compressLevel)) compressLevel = if (is_zstd) 3L else 6L
compressLevel = as.integer(compressLevel)
if (is_zstd) { if (compressLevel < 1L || compressLevel > 22L) stopf("compressLevel must be between 1 and 22 for zstd compression.") }
else { if (compressLevel < 0L || compressLevel > 9L) stopf("compressLevel must be between 0 and 9 for gzip compression.") }

file = path.expand(file) # "~/foo/bar"
if (append && (file=="" || file.exists(file))) {
Expand Down Expand Up @@ -123,8 +126,9 @@ fwrite = function(x, file="", append=FALSE, quote="auto",
}
.Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append,
row.names, col.names, logical01, scipen, dateTimeAs, buffMB, nThread,
showProgress, is_gzip, compressLevel, bom, yaml, verbose, encoding, forceDecimal)
showProgress, is_gzip, is_zstd, compressLevel, bom, yaml, verbose, encoding, forceDecimal)
invisible()
}

haszlib = function() .Call(Cdt_has_zlib)
haszstd = function() .Call(Cdt_has_zstd)
28 changes: 27 additions & 1 deletion configure
Original file line number Diff line number Diff line change
Expand Up @@ -175,12 +175,38 @@ sed -e "s|@PKG_CFLAGS@|$PKG_CFLAGS|" -e "s|@PKG_LIBS@|$PKG_LIBS|" src/Makevars.i

# optional dependency on zlib
if [ "$NOZLIB" = "1" ]; then
echo "*** Compilation without compression support in fwrite"
echo "*** Compilation without gzip compression support in fwrite"
sed -e "s|@zlib_cflags@|-DNOZLIB|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars
sed -e "s|@zlib_libs@||" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars
else
sed -e "s|@zlib_cflags@|${cflag}|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars
sed -e "s|@zlib_libs@|${lib}|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars
fi

# optional dependency on zstd
NOZSTD=1
pkg-config --exists libzstd
if [ $? -ne 0 ]; then
echo "*** pkg-config cannot find libzstd; zstd compression will be disabled."
echo "*** To enable zstd support, install the zstd development library:"
echo "*** deb: libzstd-dev (Debian, Ubuntu, ...)"
echo "*** rpm: libzstd-devel (Fedora, EPEL, ...)"
echo "*** brew: zstd (macOS)"
else
NOZSTD=0
zstd_lib=`pkg-config --libs libzstd`
zstd_cflag=`pkg-config --cflags libzstd`
zstd_version=`pkg-config --modversion libzstd`
echo "zstd ${zstd_version} is available ok"
fi

if [ "$NOZSTD" = "1" ]; then
echo "*** Compilation without zstd compression support in fwrite/fread"
sed -e "s|@zstd_cflags@|-DNOZSTD|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars
sed -e "s|@zstd_libs@||" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars
else
sed -e "s|@zstd_cflags@|${zstd_cflag}|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars
sed -e "s|@zstd_libs@|${zstd_lib}|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars
fi

exit 0
33 changes: 33 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
which.last = data.table:::which.last
`-.IDate` = data.table:::`-.IDate`
haszlib = data.table:::haszlib
haszstd = data.table:::haszstd

# Also, for functions that are masked by other packages, we need to map the data.table one. Or else,
# the other package's function would be picked up. As above, we only need to do this because we desire
Expand Down Expand Up @@ -9767,6 +9768,29 @@ if (haszlib()) {
unlink(f)
}

# fwrite/fread zstd compress # mirror tests from above
if (!haszstd()) {
test(1658.601, fwrite(data.table(a=1), file=tempfile(), compress="zstd"), error="header files were not found at the time data.table was compiled")
} else {
test(1658.61, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="zstd"), output='a,b\n1,1\n2,2\n3,3') # compress ignored on console
DT = data.table(a=rep(1:2,each=100), b=rep(1:4,each=25))
test(1658.621, fwrite(DT, file=f1<-tempfile(fileext=".zst"), verbose=TRUE), NULL,
output="args.nrow=200 args.ncol=2.*maxLineLen=5[12].*Writing 200 rows in 1 batches of 200 rows.*nth=1")
test(1658.622, fwrite(DT, file=f2<-tempfile()), NULL)
test(1658.623, file.info(f1)$size < file.info(f2)$size)
test(1658.63, fread(f1), DT)
fwrite(DT, file=f3<-tempfile(), compress="zstd")
fwrite(DT, file=f4<-tempfile(), compress="zstd", compressLevel=1)
fwrite(DT, file=f5<-tempfile(), compress="zstd", compressLevel=22)
fwrite(DT, file=f6<-tempfile(), compress="zstd", col.names=FALSE)
test(1658.641, file.info(f3)$size, file.info(f1)$size)
test(1658.642, file.info(f4)$size >= file.info(f1)$size)
test(1658.643, file.info(f1)$size >= file.info(f5)$size)
test(1658.644, fread(f6, col.names=c("a","b")), DT)
test(1658.645, fread(f3), DT)
unlink(c(f1,f2,f3,f4,f5,f6))
}

# complex column support for fwrite, part of #3690
DT = data.table(a=1:3, z=0:2 - (2:0)*1i)
test(1658.55, fwrite(DT), output='a,z\n1,0-2i\n2,1-1i\n3,2+0i')
Expand Down Expand Up @@ -18481,6 +18505,15 @@ if (test_R.utils) {
R.utils::bzip2(tmp, tmp2 <- tempfile(), remove=FALSE) # _not_ with .bz2 extension
test(2273.4, fread(tmp2), DT)
file.remove(tmp2)
con = xzfile(tmp2 <- tempfile(fileext=".xz"), open="wt")
write.table(DT, con, sep = ",", row.names = FALSE, col.names = TRUE, quote = FALSE)
close(con)
test(2273.5, fread(tmp2), DT)
con = xzfile(tmp3 <- tempfile(), open="wt")
write.table(DT, con, sep = ",", row.names = FALSE, col.names = TRUE, quote = FALSE)
close(con)
test(2273.6, fread(tmp3), DT)
file.remove(c(tmp2, tmp3))
}
file.remove(tmp)

Expand Down
6 changes: 3 additions & 3 deletions man/fwrite.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ fwrite(x, file = "", append = FALSE, quote = "auto",
dateTimeAs = c("ISO","squash","epoch","write.csv"),
buffMB = 8L, nThread = getDTthreads(verbose),
showProgress = getOption("datatable.showProgress", interactive()),
compress = c("auto", "none", "gzip"),
compressLevel = 6L,
compress = c("auto", "none", "gzip", "zstd"),
compressLevel = NULL,
yaml = FALSE,
bom = FALSE,
verbose = getOption("datatable.verbose", FALSE),
Expand Down Expand Up @@ -58,7 +58,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto",
\item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.}
\item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. }
\item{compress}{If \code{compress = "auto"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "auto"}.}
\item{compressLevel}{Level of compression between 1 and 9, 6 by default. See \url{https://www.gnu.org/software/gzip/manual/html_node/Invoking-gzip.html} for details.}
\item{compressLevel}{Level of compression. For gzip, an integer between 0 and 9 (default 6). For zstd, an integer between 1 and 22 (default 3). When \code{NULL} (default), the format default is used. See \url{https://www.gnu.org/software/gzip/manual/html_node/Invoking-gzip.html} or \url{https://facebook.github.io/zstd/zstd_manual.html} for details.}
\item{yaml}{If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. See \code{Details}. }
\item{bom}{If \code{TRUE} a BOM (Byte Order Mark) sequence (EF BB BF) is added at the beginning of the file; format 'UTF-8 with BOM'.}
\item{verbose}{Be chatty and report timings?}
Expand Down
4 changes: 2 additions & 2 deletions src/Makevars.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
PKG_CFLAGS = $(C_VISIBILITY) @PKG_CFLAGS@ @zlib_cflags@
PKG_LIBS = $(C_VISIBILITY) @PKG_LIBS@ @zlib_libs@
PKG_CFLAGS = $(C_VISIBILITY) @PKG_CFLAGS@ @zlib_cflags@ @zstd_cflags@
PKG_LIBS = $(C_VISIBILITY) @PKG_LIBS@ @zlib_libs@ @zstd_libs@
# See WRE $1.2.1.1. But retain user supplied PKG_* too, #4664.
# WRE states ($1.6) that += isn't portable and that we aren't allowed to use it.
# Otherwise we could use the much simpler PKG_LIBS += @openmp_cflags@ -lz.
Expand Down
5 changes: 4 additions & 1 deletion src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ SEXP chmatchdup_R(SEXP, SEXP, SEXP);
SEXP chin_R(SEXP, SEXP);
SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP spillConnectionToFile(SEXP, SEXP, SEXP);
SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP rbindlist(SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP setlistelt(SEXP, SEXP, SEXP);
SEXP setS4elt(SEXP, SEXP, SEXP);
Expand Down Expand Up @@ -469,6 +469,9 @@ SEXP allNAR(SEXP);
SEXP test_dt_win_snprintf(void);
SEXP dt_zlib_version(void);
SEXP dt_has_zlib(void);
SEXP dt_zstd_version(void);
SEXP dt_has_zstd(void);
SEXP dt_zstd_decompress(SEXP, SEXP);
SEXP startsWithAny(SEXP, SEXP, SEXP);
SEXP convertDate(SEXP, SEXP);
SEXP fastmean(SEXP);
Loading
Loading