>From 41e9fbba7e2dc3a9756f85df5dd5074c0680208e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Thu, 2 Oct 2014 14:07:42 +0100 Subject: [PATCH 1/3] cp: make hole detection size independent of I/O size Previously cp would not detect runs of NULs that were smaller than the buffer size used for I/O (currently 128KiB). * src/copy.c (copy_reg): Use an independent hole_size, set to st_blksize, to increase the chances of detecting a representable hole, in a run of NULs read from the input. * tests/cp/sparse.sh: Add test cases for various sparse chunk sizes. * NEWS: Mention the improvement. --- NEWS | 3 + src/copy.c | 114 +++++++++++++++++++++++++++++++++------------------- tests/cp/sparse.sh | 28 +++++++++++++ 3 files changed, 103 insertions(+), 42 deletions(-) diff --git a/NEWS b/NEWS index 1811ae4..785773f 100644 --- a/NEWS +++ b/NEWS @@ -30,6 +30,9 @@ GNU coreutils NEWS -*- outline -*- ** Improvements + cp will convert smaller runs of NULs in the input to holes, + to reduce allocation in the copy. + mv will try a reflink before falling back to a standard copy, which is more efficient when moving files across BTRFS subvolume boundaries. diff --git a/src/copy.c b/src/copy.c index b7baee4..12af6db 100644 --- a/src/copy.c +++ b/src/copy.c @@ -158,7 +158,7 @@ utimens_symlink (char const *file, struct timespec const *timespec) bytes read. 
*/ static bool sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, - bool make_holes, + size_t hole_size, bool make_holes, char const *src_name, char const *dst_name, uintmax_t max_n_read, off_t *total_n_read, bool *last_write_made_hole) @@ -168,8 +168,6 @@ sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, while (max_n_read) { - bool make_hole = false; - ssize_t n_read = read (src_fd, buf, MIN (max_n_read, buf_size)); if (n_read < 0) { @@ -183,47 +181,77 @@ sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, max_n_read -= n_read; *total_n_read += n_read; - if (make_holes) + /* Loop over the input buffer in chunks of hole_size. */ + bool make_hole = false; + size_t csize = make_holes ? hole_size : buf_size; + char *cbuf = buf; + size_t psize = 0; + char *pbuf = buf; + + while (n_read) { - /* Sentinel required by is_nul(). */ - buf[n_read] = '\1'; -#ifdef lint - typedef uintptr_t word; - /* Usually, buf[n_read] is not the byte just before a "word" - (aka uintptr_t) boundary. In that case, the word-oriented - test below (*wp++ == 0) would read some uninitialized bytes - after the sentinel. To avoid false-positive reports about - this condition (e.g., from a tool like valgrind), set the - remaining bytes -- to any value. */ - memset (buf + n_read + 1, 0, sizeof (word) - 1); -#endif + bool prev_hole = make_hole; + csize = MIN (csize, n_read); - if ((make_hole = is_nul (buf, n_read))) + if (make_holes && csize) { - if (lseek (dest_fd, n_read, SEEK_CUR) < 0) - { - error (0, errno, _("cannot lseek %s"), quote (dst_name)); - return false; - } + /* Setup sentinel required by is_nul(). 
*/ + typedef uintptr_t word; + word isnul_tmp; + memcpy (&isnul_tmp, cbuf + csize, sizeof (word)); + memset (cbuf + csize, 1, sizeof (word)); + + make_hole = is_nul (cbuf, csize); + + memcpy (cbuf + csize, &isnul_tmp, sizeof (word)); } - } - if (!make_hole) - { - size_t n = n_read; - if (full_write (dest_fd, buf, n) != n) + bool transition = (make_hole != prev_hole) && psize; + bool last_chunk = (n_read == csize) || ! csize; + + if (transition || last_chunk) { - error (0, errno, _("error writing %s"), quote (dst_name)); - return false; + if (! transition) + psize += csize; + + if (! prev_hole) + { + if (full_write (dest_fd, pbuf, psize) != psize) + { + error (0, errno, _("error writing %s"), quote (dst_name)); + return false; + } + } + else + { + if (lseek (dest_fd, psize, SEEK_CUR) < 0) + { + error (0, errno, _("cannot lseek %s"), quote (dst_name)); + return false; + } + } + + pbuf += psize; + psize = csize; + + if (transition && last_chunk) + csize = 0; + else if (! csize) + n_read = 0; } + else /* Coalesce writes/seeks. */ + psize += csize; - /* It is tempting to return early here upon a short read from a - regular file. That would save the final read syscall for each - file. Unfortunately that doesn't work for certain files in - /proc with linux kernels from at least 2.6.9 .. 2.6.29. */ + n_read -= csize; + cbuf += csize; } *last_write_made_hole = make_hole; + + /* It's tempting to break early here upon a short read from + a regular file. That would save the final read syscall + for each file. Unfortunately that doesn't work for + certain files in /proc or /sys with linux kernels. */ } return true; @@ -290,7 +318,8 @@ write_zeros (int fd, off_t n_bytes) return false. 
*/ static bool extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, - off_t src_total_size, enum Sparse_type sparse_mode, + size_t hole_size, off_t src_total_size, + enum Sparse_type sparse_mode, char const *src_name, char const *dst_name, bool *require_normal_copy) { @@ -331,7 +360,7 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, { off_t ext_start; off_t ext_len; - off_t hole_size; + off_t ext_hole_size; if (i < scan.ei_count) { @@ -345,11 +374,11 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, ext_len = 0; } - hole_size = ext_start - last_ext_start - last_ext_len; + ext_hole_size = ext_start - last_ext_start - last_ext_len; wrote_hole_at_eof = false; - if (hole_size) + if (ext_hole_size) { if (lseek (src_fd, ext_start, SEEK_SET) < 0) { @@ -374,9 +403,9 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, /* When not inducing holes and when there is a hole between the end of the previous extent and the beginning of the current one, write zeros to the destination file. */ - off_t nzeros = hole_size; + off_t nzeros = ext_hole_size; if (empty_extent) - nzeros = MIN (src_total_size - dest_pos, hole_size); + nzeros = MIN (src_total_size - dest_pos, ext_hole_size); if (! write_zeros (dest_fd, nzeros)) { @@ -409,7 +438,7 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, empty_extent = false; last_ext_len = ext_len; - if ( ! sparse_copy (src_fd, dest_fd, buf, buf_size, + if ( ! 
sparse_copy (src_fd, dest_fd, buf, buf_size, hole_size, sparse_mode == SPARSE_ALWAYS, src_name, dst_name, ext_len, &n_read, &wrote_hole_at_eof)) @@ -1105,6 +1134,7 @@ copy_reg (char const *src_name, char const *dst_name, size_t buf_alignment = lcm (getpagesize (), sizeof (word)); size_t buf_alignment_slop = sizeof (word) + buf_alignment - 1; size_t buf_size = io_blksize (sb); + size_t hole_size = ST_BLKSIZE (sb); fdadvise (source_desc, 0, 0, FADVISE_SEQUENTIAL); @@ -1164,7 +1194,7 @@ copy_reg (char const *src_name, char const *dst_name, standard copy only if the initial extent scan fails. If the '--sparse=never' option is specified, write all data but use any extents to read more efficiently. */ - if (extent_copy (source_desc, dest_desc, buf, buf_size, + if (extent_copy (source_desc, dest_desc, buf, buf_size, hole_size, src_open_sb.st_size, S_ISREG (sb.st_mode) ? x->sparse_mode : SPARSE_NEVER, src_name, dst_name, &normal_copy_required)) @@ -1179,7 +1209,7 @@ copy_reg (char const *src_name, char const *dst_name, off_t n_read; bool wrote_hole_at_eof; - if ( ! sparse_copy (source_desc, dest_desc, buf, buf_size, + if ( ! 
sparse_copy (source_desc, dest_desc, buf, buf_size, hole_size, make_holes, src_name, dst_name, UINTMAX_MAX, &n_read, &wrote_hole_at_eof) diff --git a/tests/cp/sparse.sh b/tests/cp/sparse.sh index d6cc4c4..1414d35 100755 --- a/tests/cp/sparse.sh +++ b/tests/cp/sparse.sh @@ -37,4 +37,32 @@ test $(stat --printf %b copy) -le $(stat --printf %b sparse) || fail=1 cp --sparse=always --reflink sparse copy && fail=1 cp --sparse=never --reflink sparse copy && fail=1 + +# Ensure we handle sparse/non-sparse transitions correctly +maxn=128 # how many $hole_size chunks per file +hole_size=$(stat -c %o copy) +dd if=/dev/zero bs=$hole_size count=$maxn of=zeros +tr '\0' '\1' < zeros > ones + +for n in 1 2 3 4 32 $maxn; do + parts=$(expr $maxn / $n) + + rm -f sparse.in + + # Generate sparse file for copying with alternating + # hole/data patterns of size n * $hole_size + for i in $(yes zeros | sed 1~2s/zeros/ones/ | head -n$parts); do + dd iflag=fullblock if=$i of=sparse.in conv=notrunc oflag=append \ + bs=$hole_size count=$n status=none || framework_failure_ + done + + cp --sparse=always sparse.in sparse.out || fail=1 # non sparse input + cp --sparse=always sparse.out sparse.out2 || fail=1 # sparse input + + cmp sparse.in sparse.out || fail=1 + cmp sparse.in sparse.out2 || fail=1 + + ls -lsh sparse.* +done + Exit $fail -- 1.7.7.6 >From b3a933cff8e61e57f9e09931daecb1440849d80d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Mon, 6 Oct 2014 10:19:58 +0100 Subject: [PATCH 2/3] cp: avoid speculative preallocation with --sparse=always With --sparse=always use fallocate(...PUNCH_HOLE...) to avoid any permanent allocation due to speculative preallocation employed by file systems such as XFS. * m4/jm-macros.m4: Check for <linux/falloc.h> and fallocate(). * src/copy.c (punch_hole): A new function to try and punch a hole at the specified offset if supported. (sparse_copy): Call punch_hole() after requesting a hole. (extent_copy): Likewise. * NEWS: Mention the improvement. 
--- NEWS | 4 +- m4/jm-macros.m4 | 2 + src/copy.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 62 insertions(+), 11 deletions(-) diff --git a/NEWS b/NEWS index 785773f..c48806f 100644 --- a/NEWS +++ b/NEWS @@ -30,8 +30,8 @@ GNU coreutils NEWS -*- outline -*- ** Improvements - cp will convert smaller runs of NULs in the input to holes, - to reduce allocation in the copy. + cp will convert smaller runs of NULs in the input to holes, and with + --sparse=always avoid speculative preallocation on XFS for example. mv will try a reflink before falling back to a standard copy, which is more efficient when moving files across BTRFS subvolume boundaries. diff --git a/m4/jm-macros.m4 b/m4/jm-macros.m4 index a96ecab..07b9085 100644 --- a/m4/jm-macros.m4 +++ b/m4/jm-macros.m4 @@ -78,6 +78,7 @@ AC_DEFUN([coreutils_MACROS], AC_CHECK_FUNCS_ONCE([ endgrent endpwent + fallocate fchown fchmod ftruncate @@ -189,6 +190,7 @@ AC_DEFUN([gl_CHECK_ALL_HEADERS], [ AC_CHECK_HEADERS_ONCE([ hurd.h + linux/falloc.h paths.h priv.h stropts.h diff --git a/src/copy.c b/src/copy.c index 12af6db..f39186f 100644 --- a/src/copy.c +++ b/src/copy.c @@ -70,6 +70,10 @@ # include "verror.h" #endif +#if HAVE_LINUX_FALLOC_H +# include <linux/falloc.h> +#endif + #ifndef HAVE_FCHOWN # define HAVE_FCHOWN false # define fchown(fd, uid, gid) (-1) @@ -145,6 +149,26 @@ utimens_symlink (char const *file, struct timespec const *timespec) return err; } +/* Attempt to punch a hole to avoid any permanent + speculative preallocation on file systems such as XFS. + Return values as per fallocate(2) except ENOSYS etc. are ignored. 
*/ + +static int +punch_hole (int fd, off_t offset, off_t length) +{ + int ret = 0; +#if HAVE_FALLOCATE +# if defined FALLOC_FL_PUNCH_HOLE && defined FALLOC_FL_KEEP_SIZE + ret = fallocate (fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, length); + if (ret < 0 + && (errno == EOPNOTSUPP || errno == ENOTSUP || errno == ENOSYS)) + ret = 0; +# endif +#endif + return ret; +} + /* Copy the regular file open on SRC_FD/SRC_NAME to DST_FD/DST_NAME, honoring the MAKE_HOLES setting and using the BUF_SIZE-byte buffer BUF for temporary storage. Copy no more than MAX_N_READ bytes. @@ -158,7 +182,7 @@ utimens_symlink (char const *file, struct timespec const *timespec) bytes read. */ static bool sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, - size_t hole_size, bool make_holes, + size_t hole_size, bool punch_holes, char const *src_name, char const *dst_name, uintmax_t max_n_read, off_t *total_n_read, bool *last_write_made_hole) @@ -183,7 +207,7 @@ sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, /* Loop over the input buffer in chunks of hole_size. */ bool make_hole = false; - size_t csize = make_holes ? hole_size : buf_size; + size_t csize = hole_size ? hole_size : buf_size; char *cbuf = buf; size_t psize = 0; char *pbuf = buf; @@ -193,7 +217,7 @@ sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, bool prev_hole = make_hole; csize = MIN (csize, n_read); - if (make_holes && csize) + if (hole_size && csize) { /* Setup sentinel required by is_nul(). 
*/ typedef uintptr_t word; @@ -224,11 +248,20 @@ sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, } else { - if (lseek (dest_fd, psize, SEEK_CUR) < 0) + off_t file_end = lseek (dest_fd, psize, SEEK_CUR); + if (file_end < 0) { error (0, errno, _("cannot lseek %s"), quote (dst_name)); return false; } + + if (punch_holes + && punch_hole (dest_fd, file_end - psize, psize) < 0) + { + error (0, errno, _("error deallocating %s"), + quote (dst_name)); + return false; + } } pbuf += psize; @@ -396,6 +429,14 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, error (0, errno, _("cannot lseek %s"), quote (dst_name)); goto fail; } + if (sparse_mode == SPARSE_ALWAYS + && punch_hole (dest_fd, ext_start - ext_hole_size, + ext_hole_size) < 0) + { + error (0, errno, _("error deallocating %s"), + quote (dst_name)); + goto fail; + } wrote_hole_at_eof = true; } else @@ -438,9 +479,9 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, empty_extent = false; last_ext_len = ext_len; - if ( ! sparse_copy (src_fd, dest_fd, buf, buf_size, hole_size, - sparse_mode == SPARSE_ALWAYS, - src_name, dst_name, ext_len, &n_read, + if ( ! sparse_copy (src_fd, dest_fd, buf, buf_size, + sparse_mode == SPARSE_ALWAYS ? hole_size: 0, + true, src_name, dst_name, ext_len, &n_read, &wrote_hole_at_eof)) goto fail; @@ -482,6 +523,13 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, return false; } + if (sparse_mode == SPARSE_ALWAYS && dest_pos < src_total_size + && punch_hole (dest_fd, dest_pos, src_total_size - dest_pos) < 0) + { + error (0, errno, _("error deallocating %s"), quote (dst_name)); + return false; + } + return true; } @@ -1209,8 +1257,9 @@ copy_reg (char const *src_name, char const *dst_name, off_t n_read; bool wrote_hole_at_eof; - if ( ! sparse_copy (source_desc, dest_desc, buf, buf_size, hole_size, - make_holes, src_name, dst_name, + if ( ! sparse_copy (source_desc, dest_desc, buf, buf_size, + make_holes ? 
hole_size : 0, + x->sparse_mode == SPARSE_ALWAYS, src_name, dst_name, UINTMAX_MAX, &n_read, &wrote_hole_at_eof) || (wrote_hole_at_eof -- 1.7.7.6 >From 8b7dea30a1ab95c9ae721f9e261ec2db92bfd76d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Mon, 6 Oct 2014 11:02:34 +0100 Subject: [PATCH 3/3] cp: read sparse files more efficiently with non regular destination * src/copy.c (copy_reg): Use fiemap to read sparse files, even if the output is not to a regular file. * NEWS: Mention the improvement. --- NEWS | 3 +++ src/copy.c | 5 ++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index c48806f..8ef5443 100644 --- a/NEWS +++ b/NEWS @@ -33,6 +33,9 @@ GNU coreutils NEWS -*- outline -*- cp will convert smaller runs of NULs in the input to holes, and with --sparse=always avoid speculative preallocation on XFS for example. + cp will read sparse files more efficiently when the destination is a + non regular file. For example when copying a disk image to a device node. + mv will try a reflink before falling back to a standard copy, which is more efficient when moving files across BTRFS subvolume boundaries. diff --git a/src/copy.c b/src/copy.c index f39186f..477deaa 100644 --- a/src/copy.c +++ b/src/copy.c @@ -1188,7 +1188,7 @@ copy_reg (char const *src_name, char const *dst_name, /* Deal with sparse files. */ bool make_holes = false; - bool sparse_src = false; + bool sparse_src = is_probably_sparse (&src_open_sb); if (S_ISREG (sb.st_mode)) { @@ -1201,7 +1201,6 @@ copy_reg (char const *src_name, char const *dst_name, blocks. If the file has fewer blocks than would normally be needed for a file of its size, then at least one of the blocks in the file is a hole. */ - sparse_src = is_probably_sparse (&src_open_sb); if (x->sparse_mode == SPARSE_AUTO && sparse_src) make_holes = true; } @@ -1244,7 +1243,7 @@ copy_reg (char const *src_name, char const *dst_name, any extents to read more efficiently. 
*/ if (extent_copy (source_desc, dest_desc, buf, buf_size, hole_size, src_open_sb.st_size, - S_ISREG (sb.st_mode) ? x->sparse_mode : SPARSE_NEVER, + make_holes ? x->sparse_mode : SPARSE_NEVER, src_name, dst_name, &normal_copy_required)) goto preserve_metadata; -- 1.7.7.6