qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 1/3 v9] add-cow file format


From: Stefan Hajnoczi
Subject: Re: [Qemu-devel] [PATCH 1/3 v9] add-cow file format
Date: Tue, 29 May 2012 16:50:08 +0100

> +    image_sectors = image_bs->total_sectors;

Please use bdrv_getlength() instead of accessing total_sectors directly.

> +    image_drv = bdrv_find_format("raw");
> +    ret = bdrv_open(s->image_hd, image_filename, flags, image_drv);
> +    if (ret < 0) {
> +        bdrv_delete(s->image_hd);
> +        goto fail;
> +    }
> +    bs->total_sectors = s->image_hd->total_sectors;

bdrv_getlength()

> +static coroutine_fn int add_cow_is_allocated(BlockDriverState *bs,
> +        int64_t sector_num, int nb_sectors, int *num_same)
> +{
> +    int changed;
> +    int64_t cluster_num;
> +
> +    if (nb_sectors == 0) {
> +        *num_same = 0;
> +        return 0;
> +    }
> +
> +    cluster_num = sector_num / SECTORS_PER_CLUSTER;
> +    changed = is_allocated(bs, sector_num);

Do we need to hold the lock before (indirectly) accessing the cache?

> +    *num_same =
> +        MIN(nb_sectors, (cluster_num + 1) * SECTORS_PER_CLUSTER - 
> sector_num);
> +
> +    for (cluster_num = sector_num / SECTORS_PER_CLUSTER + 1;
> +        cluster_num <= (sector_num + nb_sectors - 1) / SECTORS_PER_CLUSTER;
> +        cluster_num++) {
> +        if (is_allocated(bs, cluster_num * SECTORS_PER_CLUSTER) != changed) {
> +            break;
> +        }
> +        *num_same = MIN(nb_sectors,
> +            (cluster_num + 1) * SECTORS_PER_CLUSTER - sector_num);
> +    }

I think this makes sense but it would be easier to use a loop counter
that is in sector units instead of clusters.  Then you can calculate
*num_name after the loop simply by subtracting the starting sector_num
from the final cur_sector value.  And it saves you from constantly
converting back and forth between clusters and sectors.

> +static coroutine_fn int add_cow_co_writev(BlockDriverState *bs,
> +        int64_t sector_num, int remaining_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int ret = 0, i;
> +    QEMUIOVector hd_qiov;
> +    uint8_t *table;
> +    bool changed = false;
> +
> +    qemu_co_mutex_lock(&s->lock);
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +    ret = bdrv_co_writev(s->image_hd,
> +                     sector_num,
> +                     remaining_sectors, qiov);

We don't need to lock across all writes.

lock()
if write requires allocation:
    ...do allocation stuff under lock...
unlock()
write data

Internal data structure (cache, metadata, and copying unmodified
sectors) access needs to be locked during allocating writes.  The
guest's data can be written without the lock held.

This means that most writes will only lock for a short time to check
that the clusters are already allocated.  Then they will be able to
write in parallel.

If any cluster is not yet allocated then the allocation needs to
happen under lock.  This ensures that we don't copy backing file
sectors while processing another write request that touches those
sectors.

> +
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +    /* copy content of unmodified sectors */
> +    if (!is_cluster_head(sector_num) && !is_allocated(bs, sector_num)) {
> +        qemu_co_mutex_unlock(&s->lock);
> +        ret = copy_sectors(bs, sector_num & ~(SECTORS_PER_CLUSTER - 1),
> +            sector_num);

As mentioned above, I think we need to lock during copy_sectors() so
that other requests cannot race with this.  (The guest's writes must
always take priority over copying unmodified sectors.)

> +        qemu_co_mutex_lock(&s->lock);
> +        if (ret < 0) {
> +            goto fail;
> +        }
> +    }
> +
> +    if (!is_cluster_tail(sector_num + remaining_sectors - 1)
> +        && !is_allocated(bs, sector_num + remaining_sectors - 1)) {
> +        qemu_co_mutex_unlock(&s->lock);
> +        ret = copy_sectors(bs, sector_num + remaining_sectors,
> +            ((sector_num + remaining_sectors) | (SECTORS_PER_CLUSTER - 1)) + 
> 1);
> +        qemu_co_mutex_lock(&s->lock);
> +        if (ret < 0) {
> +            goto fail;
> +        }
> +    }
> +
> +    for (i = sector_num / SECTORS_PER_CLUSTER;
> +        i <= (sector_num + remaining_sectors - 1) / SECTORS_PER_CLUSTER;
> +        i++) {
> +        ret = add_cow_cache_get(bs, s->bitmap_cache,
> +            i * SECTORS_PER_CLUSTER, (void **)&table);
> +        if (ret < 0) {
> +            return 0;

return ret?

> +        }
> +        if ((table[i / 8] & (1 << (i % 8))) == 0) {
> +            table[i / 8] |= (1 << (i % 8));
> +            changed = true;
> +            add_cow_cache_entry_mark_dirty(s->bitmap_cache, table);
> +        }
> +
> +    }
> +    ret = 0;
> +fail:
> +    if (changed) {
> +        ret = add_cow_cache_flush(bs, s->bitmap_cache);
> +    }
> +    qemu_co_mutex_unlock(&s->lock);
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
> +
> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t size)
> +{
> +    BDRVAddCowState *s = bs->opaque;
> +    int sector_per_byte = SECTORS_PER_CLUSTER * 8;
> +    int ret;
> +    int64_t old_image_sector = s->image_hd->total_sectors;
> +    int64_t bitmap_size =
> +        (size / BDRV_SECTOR_SIZE + sector_per_byte - 1) / sector_per_byte;
> +
> +    ret = bdrv_truncate(bs->file,
> +        sizeof(AddCowHeader) + bitmap_size);
> +    if (ret < 0) {
> +        bdrv_truncate(s->image_hd, old_image_sector * BDRV_SECTOR_SIZE);

Why truncate image_hd on failure?  We never touch the image_hd size on
success either.  I think we can just leave it alone.

> +typedef struct AddCowHeader {
> +    uint64_t        magic;
> +    uint32_t        version;
> +    char            backing_file[ADD_COW_FILE_LEN];
> +    char            image_file[ADD_COW_FILE_LEN];
> +    char            reserved[500];
> +} QEMU_PACKED AddCowHeader;

I'd be tempted to start the bitmap at 4096 bytes into the file.  On 4
KB block size disks that would be a nice alignment and it doesn't
waste much additional space (the disk images we're trying to manage
are many GB :)).

> @@ -828,6 +832,41 @@ static int img_convert(int argc, char **argv)
>     }
>
>     /* Create the new image */
> +
> +    if (0 == strcmp(out_fmt, "add-cow")) {
> +        image_drv = bdrv_find_format("raw");
> +        if (!drv) {
> +            ret = -1;
> +            goto out;
> +        }
> +        snprintf(image_filename, sizeof(image_filename),
> +            "%s"".ct.raw", out_filename);
> +        ret = bdrv_create(image_drv, image_filename, image_param);
> +        if (ret < 0) {
> +            error_report("%s: error while creating image_file: %s",
> +                     image_filename, strerror(-ret));
> +            goto out;
> +        }
> +        set_option_parameter(param, BLOCK_OPT_IMAGE_FILE, image_filename);
> +
> +        if (!out_baseimg) {
> +            backing_drv = bdrv_find_format("qcow2");
> +            if (!drv) {
> +                ret = -1;
> +                goto out;
> +            }
> +            snprintf(backing_filename, sizeof(backing_filename),
> +                "%s"".ct.qcow2", out_filename);
> +            ret = bdrv_create(backing_drv, backing_filename, image_param);
> +            if (ret < 0) {
> +                error_report("%s: error while creating backing_file: %s",
> +                         backing_filename, strerror(-ret));
> +                goto out;
> +            }
> +            set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
> +                backing_filename);
> +        }
> +    }

If this diff hunk is dropped then the user needs to manually create
the raw file before running qemu-img convert?

qemu-img convert -O add-cow seems like a very rare case.  I'm not sure
we should add special user-friend hacks for this.

I'm not sure I understand why you create a qcow2 file either.

Stefan



reply via email to

[Prev in Thread] Current Thread [Next in Thread]