Re: [Qemu-devel] [PATCH v2 2/3] Simpletrace v2: Add support for multiple

qemu-devel
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH v2 2/3] Simpletrace v2: Add support for multiple

From:	Stefan Hajnoczi
Subject:	Re: [Qemu-devel] [PATCH v2 2/3] Simpletrace v2: Add support for multiple args, strings.
Date:	Thu, 7 Jun 2012 15:32:28 +0100
On Thu, May 24, 2012 at 10:50 AM, Harsh Prateek Bora
<address@hidden> wrote:
> A newer tracelog format which gets rid of fixed size trace records and
> therefore allows to trace multiple arguments as well as strings in trace
> events.
>
> Sample trace:
> v9fs_version 0.000 tag=0xffff id=0x64 msize=0x2000 version=9P2000.L
> v9fs_version_return 6.705 tag=0xffff id=0x64 msize=0x2000 version=9P2000.L
> v9fs_attach 174.467 tag=0x1 id=0x68 fid=0x0 afid=0xffffffffffffffff
> uname=nobody aname=
> v9fs_attach_return 4720.454 tag=0x1 id=0x68 type=0xffffffffffffff80
> version=0x4f2a4dd0  path=0x220ea6
>
> Signed-off-by: Harsh Prateek Bora <address@hidden>
> ---
>  scripts/tracetool/backend/simple.py |   84 ++++++++++---
>  trace/simple.c                      |  229 
> ++++++++++++++++++++++-------------
>  trace/simple.h                      |   38 +++++-
>  3 files changed, 240 insertions(+), 111 deletions(-)

Sorry for the delay.  Mostly easy to fix comments, some care still
needed to avoid race conditions and use memory barriers in the right
places.

> diff --git a/trace/simple.c b/trace/simple.c
> index a6583d3..4e3ae65 100644
> --- a/trace/simple.c
> +++ b/trace/simple.c
> @@ -27,7 +27,7 @@
>  #define HEADER_MAGIC 0xf2b177cb0aa429b4ULL
>
>  /** Trace file version number, bump if format changes */
> -#define HEADER_VERSION 0
> +#define HEADER_VERSION 2
>
>  /** Records were dropped event ID */
>  #define DROPPED_EVENT_ID (~(uint64_t)0 - 1)
> @@ -35,23 +35,6 @@
>  /** Trace record is valid */
>  #define TRACE_RECORD_VALID ((uint64_t)1 << 63)
>
> -/** Trace buffer entry */
> -typedef struct {
> -    uint64_t event;
> -    uint64_t timestamp_ns;
> -    uint64_t x1;
> -    uint64_t x2;
> -    uint64_t x3;
> -    uint64_t x4;
> -    uint64_t x5;
> -    uint64_t x6;
> -} TraceRecord;
> -
> -enum {
> -    TRACE_BUF_LEN = 4096,
> -    TRACE_BUF_FLUSH_THRESHOLD = TRACE_BUF_LEN / 4,
> -};
> -
>  /*
>  * Trace records are written out by a dedicated thread.  The thread waits for
>  * records to become available, writes them out, and then waits again.
> @@ -62,11 +45,49 @@ static GCond *trace_empty_cond;
>  static bool trace_available;
>  static bool trace_writeout_enabled;
>
> -static TraceRecord trace_buf[TRACE_BUF_LEN];
> +enum {
> +    TRACE_BUF_LEN = 4096 * 64,
> +    TRACE_BUF_FLUSH_THRESHOLD = TRACE_BUF_LEN / 4,
> +};
> +
> +uint8_t trace_buf[TRACE_BUF_LEN];
>  static unsigned int trace_idx;
> +static unsigned int writeout_idx;
> +static uint64_t dropped_events;
>  static FILE *trace_fp;
>  static char *trace_file_name = NULL;
>
> +/* * Trace buffer entry */
> +typedef struct {
> +    uint64_t event; /*   TraceEventID */
> +    uint64_t timestamp_ns;
> +    uint32_t length;   /*    in bytes */
> +    uint32_t reserved; /*    unused */
> +    uint8_t arguments[]; /*  arguments position affects ST_REC_HDR_LEN */
> +} TraceRecord;
> +
> +typedef struct {
> +    uint64_t header_event_id; /* HEADER_EVENT_ID */
> +    uint64_t header_magic;    /* HEADER_MAGIC    */
> +    uint64_t header_version;  /* HEADER_VERSION  */
> +} TraceRecordHeader;
> +
> +/* * Trace record header length */
> +#define ST_REC_HDR_LEN sizeof(TraceRecord)

Why #define this?  It's confusing because TraceRecordHeader (notice
the word "header") is what comes to mind since this is named
ST_REC_HDR_LEN.  I suggest using sizeof(TraceRecord) explicitly in the
code and dropping this macro.

> +
> +int trace_alloc_record(TraceBufferRecord *rec, TraceEventID event, uint32_t 
> datasize);
> +static void read_from_buffer(unsigned int idx, uint8_t *dataptr, uint32_t 
> size);
> +static void write_to_buffer(unsigned int idx, uint8_t *dataptr, uint32_t 
> size);
> +void trace_mark_record_complete(TraceBufferRecord *rec);
> +
> +uint32_t safe_strlen(const char* str)
> +{
> +    if (str == NULL) {
> +        return 0;
> +    }
> +    return strlen(str);
> +}
> +
>  /**
>  * Read a trace record from the trace buffer
>  *
> @@ -75,16 +96,22 @@ static char *trace_file_name = NULL;
>  *
>  * Returns false if the record is not valid.
>  */
> -static bool get_trace_record(unsigned int idx, TraceRecord *record)
> +static bool get_trace_record(unsigned int idx, TraceRecord **recordptr)
>  {
> -    if (!(trace_buf[idx].event & TRACE_RECORD_VALID)) {
> +    uint8_t temp_rec[ST_REC_HDR_LEN];
> +    TraceRecord *record = (TraceRecord *) temp_rec;
> +    read_from_buffer(idx, temp_rec, ST_REC_HDR_LEN);
> +
> +    if (!(record->event & TRACE_RECORD_VALID)) {
>         return false;
>     }
>
>     __sync_synchronize(); /* read memory barrier before accessing record */

The need for the memory barrier is no longer clear.  Previously we
were directly accessing the trace ring buffer, and therefore needed to
ensure fields were settled before accessing them.  Now we use
read_from_buffer() which copies the data into our temporary struct on
the stack.

I think the best way of doing it is to read the event field first in a
separate step, then do the read memory barrier, and then read the rest
of the record.  This ensures that the event field is at least as "old"
as the other fields we read.

>
> -    *record = trace_buf[idx];
> -    record->event &= ~TRACE_RECORD_VALID;
> +    *recordptr = g_malloc(record->length);
> +    /* make a copy of record to avoid being overwritten */
> +    read_from_buffer(idx, (uint8_t *)*recordptr, record->length);
> +    (*recordptr)->event &= ~TRACE_RECORD_VALID;
>     return true;
>  }
>
> @@ -120,29 +147,31 @@ static void wait_for_trace_records_available(void)
>
>  static gpointer writeout_thread(gpointer opaque)
>  {
> -    TraceRecord record;
> -    unsigned int writeout_idx = 0;
> -    unsigned int num_available, idx;
> +    TraceRecord *recordptr;
> +    unsigned int idx = 0;
>     size_t unused __attribute__ ((unused));
>
>     for (;;) {
>         wait_for_trace_records_available();
>
> -        num_available = trace_idx - writeout_idx;
> -        if (num_available > TRACE_BUF_LEN) {
> -            record = (TraceRecord){
> -                .event = DROPPED_EVENT_ID,
> -                .x1 = num_available,
> -            };
> -            unused = fwrite(&record, sizeof(record), 1, trace_fp);
> -            writeout_idx += num_available;
> +        if (dropped_events) {
> +            recordptr = g_malloc(ST_REC_HDR_LEN + sizeof(dropped_events));
> +            recordptr->event = DROPPED_EVENT_ID,
> +            recordptr->timestamp_ns = get_clock();
> +            recordptr->length = ST_REC_HDR_LEN + sizeof(dropped_events),
> +            recordptr->reserved = 0;
> +            *(uint64_t *) &(recordptr->arguments[0]) = dropped_events;

memcpy(recordptr->arguments, &dropped_events, sizeof(dropped_events));

About the same ugliness but avoids the unaligned store.

> +            dropped_events = 0;
> +            unused = fwrite(recordptr, recordptr->length, 1, trace_fp);

recordptr is leaked.  It's easiest to allocate the dropped event
record on the stack instead.

>         }
>
> -        idx = writeout_idx % TRACE_BUF_LEN;
> -        while (get_trace_record(idx, &record)) {
> -            trace_buf[idx].event = 0; /* clear valid bit */
> -            unused = fwrite(&record, sizeof(record), 1, trace_fp);
> -            idx = ++writeout_idx % TRACE_BUF_LEN;
> +        while (get_trace_record(idx, &recordptr)) {
> +            unused = fwrite(recordptr, recordptr->length, 1, trace_fp);
> +            writeout_idx += recordptr->length;
> +            g_free(recordptr);
> +            recordptr = (TraceRecord *) &trace_buf[idx];
> +            recordptr->event = 0;

It would be cleaner for get_trace_record() to clear the
TRACE_RECORD_VALID bit.  This way the caller doesn't have to access
the raw trace buffer - and this is critical because recordptr->event
will write outside the bounds of the buffer when idx >= TRACE_BUF_LEN
- 4!

> +            idx = writeout_idx % TRACE_BUF_LEN;
>         }
>
>         fflush(trace_fp);
> @@ -150,72 +179,101 @@ static gpointer writeout_thread(gpointer opaque)
>     return NULL;
>  }
>
> -static void trace(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3,
> -                  uint64_t x4, uint64_t x5, uint64_t x6)
> +int trace_record_start(TraceBufferRecord *rec, TraceEventID id, size_t 
> arglen)
>  {
> -    unsigned int idx;
> -    uint64_t timestamp;
> -
> -    if (!trace_list[event].state) {
> -        return;
> -    }
> -
> -    timestamp = get_clock();
> -
> -    idx = g_atomic_int_exchange_and_add((gint *)&trace_idx, 1) % 
> TRACE_BUF_LEN;
> -    trace_buf[idx] = (TraceRecord){
> -        .event = event,
> -        .timestamp_ns = timestamp,
> -        .x1 = x1,
> -        .x2 = x2,
> -        .x3 = x3,
> -        .x4 = x4,
> -        .x5 = x5,
> -        .x6 = x6,
> -    };
> -    __sync_synchronize(); /* write barrier before marking as valid */
> -    trace_buf[idx].event |= TRACE_RECORD_VALID;
> -
> -    if ((idx + 1) % TRACE_BUF_FLUSH_THRESHOLD == 0) {
> -        flush_trace_file(false);
> -    }
> +    return trace_alloc_record(rec, id, arglen); /* return 0 on success */
>  }
>
> -void trace0(TraceEventID event)
> +void trace_record_write_u64(TraceBufferRecord *rec, uint64_t val)
>  {
> -    trace(event, 0, 0, 0, 0, 0, 0);
> +    write_to_buffer(rec->rec_off, (uint8_t *)&val, sizeof(uint64_t));

It's fine to make the second argument void* if you get tired of
casting to uint8_t.

> +    rec->rec_off += sizeof(uint64_t);
>  }
>
> -void trace1(TraceEventID event, uint64_t x1)
> +void trace_record_write_str(TraceBufferRecord *rec, const char *s)
>  {
> -    trace(event, x1, 0, 0, 0, 0, 0);
> +    /* Write string length first */
> +    uint32_t slen = (s == NULL ? 0 : strlen(s));
> +    write_to_buffer(rec->rec_off, (uint8_t *)&slen, sizeof(slen));
> +    rec->rec_off += sizeof(slen);
> +    /* Write actual string now */
> +    write_to_buffer(rec->rec_off, (uint8_t *)s, slen);
> +    rec->rec_off += slen;
>  }
>
> -void trace2(TraceEventID event, uint64_t x1, uint64_t x2)
> +void trace_record_finish(TraceBufferRecord *rec)
>  {
> -    trace(event, x1, x2, 0, 0, 0, 0);
> +    trace_mark_record_complete(rec);
>  }
>
> -void trace3(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3)
> +int trace_alloc_record(TraceBufferRecord *rec, TraceEventID event, uint32_t 
> datasize)
>  {
> -    trace(event, x1, x2, x3, 0, 0, 0);
> +    unsigned int idx, rec_off;
> +    uint32_t rec_len = ST_REC_HDR_LEN + datasize;
> +    uint64_t timestamp_ns = get_clock();
> +
> +    if ((rec_len + trace_idx - writeout_idx) > TRACE_BUF_LEN) {
> +        /* Trace Buffer Full, Event dropped ! */
> +        dropped_events++;
> +        return 1;

-ENOSPC?  A negative errno is clearer than 0 - success, 1 - failure.

> +    }
> +    idx = g_atomic_int_exchange_and_add((gint *)&trace_idx, rec_len) % 
> TRACE_BUF_LEN;

How does this deal with the race condition of two or more threads
running trace_alloc_record() concurrently when the buffer reaches max
size?  I think two or more threads could think they have enough space
because trace_idx hasn't been updated yet between the buffer length
check and the atomic update.

> +
> +    /*  To check later if threshold crossed */
> +    rec->next_tbuf_idx = trace_idx % TRACE_BUF_LEN;
> +
> +    rec_off = idx;
> +    write_to_buffer(rec_off, (uint8_t*)&event, sizeof(event));
> +    rec_off += sizeof(event);
> +    write_to_buffer(rec_off, (uint8_t*)&timestamp_ns, sizeof(timestamp_ns));
> +    rec_off += sizeof(timestamp_ns);
> +    write_to_buffer(rec_off, (uint8_t*)&rec_len, sizeof(rec_len));
> +    rec_off += sizeof(rec_len);

This suggests a pattern where write_to_buffer() returns the new index:

rec_off = write_to_buffer(rec_off, (uint8_t*)&event, sizeof(event));

That way you don't need to do the separate sizeof() additions.

> +
> +    rec->tbuf_idx = idx;
> +    rec->rec_off  = (idx + ST_REC_HDR_LEN) % TRACE_BUF_LEN;
> +    return 0;
>  }
>
> -void trace4(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3, 
> uint64_t x4)
> +static void read_from_buffer(unsigned int idx, uint8_t *dataptr, uint32_t 
> size)

size_t is the type to use for memory size units.

>  {
> -    trace(event, x1, x2, x3, x4, 0, 0);
> +    uint32_t x = 0;
> +    while (x < size) {
> +        if (idx >= TRACE_BUF_LEN) {
> +            idx = idx % TRACE_BUF_LEN;
> +        }
> +        dataptr[x++] = trace_buf[idx++];
> +    }
>  }
>
> -void trace5(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3, 
> uint64_t x4, uint64_t x5)
> +static void write_to_buffer(unsigned int idx, uint8_t *dataptr, uint32_t 
> size)
>  {
> -    trace(event, x1, x2, x3, x4, x5, 0);
> +    uint32_t x = 0;
> +    while (x < size) {
> +        if (idx >= TRACE_BUF_LEN) {
> +            idx = idx % TRACE_BUF_LEN;
> +        }
> +        trace_buf[idx++] = dataptr[x++];
> +    }
>  }
>
> -void trace6(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3, 
> uint64_t x4, uint64_t x5, uint64_t x6)
> +void trace_mark_record_complete(TraceBufferRecord *rec)
>  {
> -    trace(event, x1, x2, x3, x4, x5, x6);
> +    uint8_t temp_rec[ST_REC_HDR_LEN];
> +    TraceRecord *record = (TraceRecord *) temp_rec;
> +    read_from_buffer(rec->tbuf_idx, temp_rec, ST_REC_HDR_LEN);
> +    __sync_synchronize(); /* write barrier before marking as valid */
> +    record->event |= TRACE_RECORD_VALID;
> +    write_to_buffer(rec->tbuf_idx, temp_rec, ST_REC_HDR_LEN);
> +
> +    if (rec->next_tbuf_idx > TRACE_BUF_FLUSH_THRESHOLD &&
> +        rec->tbuf_idx <= TRACE_BUF_FLUSH_THRESHOLD) {
> +        flush_trace_file(false);
> +    }
>  }
>
> +
> +
>  void st_set_trace_file_enabled(bool enable)
>  {
>     if (enable == !!trace_fp) {
> @@ -228,10 +286,11 @@ void st_set_trace_file_enabled(bool enable)
>     flush_trace_file(true);
>
>     if (enable) {
> -        static const TraceRecord header = {
> -            .event = HEADER_EVENT_ID,
> -            .timestamp_ns = HEADER_MAGIC,
> -            .x1 = HEADER_VERSION,
> +        static const TraceRecordHeader header = {
> +            .header_event_id = HEADER_EVENT_ID,
> +            .header_magic = HEADER_MAGIC,
> +            /* Older log readers will check for version at next location */
> +            .header_version = HEADER_VERSION,
>         };
>
>         trace_fp = fopen(trace_file_name, "wb");
> diff --git a/trace/simple.h b/trace/simple.h
> index 6b5358c..06ea778 100644
> --- a/trace/simple.h
> +++ b/trace/simple.h
> @@ -22,16 +22,40 @@ typedef struct {
>     bool state;
>  } TraceEvent;
>
> -void trace0(TraceEventID event);
> -void trace1(TraceEventID event, uint64_t x1);
> -void trace2(TraceEventID event, uint64_t x1, uint64_t x2);
> -void trace3(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3);
> -void trace4(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3, 
> uint64_t x4);
> -void trace5(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3, 
> uint64_t x4, uint64_t x5);
> -void trace6(TraceEventID event, uint64_t x1, uint64_t x2, uint64_t x3, 
> uint64_t x4, uint64_t x5, uint64_t x6);
>  void st_print_trace_file_status(FILE *stream, fprintf_function 
> stream_printf);
>  void st_set_trace_file_enabled(bool enable);
>  bool st_set_trace_file(const char *file);
>  void st_flush_trace_buffer(void);
>
> +typedef struct {
> +    unsigned int tbuf_idx;
> +    unsigned int next_tbuf_idx;
> +    unsigned int rec_off;
> +} TraceBufferRecord;
> +
> +/* *

Any reason why you use "/* *", usually it's "/**".

> + * Initialize a trace record and claim space for it in the buffer
> + *
> + * @arglen  number of bytes required for arguments
> + */
> +int trace_record_start(TraceBufferRecord *rec, TraceEventID id, size_t 
> arglen);
> +
> +/* *
> + * Append a 64-bit argument to a trace record
> + */
> +void trace_record_write_u64(TraceBufferRecord *rec, uint64_t val);
> +
> +/* *
> + * Append a string argument to a trace record
> + */
> +void trace_record_write_str(TraceBufferRecord *rec, const char *s);
> +
> +/* *
> + * Mark a trace record completed
> + *
> + * Don't append any more arguments to the trace record after calling this.
> + */
> +void trace_record_finish(TraceBufferRecord *rec);
> +uint32_t safe_strlen(const char* str);

Given that safe_strlen() is only used once and a generic utility (not
specific to the simple trace backend), I suggest dropping it an just
open coding the tristate operator: s ? strlen(s) : 0.

Stefan
[Prev in Thread]
Current Thread
[Next in Thread]
Re: [Qemu-devel] [PATCH v2 2/3] Simpletrace v2: Add support for multiple args, strings., Stefan Hajnoczi <=
- Re: [Qemu-devel] [PATCH v2 2/3] Simpletrace v2: Add support for multiple args, strings., Harsh Bora, 2012/06/11
  - Re: [Qemu-devel] [PATCH v2 2/3] Simpletrace v2: Add support for multiple args, strings., Stefan Hajnoczi, 2012/06/11
    - Re: [Qemu-devel] [PATCH v2 2/3] Simpletrace v2: Add support for multiple args, strings., Harsh Bora, 2012/06/12
Prev by Date: Re: [Qemu-devel] [PATCH 05/13] pci: Add pci_device_route_intx_to_irq
Next by Date: Re: [Qemu-devel] [PATCH v3 0/4] Standard SD host controller model
Previous by thread: Re: [Qemu-devel] [PATCH v5 5/5] add L2x0/PL310 cache controller device
Next by thread: Re: [Qemu-devel] [PATCH v2 2/3] Simpletrace v2: Add support for multiple args, strings.
Index(es):
- Date
- Thread