avr-libc-dev
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [avr-libc-dev] Printing octal (brain dump)


From: Georg-Johann Lay
Subject: Re: [avr-libc-dev] Printing octal (brain dump)
Date: Mon, 19 Dec 2016 14:59:14 +0100
User-agent: Thunderbird 2.0.0.24 (Windows/20100228)

George Spelvin schrieb:
tl;dr: Beta code included below; it passes my initial testing.  I named
it _itoa_engine, following "ftoa_engine", but suggestions are welcome.
It is unfortunately one instruction longer than ultoa_invert:

   text    data     bss     dec     hex filename
    188       0       0     188      bc ultoa_invert.o
    190       0       0     190      be itoa_engine.o
    208       0       0     208      d0 itoa_engine.o (+OPTIMIZE_SPEED)
    212       0       0     212      d4 itoa_engine.o (without HAVE_MUL)

hmmm, that's quite some increase in code size...

IMO octal conversion is so remote from any practical purposes that we
should use an algorithm that works for all radices.

This can be done in less than 70 bytes (variable length / radix
algorithm that operates on memory, no need for MUL, needs strrev.

So all is boiling down to the question what execution time is
acceptable or not:

Is 8000 ticks too slow?

Is 3000 ticks acceptable? And for what reason? Are 3000 acceptable just
because we have an algorithm that performs in 3000 ticks?

My strong preference is still to have a one-fits-all algorithm that
might very well be slower than an optimal one.  But hey, an ordinary
division of a 64-bit value by 10 already costs 2300 cycles, so why
should we hunt cycles just for printf...?

Also I think it's a good idea to have conversion routines like
[u]lltoa, so we need generic conversions anyway.

The current __ultoa_invert is quite verbose and also adds a noticeable
amount of code.  It performs faster than ultoa from stdlib.h, but is
this essential?

I played around some more; attached is a version that operates on
memory, variable length and radices, and doesn't need MUL. It's
called mem_toa and needs < 70 bytes / 8000 ticks for decimal.
The signed version adds yet another 40 bytes:

#define __zero_reg__ r1
#define __tmp_reg__ r0

.macro DEFUN name
.global \name
.func \name
\name:
.endm

.macro ENDF name
.size \name, .-\name
.endfunc
.endm

#if defined (__AVR_HAVE_JMP_CALL__)
#define XCALL call
#define XJMP  jmp
#else
#define XCALL rcall
#define XJMP  rjmp
#endif

.macro  wmov  r_dest, r_src
#if defined (__AVR_HAVE_MOVW__)
    movw \r_dest,   \r_src
#else
    mov \r_dest,    \r_src
    mov \r_dest+1,  \r_src+1
#endif
.endm

;; End preamble

#define pNum    26
#define pBuf    30

#define nBytes  20
#define Radix   18

#define Num     19
#define Count   21
#define nBits   22
#define Rem     23

;; extern void mem_toa (char *pBuf, void *pNum, uint8_t nBytes, uint8_t Radix);
.section .text.mem_toa,"ax",@progbits

DEFUN mem_toa
    wmov    pBuf, 24
    wmov    pNum, 22

.LnextDigit:
    add     pNum, nBytes
    adc     pNum+1, __zero_reg__
    mov     Count, nBytes
    ;; nBytes < 0  <==>  nBytes is unknown
    ldi     nBytes, -1
    clr     Rem

.LnextByte:
    ld      Num, -X
    ldi     nBits, 8

.LnextBit:
    ;; Bit-wise construct (complement of) quotient into Num.
    ;; Bit-wise reconstruct Num into Rem.
    rol     Num
    rol     Rem

    ;; Reducing Rem mod Radix;  C = 0  <==>  reduction applied
    cp      Rem, Radix
    brcs    1f
    sub     Rem, Radix
1:
    dec     nBits
    brne    .LnextBit

    rol     Num
    com     Num
    st      X, Num

    breq    2f
    sbrc    nBytes, 7
    mov     nBytes, Count
2:
    dec     Count
    brne    .LnextByte

    subi    Rem, -'0'
    cpi     Rem, '9' + 1
    brlt    3f
    subi    Rem, '9' + 1 - 'a'
3:
    st      Z+, Rem

    sbrs    nBytes, 7
    rjmp    .LnextDigit

    st      Z, __zero_reg__

    ;; pBuf survived in R25:R24
    XJMP    strrev

ENDF mem_toa

#define Minus1 pBuf

;; extern void mem_toa_signed (char *pBuf, void *pNum, uint8_t nBytes, uint8_t Radix);
.section .text.mem_toa_signed,"ax",@progbits

DEFUN mem_toa_signed
    wmov    pNum, 22
    add     pNum, nBytes
    adc     pNum+1, __zero_reg__
    ld      Num, -X
    tst     Num
    brpl    9f

    ;; Negate pNum[] according to -N = ~N + 1 = ~N - (-1).
    ;; Carry is clear now because ADC above didn't overflow.
    wmov    pNum, 22
    mov     Count, nBytes
    ldi     Minus1, -1
1:
    ld      Num, X
    ;; Must use EOR for complement because COM clobbers carry.
    eor     Num, Minus1
    sbci    Num, -1
    st      X+, Num
    dec     Count
    brne    1b

    wmov    pBuf, 24
    ldi     Num, '-'
    st      Z, Num
    adiw    24, 1
9:
    XJMP    mem_toa

ENDF mem_toa_signed

#undef Minus1
#undef pNum
#undef pBuf

#undef nBytes
#undef Radix

#undef Rem
#undef Num
#undef Count
#undef nBits



For purely decimal there's a version consuming 90 bytes only,
requiring MUL and which is only 10% slower than yours:

(Using the same preamble as the code above)

.section .text.u64toa_nibbles,"ax",@progbits

#ifdef __AVR_HAVE_MUL__
#define SPEED 1
#else
#define SPEED 0
#endif

#define pBuf    26
#define pNum    30
#define Length  r20

#define Rem     r19
#define Quot    r21
#define Count   r22
#define Num     r23

#if SPEED
#   define Ten     r16
#   define Ox67    r18
#endif

;; extern void u64toa_nibbles (char *pBuf, void *pNum, uint8_t Length)
;; Length in [1, 127]

DEFUN u64toa_nibbles

    wmov    pBuf, 24
    wmov    pNum, 22

#if SPEED
    push    Ten
    ldi     Ten, 10
;; For / 10 by multiply we would usually use a value of 0xcd ~ 0x800/10,
    ;; however Rem below will be in [0, 0x9f] so that ~ 1/2 of that value
    ;; also works, and that saves us one "lsr r1" per division.
    ldi     Ox67, 0x67
    clr     Count
#endif

.LoopDigits:

    add     pNum, Length
#if SPEED
    ;; Count is 0.
    adc     pNum+1, Count
#else
    adc     pNum+1, __zero_reg__
#endif

    mov     Count, Length
    clt     ; Length unknown
    clr     Rem

.LoopBytes:

    ld      Num, -Z

#if SPEED

    ;; Rem:Num <<= 4
    ;; "andi Rem, 0xf" not needed because Rem < 10.
    eor     Rem, Num
    andi    Num, 0x0f
    eor     Rem, Num
    swap    Rem

    ;; Quot := Rem / 10
    mul     Rem, Ox67
    lsr     r1
    lsr     r1
    mov     Quot, r1

    ;; Rem := Rem - 10 * (Rem / 10) = Rem % 10
    mul     r1, Ten
    sub     Rem, r0

    ;; ...and (mostly) the same again for the low nibble.

    ;; Quot <<= 4
    swap    Quot

    ;; Rem:Num <<= 4
    swap    Rem
    eor     Rem, Num

    ;; Quot += Rem / 10
    mul     Rem, Ox67
    lsr     r1
    lsr     r1
    add     Quot, r1

    brts    0f
    ;; Quot = 0  ==>  R1 = 0, hence we can also skip the MUL + SUB below.
    breq    1f
;; Current Length not yet known: Set it according to highest non-zero byte.
    set
    mov     Length, Count
0:
    ;; Rem := Rem - 10 * (Rem / 10) = Rem % 10
    mul     r1, Ten
    sub     Rem, r0
1:
#else // !SPEED

    clr     Quot

.LoopNibbles:
    ;; Quot <<= 4
    swap    Quot

    ;; Rem:Num <<= 4
    swap    Num
    swap    Rem
    ;; "andi Rem, 0xf0" not needed because Rem was < 10.
    eor     Rem, Num
    andi    Num, 0xf0
    eor     Rem, Num

1:  subi    Quot, -4
    subi    Rem, 40
    brcc    1b
    subi    Rem, -40
    ;; "+1" avoids "dec Quot" after the next loop.
    subi    Quot, 4 + 1

2:  inc     Quot
    subi    Rem, 10
    brcc    2b
    subi    Rem, -10
    ;; dec     Quot

    subi    Count, 0x80
    brmi    .LoopNibbles

    brts    0f
    tst     Quot
    breq    0f
;; Current Length not yet known: Set it according to highest non-zero byte.
    set
    mov     Length, Count
0:
#endif // SPEED

    st      Z, Quot

    dec     Count
    brne    .LoopBytes

    subi    Rem, -'0'
    st      X+, Rem

    ;; Only continue if we determined Length (T=1).
    ;; T=0 means all bytes are zero, hence we are done then.
    brts    .LoopDigits

#if SPEED
    pop     Ten
    clr     __zero_reg__
#endif

    st      X, __zero_reg__

    ;; pBuf survived in R25:R24
    XJMP    strrev
ENDF u64toa_nibbles

#undef Count
#undef pNum
#undef pBuf
#undef Rem
#undef Num
#undef Quot
#undef Ten
#undef Ox67
#undef Length
#undef SPEED





    230       0       0     230      e6 itoa_engine.o (+OPTIMIZE_SPEED)

but I'm reasonably satisfied.


Well, it turned out that my great idea from last time had a nasty
bug in it.  There's no really good way to check for termination after
printing each digit.

Recall that my bit buffer only has room for 15 bits of the number, so I
arrange for it to always hold 8 <= n <= 15 bits.  When it's about to fall
from 8 to 7, I fetch another byte and decrement it from 16 to 15 instead.

The problem is that there are two ways to handle the length, and
both of them have problems.

One is to count so len=1 after fetching the last byte of input, and
len=0 after fetching the first byte of padding.

In that case, consider printing one byte of zero in hex.

I start with 8 bits in the buffer (msbyte = 0x01), and len = 1.  I print a
'0', and check if len == 0.  It's not, so I keep going and print a second
zero before realizing that I should stop.


Okay, that doesn't work, so what if I arrange so len counts the bytes
remaining?  len=0 after fetching the last byte, and it holds there
after fetching padding.

In that case, there are no extra digits, but there are missing
digits.  Consider printing 2^15 = 0x8000.  I start out with
len=1 and the last byte in the bit buffer.  I print the final 0, then
have to fetch another byte to start shifting bits into the lsbyte.

So the bit buffer holds 800 (msbyte = 0x18, lsbyte = 0x00, len = 0) when
I print the second 0.  At this point, len == 0 and lsbyte == 0, so
I should stop printing.  But there are more bits in the msbyte.

There are various tricks you can try in hex, but they break down in
octal.  Checking for extra bits in the msbyte makes for messy
(= large) code.


So I went back to my original idea of checing for termination when
fetching padding.  It's a couple of instructions longer, but it works.

Anyway, here's the current state of the function I intend to rewrite
vfprintf() around.  Any comments?  I know about a few mechanical things:
I need to integrate it into avr-libc and #include the appropriate headers
rather than #define things myself.  Likewise X_movw and so on.


[...]

        or      __tmp_reg__, __tmp_reg__

There's the shortcut "TST Reg, Reg" to test for sign and zero.  It's
actually encoded as "AND Reg, Reg" so for any practical purposes it has
the same effect (and purpose) as "OR Reg, Reg".

Johann
#define __zero_reg__ r1
#define __tmp_reg__ r0

.macro DEFUN name
.global \name
.func \name
\name:
.endm

.macro ENDF name
.size \name, .-\name
.endfunc
.endm

#if defined (__AVR_HAVE_JMP_CALL__)
#define XCALL call
#define XJMP  jmp
#else
#define XCALL rcall
#define XJMP  rjmp
#endif

.macro  wmov  r_dest, r_src
#if defined (__AVR_HAVE_MOVW__)
    movw \r_dest,   \r_src
#else
    mov \r_dest,    \r_src
    mov \r_dest+1,  \r_src+1
#endif
.endm

.text

.section .text.u64toa_nibbles,"ax",@progbits

#ifdef __AVR_HAVE_MUL__
#define SPEED 1
#else
#define SPEED 0
#endif

#define pBuf    26
#define pNum    30
#define Length  r20

#define Rem     r19
#define Quot    r21
#define Count   r22
#define Num     r23

#if SPEED
#   define Ten     r16
#   define Ox67    r18
#endif

;; extern void u64toa_nibbles (char *pBuf, void *pNum, uint8_t Length)
;; Length in [1, 127]

DEFUN u64toa_nibbles

    wmov    pBuf, 24
    wmov    pNum, 22

#if SPEED
    push    Ten
    ldi     Ten, 10
    ;; For / 10 by multiply we would usually use a value of 0xcd ~ 0x800/10,
    ;; however Rem below will be in [0, 0x9f] so that ~ 1/2 of that value
    ;; also works, and that saves us one "lsr r1" per division.
    ldi     Ox67, 0x67
    clr     Count
#endif

.LoopDigits:

    add     pNum, Length
#if SPEED
    ;; Count is 0.
    adc     pNum+1, Count
#else
    adc     pNum+1, __zero_reg__
#endif

    mov     Count, Length
    clt     ; Length unknown
    clr     Rem

.LoopBytes:

    ld      Num, -Z

#if SPEED

    ;; Rem:Num <<= 4
    ;; "andi Rem, 0xf" not needed because Rem < 10.
    eor     Rem, Num
    andi    Num, 0x0f
    eor     Rem, Num
    swap    Rem

    ;; Quot := Rem / 10
    mul     Rem, Ox67
    lsr     r1
    lsr     r1
    mov     Quot, r1

    ;; Rem := Rem - 10 * (Rem / 10) = Rem % 10
    mul     r1, Ten
    sub     Rem, r0

    ;; ...and (mostly) the same again for the low nibble.

    ;; Quot <<= 4
    swap    Quot

    ;; Rem:Num <<= 4
    swap    Rem
    eor     Rem, Num

    ;; Quot += Rem / 10
    mul     Rem, Ox67
    lsr     r1
    lsr     r1
    add     Quot, r1

    brts    0f
    ;; Quot = 0  ==>  R1 = 0, hence we can also skip the MUL + SUB below.
    breq    1f
    ;; Current Length not yet known: Set it according to highest non-zero byte.
    set
    mov     Length, Count
0:
    ;; Rem := Rem - 10 * (Rem / 10) = Rem % 10
    mul     r1, Ten
    sub     Rem, r0
1:
#else // !SPEED

    clr     Quot

.LoopNibbles:
    ;; Quot <<= 4
    swap    Quot

    ;; Rem:Num <<= 4
    swap    Num
    swap    Rem
    ;; "andi Rem, 0xf0" not needed because Rem was < 10.
    eor     Rem, Num
    andi    Num, 0xf0
    eor     Rem, Num

1:  subi    Quot, -4
    subi    Rem, 40
    brcc    1b
    subi    Rem, -40
    ;; "+1" avoids "dec Quot" after the next loop.
    subi    Quot, 4 + 1

2:  inc     Quot
    subi    Rem, 10
    brcc    2b
    subi    Rem, -10
    ;; dec     Quot

    subi    Count, 0x80
    brmi    .LoopNibbles

    brts    0f
    tst     Quot
    breq    0f
    ;; Current Length not yet known: Set it according to highest non-zero byte.
    set
    mov     Length, Count
0:
#endif // SPEED

    st      Z, Quot

    dec     Count
    brne    .LoopBytes

    subi    Rem, -'0'
    st      X+, Rem

    ;; Only continue if we determined Length (T=1).
    ;; T=0 means all bytes are zero, hence we are done then.
    brts    .LoopDigits

#if SPEED
    pop     Ten
    clr     __zero_reg__
#endif

    st      X, __zero_reg__

    ;; pBuf survived in R25:R24
    XJMP    strrev
ENDF u64toa_nibbles

#undef Count
#undef pNum
#undef pBuf
#undef Rem
#undef Num
#undef Quot
#undef Ten
#undef Ox67
#undef Length
#undef SPEED

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


#define pNum    26
#define pBuf    30

#define nBytes  20
#define Radix   18

#define Num     19
#define Count   21
#define nBits   22
#define Rem     23

;; extern void mem_toa (char *pBuf, void *pNum, uint8_t nBytes, uint8_t Radix);
.section .text.mem_toa,"ax",@progbits

DEFUN mem_toa
    wmov    pBuf, 24
    wmov    pNum, 22

.LnextDigit:
    add     pNum, nBytes
    adc     pNum+1, __zero_reg__
    mov     Count, nBytes
    ;; nBytes < 0  <==>  nBytes is unknown
    ldi     nBytes, -1
    clr     Rem

.LnextByte:
    ld      Num, -X
    ldi     nBits, 8

.LnextBit:
    ;; Bit-wise construct (complement of) quotient into Num.
    ;; Bit-wise reconstruct Num into Rem.
    rol     Num
    rol     Rem

    ;; Reducing Rem mod Radix;  C = 0  <==>  reduction applied
    cp      Rem, Radix
    brcs    1f
    sub     Rem, Radix
1:
    dec     nBits
    brne    .LnextBit

    rol     Num
    com     Num
    st      X, Num

    breq    2f
    sbrc    nBytes, 7
    mov     nBytes, Count
2:
    dec     Count
    brne    .LnextByte

    subi    Rem, -'0'
    cpi     Rem, '9' + 1
    brlt    3f
    subi    Rem, '9' + 1 - 'a'
3:
    st      Z+, Rem

    sbrs    nBytes, 7
    rjmp    .LnextDigit

    st      Z, __zero_reg__

    ;; pBuf survived in R25:R24
    XJMP    strrev

ENDF mem_toa

#define Minus1 pBuf

;; extern void mem_toa_signed (char *pBuf, void *pNum, uint8_t nBytes, uint8_t 
Radix);
.section .text.mem_toa_signed,"ax",@progbits

DEFUN mem_toa_signed
    wmov    pNum, 22
    add     pNum, nBytes
    adc     pNum+1, __zero_reg__
    ld      Num, -X
    tst     Num
    brpl    9f

    ;; Negate pNum[] according to -N = ~N + 1 = ~N - (-1).
    ;; Carry is clear now because ADC above didn't overflow.
    wmov    pNum, 22
    mov     Count, nBytes
    ldi     Minus1, -1
1:
    ld      Num, X
    ;; Must use EOR for complement because COM clobbers carry.
    eor     Num, Minus1
    sbci    Num, -1
    st      X+, Num
    dec     Count
    brne    1b

    wmov    pBuf, 24
    ldi     Num, '-'
    st      Z, Num
    adiw    24, 1
9:
    XJMP    mem_toa

ENDF mem_toa_signed

#undef Minus1
#undef pNum
#undef pBuf

#undef nBytes
#undef Radix

#undef Rem
#undef Num
#undef Count
#undef nBits

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

.section .text.put64,"ax",@progbits


#define A0      18
#define A1      A0 + 1
#define A2      A0 + 2
#define A3      A0 + 3
#define A4      A0 + 4
#define A5      A0 + 5
#define A6      A0 + 6
#define A7      A0 + 7

#define BUF0    26
#define BUF1    BUF0 + 1

#define Digit   31

#define Carry   31
#define Ten     30
#define Count   29
#define nonZero 28

;;; extern void put64 (uint64_t A, char *buf);
;;; This function writes up to 21 characters (including final '\0') to buf.

DEFUN put64
    push    r28
    push    r29

    wmov    BUF0, 16
    ldi     Count, 19
    ldi     nonZero, '0'

.Loop:
    ;; A[] -= 1.000.000.000.000.000.000  as often as we can.
    ;; Digit will hold the net number of subtractions (plus '0').
    ldi     Digit, '0'-1
1:
    inc     Digit
    ;;  1.000.000.000.000.000.000 = 0DE0 B6B3 A764 0000
    subi    A2, 0x64
    sbci    A3, 0xa7
    sbci    A4, 0xb3
    sbci    A5, 0xb6
    sbci    A6, 0xe0
    sbci    A7, 0xd
    brcc    1b

    ;; -1.000.000.000.000.000.000 = F21F 494C 589C 0000
    ;; Undo the underflow from above
    subi    A2, 0x9c
    sbci    A3, 0x58
    sbci    A4, 0x4c
    sbci    A5, 0x49
    sbci    A6, 0x1f
    sbci    A7, 0xf2

    ;; As ffff ffff ffff ffff = 18.***.***.***.***.***.***  we can get
    ;; Digit up to 18+'0' in the first round.
    cpi     Digit, '0'+10
    brlo    2f

    ;; If that is the case, split it into 2 digits: a '1' and the
    ;; low part in Digit.
    ldi     nonZero, '1'
    st      X+, nonZero
    subi    Digit, 10
2:
    ;; nonZero "accumulates" digits: it holds a value != '0' if we ever
    ;; output a digit not equal to '0'.
    or      nonZero, Digit
    ;; We only output digits if we saw a digit != '0', i.e. strip leading '0's.
    cpi     nonZero, '0'
    breq    3f
    st      X+, Digit
3:
    ;; A[] *= 10
    ldi     Ten, 10
    clr     Carry
.Lrot:
    mul     A0, Ten
    add     r0, Carry
    mov     Carry, r1
    clr     __zero_reg__
    adc     Carry, __zero_reg__
    ;; A[] >>>= 8
    ;; Rotate 8 times so that we can loop the *= 10 over A[].
    ;; The current value of A0 resides in r0, hence no "mov r0,A0" needed
    ;; to start the rotation.
    mov A0,A1  $  mov A1,A2  $  mov A2,A3  $  mov A3,A4
    mov A4,A5  $  mov A5,A6  $  mov A6,A7  $  mov A7,r0

    ;; Use the upper 3 bits of Count as loop counter for 8 loops.
    subi    Count, -32
    brcs    .Lrot

    dec     Count
    brne    .Loop

    ;; If all digits were '0', we had A[] = 0:  Output one '0'.
    cpi     nonZero, '0'
    brne    4f
    st      X+, nonZero
4:
    ;; String end and epilogue.
    st      X+, __zero_reg__
    pop     r29
    pop     r28
    ret
ENDF put64

.section .text.put64s,"ax",@progbits

;;; extern void put64s (int64_t A, char *buf);
;;; Writes up to 21 characters (including '-' and final '\0') to buf.

DEFUN put64s
#ifdef __AVR_ERRATA_SKIP_JMP_CALL__
    tst     A7
    brmi    0f
#else
    sbrs    A7, 7
#endif
    XJMP    put64
0:
    push    r16
    push    r17
    XCALL   __negdi2            ; Assumes avr-gcc 4.7+
    wmov    BUF0, 16
    ldi     Digit, '-'
    st      X+, Digit
    wmov    16, BUF0
    XCALL   put64
    pop     r17
    pop     r16
    ret
ENDF put64s


#undef A0
#undef A1
#undef A2
#undef A3
#undef A4
#undef A5
#undef A6
#undef A7

#undef Digit
#undef BUF0
#undef Ten
#undef Count
#undef Carry
#undef nonZero

reply via email to

[Prev in Thread] Current Thread [Next in Thread]