[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [avr-libc-dev] Printing octal (brain dump)
From: |
Georg-Johann Lay |
Subject: |
Re: [avr-libc-dev] Printing octal (brain dump) |
Date: |
Mon, 19 Dec 2016 14:59:14 +0100 |
User-agent: |
Thunderbird 2.0.0.24 (Windows/20100228) |
George Spelvin schrieb:
tl;dr: Beta code included below; it passes my initial testing. I named
it _itoa_engine, following "ftoa_engine", but suggestions are welcome.
It is unfortunately one instruction longer than ultoa_invert:
text data bss dec hex filename
188 0 0 188 bc ultoa_invert.o
190 0 0 190 be itoa_engine.o
208 0 0 208 d0 itoa_engine.o (+OPTIMIZE_SPEED)
212 0 0 212 d4 itoa_engine.o (without HAVE_MUL)
hmmm, that's quite some increase in code size...
IMO octal conversion is so remote from any practical purposes that we
should use an algorithm that works for all radices.
This can be done in less than 70 bytes (variable length / radix
algorithm that operates on memory, no need for MUL, needs strrev.
So all is boiling down to the question what execution time is
acceptable or not:
Is 8000 ticks too slow?
Is 3000 ticks acceptable? And for what reason? Are 3000 acceptable just
because we have an algorithm that performs in 3000 ticks?
My strong preference is still to have a one-fits-all algorithm that
might very well be slower than an optimal one. But hey, an ordinary
division of a 64-bit value by 10 already costs 2300 cycles, so why
should we hunt cycles just for printf...?
Also I think it's a good idea to have conversion routines like
[u]lltoa, so we need generic conversions anyway.
The current __ultoa_invert is quite verbose and also adds a noticeable
amount of code. It performs faster than ultoa from stdlib.h, but is
this essential?
I played around some more; attached is a version that operates on
memory, variable length and radices, and doesn't need MUL. It's
called mem_toa and needs < 70 bytes / 8000 ticks for decimal.
The signed version adds yet another 40 bytes:
#define __zero_reg__ r1
#define __tmp_reg__ r0
.macro DEFUN name
.global \name
.func \name
\name:
.endm
.macro ENDF name
.size \name, .-\name
.endfunc
.endm
#if defined (__AVR_HAVE_JMP_CALL__)
#define XCALL call
#define XJMP jmp
#else
#define XCALL rcall
#define XJMP rjmp
#endif
.macro wmov r_dest, r_src
#if defined (__AVR_HAVE_MOVW__)
movw \r_dest, \r_src
#else
mov \r_dest, \r_src
mov \r_dest+1, \r_src+1
#endif
.endm
;; End preamble
#define pNum 26
#define pBuf 30
#define nBytes 20
#define Radix 18
#define Num 19
#define Count 21
#define nBits 22
#define Rem 23
;; extern void mem_toa (char *pBuf, void *pNum, uint8_t nBytes, uint8_t
Radix);
.section .text.mem_toa,"ax",@progbits
DEFUN mem_toa
wmov pBuf, 24
wmov pNum, 22
.LnextDigit:
add pNum, nBytes
adc pNum+1, __zero_reg__
mov Count, nBytes
;; nBytes < 0 <==> nBytes is unknown
ldi nBytes, -1
clr Rem
.LnextByte:
ld Num, -X
ldi nBits, 8
.LnextBit:
;; Bit-wise construct (complement of) quotient into Num.
;; Bit-wise reconstruct Num into Rem.
rol Num
rol Rem
;; Reducing Rem mod Radix; C = 0 <==> reduction applied
cp Rem, Radix
brcs 1f
sub Rem, Radix
1:
dec nBits
brne .LnextBit
rol Num
com Num
st X, Num
breq 2f
sbrc nBytes, 7
mov nBytes, Count
2:
dec Count
brne .LnextByte
subi Rem, -'0'
cpi Rem, '9' + 1
brlt 3f
subi Rem, '9' + 1 - 'a'
3:
st Z+, Rem
sbrs nBytes, 7
rjmp .LnextDigit
st Z, __zero_reg__
;; pBuf survived in R25:R24
XJMP strrev
ENDF mem_toa
#define Minus1 pBuf
;; extern void mem_toa_signed (char *pBuf, void *pNum, uint8_t nBytes,
uint8_t Radix);
.section .text.mem_toa_signed,"ax",@progbits
DEFUN mem_toa_signed
wmov pNum, 22
add pNum, nBytes
adc pNum+1, __zero_reg__
ld Num, -X
tst Num
brpl 9f
;; Negate pNum[] according to -N = ~N + 1 = ~N - (-1).
;; Carry is clear now because ADC above didn't overflow.
wmov pNum, 22
mov Count, nBytes
ldi Minus1, -1
1:
ld Num, X
;; Must use EOR for complement because COM clobbers carry.
eor Num, Minus1
sbci Num, -1
st X+, Num
dec Count
brne 1b
wmov pBuf, 24
ldi Num, '-'
st Z, Num
adiw 24, 1
9:
XJMP mem_toa
ENDF mem_toa_signed
#undef Minus1
#undef pNum
#undef pBuf
#undef nBytes
#undef Radix
#undef Rem
#undef Num
#undef Count
#undef nBits
For purely decimal there's a version consuming 90 bytes only,
requiring MUL and which is only 10% slower than yours:
(Using the same preamble as the code above)
.section .text.u64toa_nibbles,"ax",@progbits
#ifdef __AVR_HAVE_MUL__
#define SPEED 1
#else
#define SPEED 0
#endif
#define pBuf 26
#define pNum 30
#define Length r20
#define Rem r19
#define Quot r21
#define Count r22
#define Num r23
#if SPEED
# define Ten r16
# define Ox67 r18
#endif
;; extern void u64toa_nibbles (char *pBuf, void *pNum, uint8_t Length)
;; Length in [1, 127]
DEFUN u64toa_nibbles
wmov pBuf, 24
wmov pNum, 22
#if SPEED
push Ten
ldi Ten, 10
;; For / 10 by multiply we would usually use a value of 0xcd ~
0x800/10,
;; however Rem below will be in [0, 0x9f] so that ~ 1/2 of that value
;; also works, and that saves us one "lsr r1" per division.
ldi Ox67, 0x67
clr Count
#endif
.LoopDigits:
add pNum, Length
#if SPEED
;; Count is 0.
adc pNum+1, Count
#else
adc pNum+1, __zero_reg__
#endif
mov Count, Length
clt ; Length unknown
clr Rem
.LoopBytes:
ld Num, -Z
#if SPEED
;; Rem:Num <<= 4
;; "andi Rem, 0xf" not needed because Rem < 10.
eor Rem, Num
andi Num, 0x0f
eor Rem, Num
swap Rem
;; Quot := Rem / 10
mul Rem, Ox67
lsr r1
lsr r1
mov Quot, r1
;; Rem := Rem - 10 * (Rem / 10) = Rem % 10
mul r1, Ten
sub Rem, r0
;; ...and (mostly) the same again for the low nibble.
;; Quot <<= 4
swap Quot
;; Rem:Num <<= 4
swap Rem
eor Rem, Num
;; Quot += Rem / 10
mul Rem, Ox67
lsr r1
lsr r1
add Quot, r1
brts 0f
;; Quot = 0 ==> R1 = 0, hence we can also skip the MUL + SUB below.
breq 1f
;; Current Length not yet known: Set it according to highest
non-zero byte.
set
mov Length, Count
0:
;; Rem := Rem - 10 * (Rem / 10) = Rem % 10
mul r1, Ten
sub Rem, r0
1:
#else // !SPEED
clr Quot
.LoopNibbles:
;; Quot <<= 4
swap Quot
;; Rem:Num <<= 4
swap Num
swap Rem
;; "andi Rem, 0xf0" not needed because Rem was < 10.
eor Rem, Num
andi Num, 0xf0
eor Rem, Num
1: subi Quot, -4
subi Rem, 40
brcc 1b
subi Rem, -40
;; "+1" avoids "dec Quot" after the next loop.
subi Quot, 4 + 1
2: inc Quot
subi Rem, 10
brcc 2b
subi Rem, -10
;; dec Quot
subi Count, 0x80
brmi .LoopNibbles
brts 0f
tst Quot
breq 0f
;; Current Length not yet known: Set it according to highest
non-zero byte.
set
mov Length, Count
0:
#endif // SPEED
st Z, Quot
dec Count
brne .LoopBytes
subi Rem, -'0'
st X+, Rem
;; Only continue if we determined Length (T=1).
;; T=0 means all bytes are zero, hence we are done then.
brts .LoopDigits
#if SPEED
pop Ten
clr __zero_reg__
#endif
st X, __zero_reg__
;; pBuf survived in R25:R24
XJMP strrev
ENDF u64toa_nibbles
#undef Count
#undef pNum
#undef pBuf
#undef Rem
#undef Num
#undef Quot
#undef Ten
#undef Ox67
#undef Length
#undef SPEED
230 0 0 230 e6 itoa_engine.o (+OPTIMIZE_SPEED)
but I'm reasonably satisfied.
Well, it turned out that my great idea from last time had a nasty
bug in it. There's no really good way to check for termination after
printing each digit.
Recall that my bit buffer only has room for 15 bits of the number, so I
arrange for it to always hold 8 <= n <= 15 bits. When it's about to fall
from 8 to 7, I fetch another byte and decrement it from 16 to 15 instead.
The problem is that there are two ways to handle the length, and
both of them have problems.
One is to count so len=1 after fetching the last byte of input, and
len=0 after fetching the first byte of padding.
In that case, consider printing one byte of zero in hex.
I start with 8 bits in the buffer (msbyte = 0x01), and len = 1. I print a
'0', and check if len == 0. It's not, so I keep going and print a second
zero before realizing that I should stop.
Okay, that doesn't work, so what if I arrange so len counts the bytes
remaining? len=0 after fetching the last byte, and it holds there
after fetching padding.
In that case, there are no extra digits, but there are missing
digits. Consider printing 2^15 = 0x8000. I start out with
len=1 and the last byte in the bit buffer. I print the final 0, then
have to fetch another byte to start shifting bits into the lsbyte.
So the bit buffer holds 800 (msbyte = 0x18, lsbyte = 0x00, len = 0) when
I print the second 0. At this point, len == 0 and lsbyte == 0, so
I should stop printing. But there are more bits in the msbyte.
There are various tricks you can try in hex, but they break down in
octal. Checking for extra bits in the msbyte makes for messy
(= large) code.
So I went back to my original idea of checing for termination when
fetching padding. It's a couple of instructions longer, but it works.
Anyway, here's the current state of the function I intend to rewrite
vfprintf() around. Any comments? I know about a few mechanical things:
I need to integrate it into avr-libc and #include the appropriate headers
rather than #define things myself. Likewise X_movw and so on.
[...]
or __tmp_reg__, __tmp_reg__
There's the shortcut "TST Reg, Reg" to test for sign and zero. It's
actually encoded as "AND Reg, Reg" so for any practical purposes it has
the same effect (and purpose) as "OR Reg, Reg".
Johann
#define __zero_reg__ r1
#define __tmp_reg__ r0
.macro DEFUN name
.global \name
.func \name
\name:
.endm
.macro ENDF name
.size \name, .-\name
.endfunc
.endm
#if defined (__AVR_HAVE_JMP_CALL__)
#define XCALL call
#define XJMP jmp
#else
#define XCALL rcall
#define XJMP rjmp
#endif
.macro wmov r_dest, r_src
#if defined (__AVR_HAVE_MOVW__)
movw \r_dest, \r_src
#else
mov \r_dest, \r_src
mov \r_dest+1, \r_src+1
#endif
.endm
.text
.section .text.u64toa_nibbles,"ax",@progbits
#ifdef __AVR_HAVE_MUL__
#define SPEED 1
#else
#define SPEED 0
#endif
#define pBuf 26
#define pNum 30
#define Length r20
#define Rem r19
#define Quot r21
#define Count r22
#define Num r23
#if SPEED
# define Ten r16
# define Ox67 r18
#endif
;; extern void u64toa_nibbles (char *pBuf, void *pNum, uint8_t Length)
;; Length in [1, 127]
DEFUN u64toa_nibbles
wmov pBuf, 24
wmov pNum, 22
#if SPEED
push Ten
ldi Ten, 10
;; For / 10 by multiply we would usually use a value of 0xcd ~ 0x800/10,
;; however Rem below will be in [0, 0x9f] so that ~ 1/2 of that value
;; also works, and that saves us one "lsr r1" per division.
ldi Ox67, 0x67
clr Count
#endif
.LoopDigits:
add pNum, Length
#if SPEED
;; Count is 0.
adc pNum+1, Count
#else
adc pNum+1, __zero_reg__
#endif
mov Count, Length
clt ; Length unknown
clr Rem
.LoopBytes:
ld Num, -Z
#if SPEED
;; Rem:Num <<= 4
;; "andi Rem, 0xf" not needed because Rem < 10.
eor Rem, Num
andi Num, 0x0f
eor Rem, Num
swap Rem
;; Quot := Rem / 10
mul Rem, Ox67
lsr r1
lsr r1
mov Quot, r1
;; Rem := Rem - 10 * (Rem / 10) = Rem % 10
mul r1, Ten
sub Rem, r0
;; ...and (mostly) the same again for the low nibble.
;; Quot <<= 4
swap Quot
;; Rem:Num <<= 4
swap Rem
eor Rem, Num
;; Quot += Rem / 10
mul Rem, Ox67
lsr r1
lsr r1
add Quot, r1
brts 0f
;; Quot = 0 ==> R1 = 0, hence we can also skip the MUL + SUB below.
breq 1f
;; Current Length not yet known: Set it according to highest non-zero byte.
set
mov Length, Count
0:
;; Rem := Rem - 10 * (Rem / 10) = Rem % 10
mul r1, Ten
sub Rem, r0
1:
#else // !SPEED
clr Quot
.LoopNibbles:
;; Quot <<= 4
swap Quot
;; Rem:Num <<= 4
swap Num
swap Rem
;; "andi Rem, 0xf0" not needed because Rem was < 10.
eor Rem, Num
andi Num, 0xf0
eor Rem, Num
1: subi Quot, -4
subi Rem, 40
brcc 1b
subi Rem, -40
;; "+1" avoids "dec Quot" after the next loop.
subi Quot, 4 + 1
2: inc Quot
subi Rem, 10
brcc 2b
subi Rem, -10
;; dec Quot
subi Count, 0x80
brmi .LoopNibbles
brts 0f
tst Quot
breq 0f
;; Current Length not yet known: Set it according to highest non-zero byte.
set
mov Length, Count
0:
#endif // SPEED
st Z, Quot
dec Count
brne .LoopBytes
subi Rem, -'0'
st X+, Rem
;; Only continue if we determined Length (T=1).
;; T=0 means all bytes are zero, hence we are done then.
brts .LoopDigits
#if SPEED
pop Ten
clr __zero_reg__
#endif
st X, __zero_reg__
;; pBuf survived in R25:R24
XJMP strrev
ENDF u64toa_nibbles
#undef Count
#undef pNum
#undef pBuf
#undef Rem
#undef Num
#undef Quot
#undef Ten
#undef Ox67
#undef Length
#undef SPEED
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#define pNum 26
#define pBuf 30
#define nBytes 20
#define Radix 18
#define Num 19
#define Count 21
#define nBits 22
#define Rem 23
;; extern void mem_toa (char *pBuf, void *pNum, uint8_t nBytes, uint8_t Radix);
.section .text.mem_toa,"ax",@progbits
DEFUN mem_toa
wmov pBuf, 24
wmov pNum, 22
.LnextDigit:
add pNum, nBytes
adc pNum+1, __zero_reg__
mov Count, nBytes
;; nBytes < 0 <==> nBytes is unknown
ldi nBytes, -1
clr Rem
.LnextByte:
ld Num, -X
ldi nBits, 8
.LnextBit:
;; Bit-wise construct (complement of) quotient into Num.
;; Bit-wise reconstruct Num into Rem.
rol Num
rol Rem
;; Reducing Rem mod Radix; C = 0 <==> reduction applied
cp Rem, Radix
brcs 1f
sub Rem, Radix
1:
dec nBits
brne .LnextBit
rol Num
com Num
st X, Num
breq 2f
sbrc nBytes, 7
mov nBytes, Count
2:
dec Count
brne .LnextByte
subi Rem, -'0'
cpi Rem, '9' + 1
brlt 3f
subi Rem, '9' + 1 - 'a'
3:
st Z+, Rem
sbrs nBytes, 7
rjmp .LnextDigit
st Z, __zero_reg__
;; pBuf survived in R25:R24
XJMP strrev
ENDF mem_toa
#define Minus1 pBuf
;; extern void mem_toa_signed (char *pBuf, void *pNum, uint8_t nBytes, uint8_t
Radix);
.section .text.mem_toa_signed,"ax",@progbits
DEFUN mem_toa_signed
wmov pNum, 22
add pNum, nBytes
adc pNum+1, __zero_reg__
ld Num, -X
tst Num
brpl 9f
;; Negate pNum[] according to -N = ~N + 1 = ~N - (-1).
;; Carry is clear now because ADC above didn't overflow.
wmov pNum, 22
mov Count, nBytes
ldi Minus1, -1
1:
ld Num, X
;; Must use EOR for complement because COM clobbers carry.
eor Num, Minus1
sbci Num, -1
st X+, Num
dec Count
brne 1b
wmov pBuf, 24
ldi Num, '-'
st Z, Num
adiw 24, 1
9:
XJMP mem_toa
ENDF mem_toa_signed
#undef Minus1
#undef pNum
#undef pBuf
#undef nBytes
#undef Radix
#undef Rem
#undef Num
#undef Count
#undef nBits
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.section .text.put64,"ax",@progbits
#define A0 18
#define A1 A0 + 1
#define A2 A0 + 2
#define A3 A0 + 3
#define A4 A0 + 4
#define A5 A0 + 5
#define A6 A0 + 6
#define A7 A0 + 7
#define BUF0 26
#define BUF1 BUF0 + 1
#define Digit 31
#define Carry 31
#define Ten 30
#define Count 29
#define nonZero 28
;;; extern void put64 (uint64_t A, char *buf);
;;; This function writes up to 21 characters (including final '\0') to buf.
DEFUN put64
push r28
push r29
wmov BUF0, 16
ldi Count, 19
ldi nonZero, '0'
.Loop:
;; A[] -= 1.000.000.000.000.000.000 as often as we can.
;; Digit will hold the net number of subtractions (plus '0').
ldi Digit, '0'-1
1:
inc Digit
;; 1.000.000.000.000.000.000 = 0DE0 B6B3 A764 0000
subi A2, 0x64
sbci A3, 0xa7
sbci A4, 0xb3
sbci A5, 0xb6
sbci A6, 0xe0
sbci A7, 0xd
brcc 1b
;; -1.000.000.000.000.000.000 = F21F 494C 589C 0000
;; Undo the underflow from above
subi A2, 0x9c
sbci A3, 0x58
sbci A4, 0x4c
sbci A5, 0x49
sbci A6, 0x1f
sbci A7, 0xf2
;; As ffff ffff ffff ffff = 18.***.***.***.***.***.*** we can get
;; Digit up to 18+'0' in the first round.
cpi Digit, '0'+10
brlo 2f
;; If that is the case, split it into 2 digits: a '1' and the
;; low part in Digit.
ldi nonZero, '1'
st X+, nonZero
subi Digit, 10
2:
;; nonZero "accumulates" digits: it holds a value != '0' if we ever
;; output a digit not equal to '0'.
or nonZero, Digit
;; We only output digits if we saw a digit != '0', i.e. strip leading '0's.
cpi nonZero, '0'
breq 3f
st X+, Digit
3:
;; A[] *= 10
ldi Ten, 10
clr Carry
.Lrot:
mul A0, Ten
add r0, Carry
mov Carry, r1
clr __zero_reg__
adc Carry, __zero_reg__
;; A[] >>>= 8
;; Rotate 8 times so that we can loop the *= 10 over A[].
;; The current value of A0 resides in r0, hence no "mov r0,A0" needed
;; to start the rotation.
mov A0,A1 $ mov A1,A2 $ mov A2,A3 $ mov A3,A4
mov A4,A5 $ mov A5,A6 $ mov A6,A7 $ mov A7,r0
;; Use the upper 3 bits of Count as loop counter for 8 loops.
subi Count, -32
brcs .Lrot
dec Count
brne .Loop
;; If all digits were '0', we had A[] = 0: Output one '0'.
cpi nonZero, '0'
brne 4f
st X+, nonZero
4:
;; String end and epilogue.
st X+, __zero_reg__
pop r29
pop r28
ret
ENDF put64
.section .text.put64s,"ax",@progbits
;;; extern void put64s (int64_t A, char *buf);
;;; Writes up to 21 characters (including '-' and final '\0') to buf.
DEFUN put64s
#ifdef __AVR_ERRATA_SKIP_JMP_CALL__
tst A7
brmi 0f
#else
sbrs A7, 7
#endif
XJMP put64
0:
push r16
push r17
XCALL __negdi2 ; Assumes avr-gcc 4.7+
wmov BUF0, 16
ldi Digit, '-'
st X+, Digit
wmov 16, BUF0
XCALL put64
pop r17
pop r16
ret
ENDF put64s
#undef A0
#undef A1
#undef A2
#undef A3
#undef A4
#undef A5
#undef A6
#undef A7
#undef Digit
#undef BUF0
#undef Ten
#undef Count
#undef Carry
#undef nonZero
- [avr-libc-dev] Printing octal (brain dump), George Spelvin, 2016/12/14
- [avr-libc-dev] Working octal code (FYI), George Spelvin, 2016/12/16
- Re: [avr-libc-dev] Printing octal (brain dump), George Spelvin, 2016/12/19
- Re: [avr-libc-dev] Printing octal (brain dump),
Georg-Johann Lay <=
- Re: [avr-libc-dev] Printing octal (brain dump), Georg-Johann Lay, 2016/12/21
- Re: [avr-libc-dev] Printing octal (brain dump), George Spelvin, 2016/12/21
- [avr-libc-dev] Even faster decimal code, George Spelvin, 2016/12/23
- Re: [avr-libc-dev] Even faster decimal code, Georg-Johann Lay, 2016/12/24
- Re: [avr-libc-dev] Even faster decimal code, George Spelvin, 2016/12/24
- Re: [avr-libc-dev] Even faster decimal code, Martin McKee, 2016/12/28