8000 zstd: Remove offset from bitReader (#854) · klauspost/compress@0836a1c · GitHub
[go: up one dir, main page]

Skip to content

Commit 0836a1c

Browse files
authored
zstd: Remove offset from bitReader (#854)
We can reslice instead of maintaining a separate offset. This gets rid of some bounds checks. Also some other micro-optimizations to bit reading code. Combined results: │ zstd/old │ zstd/new │ │ B/s │ B/s vs base │ Decoder_DecoderSmall/kppkn.gtb.zst/buffered-8 427.6Mi ± 0% 428.2Mi ± 0% +0.13% (p=0.019 n=10) Decoder_DecoderSmall/kppkn.gtb.zst/unbuffered-8 511.6Mi ± 3% 516.9Mi ± 3% ~ (p=0.280 n=10) Decoder_DecoderSmall/geo.protodata.zst/buffered-8 1.110Gi ± 0% 1.110Gi ± 0% ~ (p=0.165 n=10) Decoder_DecoderSmall/geo.protodata.zst/unbuffered-8 824.7Mi ± 2% 827.3Mi ± 2% ~ (p=0.481 n=10) Decoder_DecoderSmall/plrabn12.txt.zst/buffered-8 330.4Mi ± 0% 330.3Mi ± 1% ~ (p=0.645 n=10) Decoder_DecoderSmall/plrabn12.txt.zst/unbuffered-8 533.3Mi ± 4% 538.8Mi ± 5% ~ (p=0.393 n=10) Decoder_DecoderSmall/lcet10.txt.zst/buffered-8 395.0Mi ± 0% 394.6Mi ± 0% -0.10% (p=0.034 n=10) Decoder_DecoderSmall/lcet10.txt.zst/unbuffered-8 556.5Mi ± 6% 546.2Mi ± 8% ~ (p=0.436 n=10) Decoder_DecoderSmall/asyoulik.txt.zst/buffered-8 342.2Mi ± 0% 342.2Mi ± 0% ~ (p=0.956 n=10) Decoder_DecoderSmall/asyoulik.txt.zst/unbuffered-8 436.7Mi ± 2% 435.4Mi ± 3% ~ (p=0.739 n=10) Decoder_DecoderSmall/alice29.txt.zst/buffered-8 335.6Mi ± 2% 337.0Mi ± 0% +0.43% (p=0.000 n=10) Decoder_DecoderSmall/alice29.txt.zst/unbuffered-8 552.6Mi ± 3% 550.7Mi ± 4% ~ (p=1.000 n=10) Decoder_DecoderSmall/html_x_4.zst/buffered-8 2.264Gi ± 0% 2.271Gi ± 0% +0.29% (p=0.035 n=10) Decoder_DecoderSmall/html_x_4.zst/unbuffered-8 1.558Gi ± 4% 1.554Gi ± 3% ~ (p=0.579 n=10) Decoder_DecoderSmall/paper-100k.pdf.zst/buffered-8 3.554Gi ± 5% 3.610Gi ± 0% +1.59% (p=0.000 n=10) Decoder_DecoderSmall/paper-100k.pdf.zst/unbuffered-8 1.701Gi ± 8% 1.709Gi ± 5% ~ (p=0.631 n=10) Decoder_DecoderSmall/fireworks.jpeg.zst/buffered-8 7.891Gi ± 4% 8.070Gi ± 0% +2.26% (p=0.000 n=10) Decoder_DecoderSmall/fireworks.jpeg.zst/unbuffered-8 3.062Gi ± 4% 3.129Gi ± 2% +2.16% (p=0.002 n=10) Decoder_DecoderSmall/urls.10K.zst/buffered-8 525.4Mi ± 6% 553.8Mi ± 0% +5.39% (p=0.000 n=10) Decoder_DecoderSmall/urls.10K.zst/unbuffered-8 763.7Mi ± 6% 819.7Mi ± 2% +7.34% (p=0.000 n=10) Decoder_DecoderSmall/html.zst/buffered-8 894.8Mi ± 0% 898.8Mi ± 2% +0.45% (p=0.043 n=10) Decoder_DecoderSmall/html.zst/unbuffered-8 722.3Mi ± 2% 717.7Mi ± 2% ~ (p=0.912 n=10) Decoder_DecoderSmall/comp-data.bin.zst/buffered-8 386.6Mi ± 2% 390.4Mi ± 0% +1.00% (p=0.000 n=10) Decoder_DecoderSmall/comp-data.bin.zst/unbuffered-8 145.2Mi ± 2% 148.7Mi ± 1% +2.42% (p=0.003 n=10) geomean 770.3Mi 777.5Mi +0.93%
1 parent 11b5108 commit 0836a1c

File tree

5 files changed

+88
-97
lines changed

5 files changed

+88
-97
lines changed

zstd/_generate/gen.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,8 +156,8 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute
156156
brPointer := GP64()
157157
Load(br.Field("value"), brValue)
158158
Load(br.Field("bitsRead"), brBitsRead)
159-
Load(br.Field("off"), brOffset)
160159
Load(br.Field("in").Base(), brPointer)
160+
Load(br.Field("in").Len(), brOffset)
161161
ADDQ(brOffset, brPointer) // Add current offset to read pointer.
162162
MOVQ(brPointer, brPointerStash)
163163
}
@@ -438,7 +438,7 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute
438438
br := Dereference(Param("br"))
439439
Store(brValue, br.Field("value"))
440440
Store(brBitsRead.As8(), br.Field("bitsRead"))
441-
Store(brOffset, br.Field("off"))
441+
Store(brOffset, br.Field("in").Len())
442442

443443
if !o.useSeqs {
444444
Comment("Update the context")

zstd/bitreader.go

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ import (
1717
// for aligning the input.
1818
type bitReader struct {
1919
in []byte
20-
off uint // next byte to read is at in[off - 1]
2120
value uint64 // Maybe use [16]byte, but shifting is awkward.
2221
bitsRead uint8
2322
}
@@ -28,7 +27,6 @@ func (b *bitReader) init(in []byte) error {
2827
return errors.New("corrupt stream: too short")
2928
}
3029
b.in = in
31-
b.off = uint(len(in))
3230
// The highest bit of the last byte indicates where to start
3331
v := in[len(in)-1]
3432
if v == 0 {
@@ -69,47 +67,45 @@ func (b *bitReader) fillFast() {
6967
if b.bitsRead < 32 {
7068
return
7169
}
72-
// 2 bounds checks.
73-
v := b.in[b.off-4:]
74-
v = v[:4]
70+
v := b.in[len(b.in)-4:]
71+
b.in = b.in[:len(b.in)-4]
7572
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
7673
b.value = (b.value << 32) | uint64(low)
7774
b.bitsRead -= 32
78-
b.off -= 4
7975
}
8076

8177
// fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
8278
func (b *bitReader) fillFastStart() {
83-
// Do single re-slice to avoid bounds checks.
84-
b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
79+
v := b.in[len(b.in)-8:]
80+
b.in = b.in[:len(b.in)-8]
81+
b.value = binary.LittleEndian.Uint64(v)
8582
b.bitsRead = 0
86-
b.off -= 8
8783
}
8884

8985
// fill() will make sure at least 32 bits are available.
9086
func (b *bitReader) fill() {
9187
if b.bitsRead < 32 {
9288
return
9389
}
94-
if b.off >= 4 {
95-
v := b.in[b.off-4:]
96-
v = v[:4]
90+
if len(b.in) >= 4 {
91+
v := b.in[len(b.in)-4:]
92+
b.in = b.in[:len(b.in)-4]
9793
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
9894
b.value = (b.value << 32) | uint64(low)
9995
b.bitsRead -= 32
100-
b.off -= 4
10196
return
10297
}
103-
for b.off > 0 {
104-
b.value = (b.value << 8) | uint64(b.in[b.off-1])
105-
b.bitsRead -= 8
106-
b.off--
98+
99+
b.bitsRead -= uint8(8 * len(b.in))
100+
for len(b.in) > 0 {
101+
b.value = (b.value << 8) | uint64(b.in[len(b.in)-1])
102+
b.in = b.in[:len(b.in)-1]
107103
}
108104
}
109105

110106
// finished returns true if all bits have been read from the bit stream.
111107
func (b *bitReader) finished() bool {
112-
return b.off == 0 && b.bitsRead >= 64
108+
return len(b.in) == 0 && b.bitsRead >= 64
113109
}
114110

115111
// overread returns true if more bits have been requested than is on the stream.
@@ -119,7 +115,7 @@ func (b *bitReader) overread() bool {
119115

120116
// remain returns the number of bits remaining.
121117
func (b *bitReader) remain() uint {
122-
return b.off*8 + 64 - uint(b.bitsRead)
118+
return 8*uint(len(b.in)) + 64 - uint(b.bitsRead)
123119
}
124120

125121
// close the bitstream and returns an error if out-of-buffer reads occurred.

zstd/seqdec.go

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
245245
return io.ErrUnexpectedEOF
246246
}
247247
var ll, mo, ml int
248-
if br.off > 4+((maxOffsetBits+16+16)>>3) {
248+
if len(br.in) > 4+((maxOffsetBits+16+16)>>3) {
249249
// inlined function:
250250
// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
251251

@@ -452,18 +452,13 @@ func (s *sequenceDecs) next(br *bitReader, llState, mlState, ofState decSymbol)
452452

453453
// extra bits are stored in reverse order.
454454
br.fill()
455-
if s.maxBits <= 32 {
456-
mo += br.getBits(moB)
457-
ml += br.getBits(mlB)
458-
ll += br.getBits(llB)
459-
} else {
460-
mo += br.getBits(moB)
455+
mo += br.getBits(moB)
456+
if s.maxBits > 32 {
461457
br.fill()
462-
// matchlength+literal length, max 32 bits
463-
ml += br.getBits(mlB)
464-
ll += br.getBits(llB)
465-
466458
}
459+
// matchlength+literal length, max 32 bits
460+
ml += br.getBits(mlB)
461+
ll += br.getBits(llB)
467462
mo = s.adjustOffset(mo, ll, moB)
468463
return
469464
}

zstd/seqdec_amd64.s

Lines changed: 64 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55
// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
66
// Requires: CMOV
77
TEXT ·sequenceDecs_decode_amd64(SB), $8-32
8-
MOVQ br+8(FP), AX
9-
MOVQ 32(AX), DX
10-
MOVBQZX 40(AX), BX
11-
MOVQ 24(AX), SI
12-
MOVQ (AX), AX
8+
MOVQ br+8(FP), CX
9+
MOVQ 24(CX), DX
10+
MOVBQZX 32(CX), BX
11+
MOVQ (CX), AX
12+
MOVQ 8(CX), SI
1313
ADDQ SI, AX
1414
MOVQ AX, (SP)
1515
MOVQ ctx+16(FP), AX
@@ -301,9 +301,9 @@ sequenceDecs_decode_amd64_match_len_ofs_ok:
301301
MOVQ R12, 152(AX)
302302
MOVQ R13, 160(AX)
303303
MOVQ br+8(FP), AX
304-
MOVQ DX, 32(AX)
305-
MOVB BL, 40(AX)
306-
MOVQ SI, 24(AX)
304+
MOVQ DX, 24(AX)
305+
MOVB BL, 32(AX)
306+
MOVQ SI, 8(AX)
307307

308308
// Return success
309309
MOVQ $0x00000000, ret+24(FP)
@@ -336,11 +336,11 @@ error_overread:
336336
// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
337337
// Requires: CMOV
338338
TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
339-
MOVQ br+8(FP), AX
340-
MOVQ 32(AX), DX
341-
MOVBQZX 40(AX), BX
342-
MOVQ 24(AX), SI
343-
MOVQ (AX), AX
339+
MOVQ br+8(FP), CX
340+
MOVQ 24(CX), DX
341+
MOVBQZX 32(CX), BX
342+
MOVQ (CX), AX
343+
MOVQ 8(CX), SI
344344
ADDQ SI, AX
345345
MOVQ AX, (SP)
346346
MOVQ ctx+16(FP), AX
@@ -603,9 +603,9 @@ sequenceDecs_decode_56_amd64_match_len_ofs_ok:
603603
MOVQ R12, 152(AX)
604604
MOVQ R13, 160(AX)
605605
MOVQ br+8(FP), AX
606-
MOVQ DX, 32(AX)
607-
MOVB BL, 40(AX)
608-
MOVQ SI, 24(AX)
606+
MOVQ DX, 24(AX)
607+
MOVB BL, 32(AX)
608+
MOVQ SI, 8(AX)
609609

610610
// Return success
611611
MOVQ $0x00000000, ret+24(FP)
@@ -638,11 +638,11 @@ error_overread:
638638
// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
639639
// Requires: BMI, BMI2, CMOV
640640
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
641-
MOVQ br+8(FP), CX
642-
MOVQ 32(CX), AX
643-
MOVBQZX 40(CX), DX
644-
MOVQ 24(CX), BX
645-
MOVQ (CX), CX
641+
MOVQ br+8(FP), BX
642+
MOVQ 24(BX), AX
643+
MOVBQZX 32(BX), DX
644+
MOVQ (BX), CX
645+
MOVQ 8(BX), BX
646646
ADDQ BX, CX
647647
MOVQ CX, (SP)
648648
MOVQ ctx+16(FP), CX
@@ -892,9 +892,9 @@ sequenceDecs_decode_bmi2_match_len_ofs_ok:
892892
MOVQ R11, 152(CX)
893893
MOVQ R12, 160(CX)
894894
MOVQ br+8(FP), CX
895-
MOVQ AX, 32(CX)
896-
MOVB DL, 40(CX)
897-
MOVQ BX, 24(CX)
895+
MOVQ AX, 24(CX)
896+
MOVB DL, 32(CX)
897+
MOVQ BX, 8(CX)
898898

899899
// Return success
900900
MOVQ $0x00000000, ret+24(FP)
@@ -927,11 +927,11 @@ error_overread:
927927
// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
928928
// Requires: BMI, BMI2, CMOV
929929
TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
930-
MOVQ br+8(FP), CX
931-
MOVQ 32(CX), AX
932-
MOVBQZX 40(CX), DX
933-
MOVQ 24(CX), BX
934-
MOVQ (CX), CX
930+
MOVQ br+8(FP), BX
931+
MOVQ 24(BX), AX
932+
MOVBQZX 32(BX), DX
933+
MOVQ (BX), CX
934+
MOVQ 8(BX), BX
935935
ADDQ BX, CX
936936
MOVQ CX, (SP)
937937
MOVQ ctx+16(FP), CX
@@ -1152,9 +1152,9 @@ sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
11521152
MOVQ R11, 152(CX)
11531153
MOVQ R12, 160(CX)
11541154
MOVQ br+8(FP), CX
1155-
MOVQ AX, 32(CX)
1156-
MOVB DL, 40(CX)
1157-
MOVQ BX, 24(CX)
1155+
MOVQ AX, 24(CX)
1156+
MOVB DL, 32(CX)
1157+
MOVQ BX, 8(CX)
11581158

11591159
// Return success
11601160
MOVQ $0x00000000, ret+24(FP)
@@ -1797,11 +1797,11 @@ empty_seqs:
17971797
// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
17981798
// Requires: CMOV, SSE
17991799
TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
1800-
MOVQ br+8(FP), AX
1801-
MOVQ 32(AX), DX
1802-
MOVBQZX 40(AX), BX
1803-
MOVQ 24(AX), SI
1804-
MOVQ (AX), AX
1800+
MOVQ br+8(FP), CX
1801+
MOVQ 24(CX), DX
1802+
MOVBQZX 32(CX), BX
1803+
MOVQ (CX), AX
1804+
MOVQ 8(CX), SI
18051805
ADDQ SI, AX
18061806
MOVQ AX, (SP)
18071807
MOVQ ctx+16(FP), AX
@@ -2295,9 +2295,9 @@ handle_loop:
22952295

22962296
loop_finished:
22972297
MOVQ br+8(FP), AX
2298-
MOVQ DX, 32(AX)
2299-
MOVB BL, 40(AX)
2300-
MOVQ SI, 24(AX)
2298+
MOVQ DX, 24(AX)
2299+
MOVB BL, 32(AX)
2300+
MOVQ SI, 8(AX)
23012301

23022302
// Update the context
23032303
MOVQ ctx+16(FP), AX
@@ -2362,11 +2362,11 @@ error_not_enough_space:
23622362
// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
23632363
// Requires: BMI, BMI2, CMOV, SSE
23642364
TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
2365-
MOVQ br+8(FP), CX
2366-
MOVQ 32(CX), AX
2367-
MOVBQZX 40(CX), DX
2368-
MOVQ 24(CX), BX
2369-
MOVQ (CX), CX
2365+
MOVQ br+8(FP), BX
2366+
MOVQ 24(BX), AX
2367+
MOVBQZX 32(BX), DX
2368+
MOVQ (BX), CX
2369+
MOVQ 8(BX), BX
23702370
ADDQ BX, CX
23712371
MOVQ CX, (SP)
23722372
MOVQ ctx+16(FP), CX
@@ -2818,9 +2818,9 @@ handle_loop:
28182818

28192819
loop_finished:
28202820
MOVQ br+8(FP), CX
2821-
MOVQ AX, 32(CX)
2822-
MOVB DL, 40(CX)
2823-
MOVQ BX, 24(CX)
2821+
MOVQ AX, 24(CX)
2822+
MOVB DL, 32(CX)
2823+
MOVQ BX, 8(CX)
28242824

28252825
// Update the context
28262826
MOVQ ctx+16(FP), AX
@@ -2885,11 +2885,11 @@ error_not_enough_space:
28852885
// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
28862886
// Requires: CMOV, SSE
28872887
TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
2888-
MOVQ br+8(FP), AX
2889-
MOVQ 32(AX), DX
2890-
MOVBQZX 40(AX), BX
2891-
MOVQ 24(AX), SI
2892-
MOVQ (AX), AX
2888+
MOVQ br+8(FP), CX
2889+
MOVQ 24(CX), DX
2890+
MOVBQZX 32(CX), BX
2891+
MOVQ (CX), AX
2892+
MOVQ 8(CX), SI
28932893
ADDQ SI, AX
28942894
MOVQ AX, (SP)
28952895
MOVQ ctx+16(FP), AX
@@ -3485,9 +3485,9 @@ handle_loop:
34853485

34863486
loop_finished:
34873487
MOVQ br+8(FP), AX
3488-
MOVQ DX, 32(AX)
3489-
MOVB BL, 40(AX)
3490-
MOVQ SI, 24(AX)
3488+
MOVQ DX, 24(AX)
3489+
MOVB BL, 32(AX)
3490+
MOVQ SI, 8(AX)
34913491

34923492
// Update the context
34933493
MOVQ ctx+16(FP), AX
@@ -3552,11 +3552,11 @@ error_not_enough_space:
35523552
// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
35533553
// Requires: BMI, BMI2, CMOV, SSE
35543554
TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
3555-
MOVQ br+8(FP), CX
3556-
MOVQ 32(CX), AX
3557-
MOVBQZX 40(CX), DX
3558-
MOVQ 24(CX), BX
3559-
MOVQ (CX), CX
3555+
MOVQ br+8(FP), BX
3556+
MOVQ 24(BX), AX
3557+
MOVBQZX 32(BX), DX
3558+
MOVQ (BX), CX
3559+
MOVQ 8(BX), BX
35603560
ADDQ BX, CX
35613561
MOVQ CX, (SP)
35623562
MOVQ ctx+16(FP), CX
@@ -4110,9 +4110,9 @@ handle_loop:
41104110

41114111
loop_finished:
41124112
MOVQ br+8(FP), CX
4113-
MOVQ AX, 32(CX)
4114-
MOVB DL, 40(CX)
4115-
MOVQ BX, 24(CX)
4113+
MOVQ AX, 24(CX)
4114+
MOVB DL, 32(CX)
4115+
MOVQ BX, 8(CX)
41164116

41174117
// Update the context
41184118
MOVQ ctx+16(FP), AX

zstd/seqdec_generic.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
2929
}
3030
for i := range seqs {
3131
var ll, mo, ml int
32-
if br.off > 4+((maxOffsetBits+16+16)>>3) {
32+
if len(br.in) > 4+((maxOffsetBits+16+16)>>3) {
3333
// inlined function:
3434
// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
3535

0 commit comments

Comments
 (0)
0