diff --git a/vendor.mod b/vendor.mod
index b9a1ad8b27..e81c425e17 100644
--- a/vendor.mod
+++ b/vendor.mod
@@ -46,7 +46,7 @@ require (
github.com/hashicorp/serf v0.8.5
github.com/imdario/mergo v0.3.12
github.com/ishidawataru/sctp v0.0.0-20210707070123-9a39160e9062
- github.com/klauspost/compress v1.15.1
+ github.com/klauspost/compress v1.15.9
github.com/miekg/dns v1.1.27
github.com/mistifyio/go-zfs v2.1.2-0.20190413222219-f784269be439+incompatible
github.com/moby/buildkit v0.10.4
diff --git a/vendor.sum b/vendor.sum
index cbdc0ef153..9631edc612 100644
--- a/vendor.sum
+++ b/vendor.sum
@@ -692,8 +692,9 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.11.3/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
github.com/klauspost/compress v1.11.13/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
-github.com/klauspost/compress v1.15.1 h1:y9FcTHGyrebwfP0ZZqFiaxTaiDnUrGkJkI+f583BL1A=
github.com/klauspost/compress v1.15.1/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
+github.com/klauspost/compress v1.15.9 h1:wKRjX6JRtDdrE9qwa4b/Cip7ACOshUI4smpCQanqjSY=
+github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
diff --git a/vendor/github.com/klauspost/compress/.gitignore b/vendor/github.com/klauspost/compress/.gitignore
index b35f8449bf..d31b378152 100644
--- a/vendor/github.com/klauspost/compress/.gitignore
+++ b/vendor/github.com/klauspost/compress/.gitignore
@@ -23,3 +23,10 @@ _testmain.go
*.test
*.prof
/s2/cmd/_s2sx/sfx-exe
+
+# Linux perf files
+perf.data
+perf.data.old
+
+# gdb history
+.gdb_history
diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md
index 0e2dc116ad..ad5c63a82a 100644
--- a/vendor/github.com/klauspost/compress/README.md
+++ b/vendor/github.com/klauspost/compress/README.md
@@ -17,6 +17,72 @@ This package provides various compression algorithms.
# changelog
+* July 13, 2022 (v1.15.8)
+
+ * gzip: fix stack exhaustion bug in Reader.Read https://github.com/klauspost/compress/pull/641
+ * s2: Add Index header trim/restore https://github.com/klauspost/compress/pull/638
+ * zstd: Optimize seqdeq amd64 asm by @greatroar in https://github.com/klauspost/compress/pull/636
+ * zstd: Improve decoder memcopy https://github.com/klauspost/compress/pull/637
+ * huff0: Pass a single bitReader pointer to asm by @greatroar in https://github.com/klauspost/compress/pull/634
+ * zstd: Branchless getBits for amd64 w/o BMI2 by @greatroar in https://github.com/klauspost/compress/pull/640
+ * gzhttp: Remove header before writing https://github.com/klauspost/compress/pull/639
+
+* June 29, 2022 (v1.15.7)
+
+ * s2: Fix absolute forward seeks https://github.com/klauspost/compress/pull/633
+ * zip: Merge upstream https://github.com/klauspost/compress/pull/631
+ * zip: Re-add zip64 fix https://github.com/klauspost/compress/pull/624
+ * zstd: translate fseDecoder.buildDtable into asm by @WojciechMula in https://github.com/klauspost/compress/pull/598
+ * flate: Faster histograms https://github.com/klauspost/compress/pull/620
+ * deflate: Use compound hcode https://github.com/klauspost/compress/pull/622
+
+* June 3, 2022 (v1.15.6)
+ * s2: Improve coding for long, close matches https://github.com/klauspost/compress/pull/613
+ * s2c: Add Snappy/S2 stream recompression https://github.com/klauspost/compress/pull/611
+ * zstd: Always use configured block size https://github.com/klauspost/compress/pull/605
+ * zstd: Fix incorrect hash table placement for dict encoding in default https://github.com/klauspost/compress/pull/606
+ * zstd: Apply default config to ZipDecompressor without options https://github.com/klauspost/compress/pull/608
+ * gzhttp: Exclude more common archive formats https://github.com/klauspost/compress/pull/612
+ * s2: Add ReaderIgnoreCRC https://github.com/klauspost/compress/pull/609
+ * s2: Remove sanity load on index creation https://github.com/klauspost/compress/pull/607
+ * snappy: Use dedicated function for scoring https://github.com/klauspost/compress/pull/614
+ * s2c+s2d: Use official snappy framed extension https://github.com/klauspost/compress/pull/610
+
+* May 25, 2022 (v1.15.5)
+ * s2: Add concurrent stream decompression https://github.com/klauspost/compress/pull/602
+ * s2: Fix final emit oob read crash on amd64 https://github.com/klauspost/compress/pull/601
+ * huff0: asm implementation of Decompress1X by @WojciechMula https://github.com/klauspost/compress/pull/596
+ * zstd: Use 1 less goroutine for stream decoding https://github.com/klauspost/compress/pull/588
+ * zstd: Copy literal in 16 byte blocks when possible https://github.com/klauspost/compress/pull/592
+ * zstd: Speed up when WithDecoderLowmem(false) https://github.com/klauspost/compress/pull/599
+ * zstd: faster next state update in BMI2 version of decode by @WojciechMula in https://github.com/klauspost/compress/pull/593
+ * huff0: Do not check max size when reading table. https://github.com/klauspost/compress/pull/586
+ * flate: Inplace hashing for level 7-9 by @klauspost in https://github.com/klauspost/compress/pull/590
+
+
+* May 11, 2022 (v1.15.4)
+ * huff0: decompress directly into output by @WojciechMula in [#577](https://github.com/klauspost/compress/pull/577)
+ * inflate: Keep dict on stack [#581](https://github.com/klauspost/compress/pull/581)
+ * zstd: Faster decoding memcopy in asm [#583](https://github.com/klauspost/compress/pull/583)
+ * zstd: Fix ignored crc [#580](https://github.com/klauspost/compress/pull/580)
+
+* May 5, 2022 (v1.15.3)
+ * zstd: Allow to ignore checksum checking by @WojciechMula [#572](https://github.com/klauspost/compress/pull/572)
+ * s2: Fix incorrect seek for io.SeekEnd in [#575](https://github.com/klauspost/compress/pull/575)
+
+* Apr 26, 2022 (v1.15.2)
+ * zstd: Add x86-64 assembly for decompression on streams and blocks. Contributed by [@WojciechMula](https://github.com/WojciechMula). Typically 2x faster. [#528](https://github.com/klauspost/compress/pull/528) [#531](https://github.com/klauspost/compress/pull/531) [#545](https://github.com/klauspost/compress/pull/545) [#537](https://github.com/klauspost/compress/pull/537)
+ * zstd: Add options to ZipDecompressor and fixes [#539](https://github.com/klauspost/compress/pull/539)
+ * s2: Use sorted search for index [#555](https://github.com/klauspost/compress/pull/555)
+ * Minimum version is Go 1.16, added CI test on 1.18.
+
+* Mar 11, 2022 (v1.15.1)
+ * huff0: Add x86 assembly of Decode4X by @WojciechMula in [#512](https://github.com/klauspost/compress/pull/512)
+ * zstd: Reuse zip decoders in [#514](https://github.com/klauspost/compress/pull/514)
+ * zstd: Detect extra block data and report as corrupted in [#520](https://github.com/klauspost/compress/pull/520)
+ * zstd: Handle zero sized frame content size stricter in [#521](https://github.com/klauspost/compress/pull/521)
+ * zstd: Add stricter block size checks in [#523](https://github.com/klauspost/compress/pull/523)
+
* Mar 3, 2022 (v1.15.0)
* zstd: Refactor decoder by @klauspost in [#498](https://github.com/klauspost/compress/pull/498)
* zstd: Add stream encoding without goroutines by @klauspost in [#505](https://github.com/klauspost/compress/pull/505)
@@ -60,6 +126,9 @@ While the release has been extensively tested, it is recommended to testing when
* zstd: add arm64 xxhash assembly in [#464](https://github.com/klauspost/compress/pull/464)
* Add garbled for binaries for s2 in [#445](https://github.com/klauspost/compress/pull/445)
+
+ See changes to v1.13.x
+
* Aug 30, 2021 (v1.13.5)
* gz/zlib/flate: Alias stdlib errors [#425](https://github.com/klauspost/compress/pull/425)
* s2: Add block support to commandline tools [#413](https://github.com/klauspost/compress/pull/413)
@@ -88,6 +157,8 @@ While the release has been extensively tested, it is recommended to testing when
* Added [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp#gzip-handler) which allows wrapping HTTP servers and clients with GZIP compressors.
* zstd: Detect short invalid signatures [#382](https://github.com/klauspost/compress/pull/382)
* zstd: Spawn decoder goroutine only if needed. [#380](https://github.com/klauspost/compress/pull/380)
+
+
See changes to v1.12.x
diff --git a/vendor/github.com/klauspost/compress/huff0/autogen.go b/vendor/github.com/klauspost/compress/huff0/autogen.go
deleted file mode 100644
index ff2c69d60c..0000000000
--- a/vendor/github.com/klauspost/compress/huff0/autogen.go
+++ /dev/null
@@ -1,5 +0,0 @@
-package huff0
-
-//go:generate go run generate.go
-//go:generate asmfmt -w decompress_amd64.s
-//go:generate asmfmt -w decompress_8b_amd64.s
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
index 451160edda..504a7be9da 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@@ -165,11 +165,6 @@ func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
return uint16(b.value >> ((64 - n) & 63))
}
-// peekTopBits(n) is equvialent to peekBitFast(64 - n)
-func (b *bitReaderShifted) peekTopBits(n uint8) uint16 {
- return uint16(b.value >> n)
-}
-
func (b *bitReaderShifted) advance(n uint8) {
b.bitsRead += n
b.value <<= n & 63
@@ -220,11 +215,6 @@ func (b *bitReaderShifted) fill() {
}
}
-// finished returns true if all bits have been read from the bit stream.
-func (b *bitReaderShifted) finished() bool {
- return b.off == 0 && b.bitsRead >= 64
-}
-
func (b *bitReaderShifted) remaining() uint {
return b.off*8 + uint(64-b.bitsRead)
}
diff --git a/vendor/github.com/klauspost/compress/huff0/bitwriter.go b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
index 6bce4e87d4..ec71f7a349 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
@@ -5,8 +5,6 @@
package huff0
-import "fmt"
-
// bitWriter will write bits.
// First bit will be LSB of the first byte of output.
type bitWriter struct {
@@ -23,14 +21,6 @@ var bitMask16 = [32]uint16{
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF} /* up to 16 bits */
-// addBits16NC will add up to 16 bits.
-// It will not check if there is space for them,
-// so the caller must ensure that it has flushed recently.
-func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
- b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63)
- b.nBits += bits
-}
-
// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
@@ -70,104 +60,6 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
b.nBits += encA.nBits + encB.nBits
}
-// addBits16ZeroNC will add up to 16 bits.
-// It will not check if there is space for them,
-// so the caller must ensure that it has flushed recently.
-// This is fastest if bits can be zero.
-func (b *bitWriter) addBits16ZeroNC(value uint16, bits uint8) {
- if bits == 0 {
- return
- }
- value <<= (16 - bits) & 15
- value >>= (16 - bits) & 15
- b.bitContainer |= uint64(value) << (b.nBits & 63)
- b.nBits += bits
-}
-
-// flush will flush all pending full bytes.
-// There will be at least 56 bits available for writing when this has been called.
-// Using flush32 is faster, but leaves less space for writing.
-func (b *bitWriter) flush() {
- v := b.nBits >> 3
- switch v {
- case 0:
- return
- case 1:
- b.out = append(b.out,
- byte(b.bitContainer),
- )
- b.bitContainer >>= 1 << 3
- case 2:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- )
- b.bitContainer >>= 2 << 3
- case 3:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- )
- b.bitContainer >>= 3 << 3
- case 4:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- )
- b.bitContainer >>= 4 << 3
- case 5:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- )
- b.bitContainer >>= 5 << 3
- case 6:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- )
- b.bitContainer >>= 6 << 3
- case 7:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- byte(b.bitContainer>>48),
- )
- b.bitContainer >>= 7 << 3
- case 8:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- byte(b.bitContainer>>48),
- byte(b.bitContainer>>56),
- )
- b.bitContainer = 0
- b.nBits = 0
- return
- default:
- panic(fmt.Errorf("bits (%d) > 64", b.nBits))
- }
- b.nBits &= 7
-}
-
// flush32 will flush out, so there are at least 32 bits available for writing.
func (b *bitWriter) flush32() {
if b.nBits < 32 {
@@ -201,10 +93,3 @@ func (b *bitWriter) close() error {
b.flushAlign()
return nil
}
-
-// reset and continue writing by appending to out.
-func (b *bitWriter) reset(out []byte) {
- b.bitContainer = 0
- b.nBits = 0
- b.out = out
-}
diff --git a/vendor/github.com/klauspost/compress/huff0/bytereader.go b/vendor/github.com/klauspost/compress/huff0/bytereader.go
index 50bcdf6ea9..4dcab8d232 100644
--- a/vendor/github.com/klauspost/compress/huff0/bytereader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bytereader.go
@@ -20,11 +20,6 @@ func (b *byteReader) init(in []byte) {
b.off = 0
}
-// advance the stream b n bytes.
-func (b *byteReader) advance(n uint) {
- b.off += int(n)
-}
-
// Int32 returns a little endian int32 starting at current offset.
func (b byteReader) Int32() int32 {
v3 := int32(b.b[b.off+3])
@@ -43,11 +38,6 @@ func (b byteReader) Uint32() uint32 {
return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
}
-// unread returns the unread portion of the input.
-func (b byteReader) unread() []byte {
- return b.b[b.off:]
-}
-
// remain will return the number of bytes remaining.
func (b byteReader) remain() int {
return len(b.b) - b.off
diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go
index bc95ac623b..4d14542fac 100644
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ b/vendor/github.com/klauspost/compress/huff0/compress.go
@@ -404,6 +404,7 @@ func (s *Scratch) canUseTable(c cTable) bool {
return true
}
+//lint:ignore U1000 used for debugging
func (s *Scratch) validateTable(c cTable) bool {
if len(c) < int(s.symbolLen) {
return false
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
index 04f6529955..c0c48bd707 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -11,7 +11,6 @@ import (
type dTable struct {
single []dEntrySingle
- double []dEntryDouble
}
// single-symbols decoding
@@ -19,13 +18,6 @@ type dEntrySingle struct {
entry uint16
}
-// double-symbols decoding
-type dEntryDouble struct {
- seq [4]byte
- nBits uint8
- len uint8
-}
-
// Uses special code for all tables that are < 8 bits.
const use8BitTables = true
@@ -35,7 +27,7 @@ const use8BitTables = true
// If no Scratch is provided a new one is allocated.
// The returned Scratch can be used for encoding or decoding input using this table.
func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
- s, err = s.prepare(in)
+ s, err = s.prepare(nil)
if err != nil {
return s, nil, err
}
@@ -236,108 +228,6 @@ func (d *Decoder) buffer() *[4][256]byte {
return &[4][256]byte{}
}
-// Decompress1X will decompress a 1X encoded stream.
-// The cap of the output buffer will be the maximum decompressed size.
-// The length of the supplied input must match the end of a block exactly.
-func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
- if len(d.dt.single) == 0 {
- return nil, errors.New("no table loaded")
- }
- if use8BitTables && d.actualTableLog <= 8 {
- return d.decompress1X8Bit(dst, src)
- }
- var br bitReaderShifted
- err := br.init(src)
- if err != nil {
- return dst, err
- }
- maxDecodedSize := cap(dst)
- dst = dst[:0]
-
- // Avoid bounds check by always having full sized table.
- const tlSize = 1 << tableLogMax
- const tlMask = tlSize - 1
- dt := d.dt.single[:tlSize]
-
- // Use temp table to avoid bound checks/append penalty.
- bufs := d.buffer()
- buf := &bufs[0]
- var off uint8
-
- for br.off >= 8 {
- br.fillFast()
- v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
- br.advance(uint8(v.entry))
- buf[off+0] = uint8(v.entry >> 8)
-
- v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
- br.advance(uint8(v.entry))
- buf[off+1] = uint8(v.entry >> 8)
-
- // Refill
- br.fillFast()
-
- v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
- br.advance(uint8(v.entry))
- buf[off+2] = uint8(v.entry >> 8)
-
- v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
- br.advance(uint8(v.entry))
- buf[off+3] = uint8(v.entry >> 8)
-
- off += 4
- if off == 0 {
- if len(dst)+256 > maxDecodedSize {
- br.close()
- d.bufs.Put(bufs)
- return nil, ErrMaxDecodedSizeExceeded
- }
- dst = append(dst, buf[:]...)
- }
- }
-
- if len(dst)+int(off) > maxDecodedSize {
- d.bufs.Put(bufs)
- br.close()
- return nil, ErrMaxDecodedSizeExceeded
- }
- dst = append(dst, buf[:off]...)
-
- // br < 8, so uint8 is fine
- bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
- for bitsLeft > 0 {
- br.fill()
- if false && br.bitsRead >= 32 {
- if br.off >= 4 {
- v := br.in[br.off-4:]
- v = v[:4]
- low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- br.value = (br.value << 32) | uint64(low)
- br.bitsRead -= 32
- br.off -= 4
- } else {
- for br.off > 0 {
- br.value = (br.value << 8) | uint64(br.in[br.off-1])
- br.bitsRead -= 8
- br.off--
- }
- }
- }
- if len(dst) >= maxDecodedSize {
- d.bufs.Put(bufs)
- br.close()
- return nil, ErrMaxDecodedSizeExceeded
- }
- v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
- nBits := uint8(v.entry)
- br.advance(nBits)
- bitsLeft -= nBits
- dst = append(dst, uint8(v.entry>>8))
- }
- d.bufs.Put(bufs)
- return dst, br.close()
-}
-
// decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
// The cap of the output buffer will be the maximum decompressed size.
// The length of the supplied input must match the end of a block exactly.
@@ -995,7 +885,6 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
const shift = 56
const tlSize = 1 << 8
- const tlMask = tlSize - 1
single := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s
deleted file mode 100644
index 0d6cb1a962..0000000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s
+++ /dev/null
@@ -1,488 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#define bufoff 256 // see decompress.go, we're using [4][256]byte table
-
-// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
-#define off R8
-#define buffer DI
-#define table SI
-
-#define br_bits_read R9
-#define br_value R10
-#define br_offset R11
-#define peek_bits R12
-#define exhausted DX
-
-#define br0 R13
-#define br1 R14
-#define br2 R15
-#define br3 BP
-
- MOVQ BP, 0(SP)
-
- XORQ exhausted, exhausted // exhausted = false
- XORQ off, off // off = 0
-
- MOVBQZX peekBits+32(FP), peek_bits
- MOVQ buf+40(FP), buffer
- MOVQ tbl+48(FP), table
-
- MOVQ pbr0+0(FP), br0
- MOVQ pbr1+8(FP), br1
- MOVQ pbr2+16(FP), br2
- MOVQ pbr3+24(FP), br3
-
-main_loop:
-
- // const stream = 0
- // br0.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
- MOVQ bitReaderShifted_value(br0), br_value
- MOVQ bitReaderShifted_off(br0), br_offset
-
- // if b.bitsRead >= 32 {
- CMPQ br_bits_read, $32
- JB skip_fill0
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br0), AX
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
- MOVQ br_bits_read, CX
- SHLQ CL, AX
- ORQ AX, br_value
-
- // exhausted = exhausted || (br0.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
-
- // }
-skip_fill0:
-
- // val0 := br0.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br0.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val1 := br0.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br0.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 0(buffer)(off*1)
-
- // SECOND PART:
- // val2 := br0.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v2 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br0.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val3 := br0.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v3 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br0.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off+2] = uint8(v2.entry >> 8)
- // buf[stream][off+3] = uint8(v3.entry >> 8)
- MOVW BX, 0+2(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
- MOVQ br_value, bitReaderShifted_value(br0)
- MOVQ br_offset, bitReaderShifted_off(br0)
-
- // const stream = 1
- // br1.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
- MOVQ bitReaderShifted_value(br1), br_value
- MOVQ bitReaderShifted_off(br1), br_offset
-
- // if b.bitsRead >= 32 {
- CMPQ br_bits_read, $32
- JB skip_fill1
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br1), AX
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
- MOVQ br_bits_read, CX
- SHLQ CL, AX
- ORQ AX, br_value
-
- // exhausted = exhausted || (br1.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
-
- // }
-skip_fill1:
-
- // val0 := br1.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br1.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val1 := br1.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br1.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 256(buffer)(off*1)
-
- // SECOND PART:
- // val2 := br1.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v2 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br1.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val3 := br1.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v3 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br1.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off+2] = uint8(v2.entry >> 8)
- // buf[stream][off+3] = uint8(v3.entry >> 8)
- MOVW BX, 256+2(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
- MOVQ br_value, bitReaderShifted_value(br1)
- MOVQ br_offset, bitReaderShifted_off(br1)
-
- // const stream = 2
- // br2.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
- MOVQ bitReaderShifted_value(br2), br_value
- MOVQ bitReaderShifted_off(br2), br_offset
-
- // if b.bitsRead >= 32 {
- CMPQ br_bits_read, $32
- JB skip_fill2
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br2), AX
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
- MOVQ br_bits_read, CX
- SHLQ CL, AX
- ORQ AX, br_value
-
- // exhausted = exhausted || (br2.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
-
- // }
-skip_fill2:
-
- // val0 := br2.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br2.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val1 := br2.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br2.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 512(buffer)(off*1)
-
- // SECOND PART:
- // val2 := br2.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v2 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br2.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val3 := br2.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v3 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br2.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off+2] = uint8(v2.entry >> 8)
- // buf[stream][off+3] = uint8(v3.entry >> 8)
- MOVW BX, 512+2(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
- MOVQ br_value, bitReaderShifted_value(br2)
- MOVQ br_offset, bitReaderShifted_off(br2)
-
- // const stream = 3
- // br3.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
- MOVQ bitReaderShifted_value(br3), br_value
- MOVQ bitReaderShifted_off(br3), br_offset
-
- // if b.bitsRead >= 32 {
- CMPQ br_bits_read, $32
- JB skip_fill3
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br3), AX
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
- MOVQ br_bits_read, CX
- SHLQ CL, AX
- ORQ AX, br_value
-
- // exhausted = exhausted || (br3.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
-
- // }
-skip_fill3:
-
- // val0 := br3.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br3.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val1 := br3.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br3.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 768(buffer)(off*1)
-
- // SECOND PART:
- // val2 := br3.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v2 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br3.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val3 := br3.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v3 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br3.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // these two writes get coalesced
- // buf[stream][off+2] = uint8(v2.entry >> 8)
- // buf[stream][off+3] = uint8(v3.entry >> 8)
- MOVW BX, 768+2(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
- MOVQ br_value, bitReaderShifted_value(br3)
- MOVQ br_offset, bitReaderShifted_off(br3)
-
- ADDQ $4, off // off += 2
-
- TESTB DH, DH // any br[i].ofs < 4?
- JNZ end
-
- CMPQ off, $bufoff
- JL main_loop
-
-end:
- MOVQ 0(SP), BP
-
- MOVB off, ret+56(FP)
- RET
-
-#undef off
-#undef buffer
-#undef table
-
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
-
-#undef br0
-#undef br1
-#undef br2
-#undef br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in
deleted file mode 100644
index 6d477a2c11..0000000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in
+++ /dev/null
@@ -1,197 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-
-#define bufoff 256 // see decompress.go, we're using [4][256]byte table
-
-//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
-#define off R8
-#define buffer DI
-#define table SI
-
-#define br_bits_read R9
-#define br_value R10
-#define br_offset R11
-#define peek_bits R12
-#define exhausted DX
-
-#define br0 R13
-#define br1 R14
-#define br2 R15
-#define br3 BP
-
- MOVQ BP, 0(SP)
-
- XORQ exhausted, exhausted // exhausted = false
- XORQ off, off // off = 0
-
- MOVBQZX peekBits+32(FP), peek_bits
- MOVQ buf+40(FP), buffer
- MOVQ tbl+48(FP), table
-
- MOVQ pbr0+0(FP), br0
- MOVQ pbr1+8(FP), br1
- MOVQ pbr2+16(FP), br2
- MOVQ pbr3+24(FP), br3
-
-main_loop:
-{{ define "decode_2_values_x86" }}
- // const stream = {{ var "id" }}
- // br{{ var "id"}}.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
- MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value
- MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset
-
- // if b.bitsRead >= 32 {
- CMPQ br_bits_read, $32
- JB skip_fill{{ var "id" }}
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br{{ var "id" }}), AX
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
- MOVQ br_bits_read, CX
- SHLQ CL, AX
- ORQ AX, br_value
-
- // exhausted = exhausted || (br{{ var "id"}}.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
- // }
-skip_fill{{ var "id" }}:
-
- // val0 := br{{ var "id"}}.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br{{ var "id"}}.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val1 := br{{ var "id"}}.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br{{ var "id"}}.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, {{ var "bufofs" }}(buffer)(off*1)
-
- // SECOND PART:
- // val2 := br{{ var "id"}}.peekTopBits(peekBits)
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v2 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br{{ var "id"}}.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
- // val3 := br{{ var "id"}}.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
- // v3 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br{{ var "id"}}.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
- MOVBQZX AL, CX
- SHLQ CX, br_value // value <<= n
- ADDQ CX, br_bits_read // bits_read += n
-
-
- // these two writes get coalesced
- // buf[stream][off+2] = uint8(v2.entry >> 8)
- // buf[stream][off+3] = uint8(v3.entry >> 8)
- MOVW BX, {{ var "bufofs" }}+2(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
- MOVQ br_value, bitReaderShifted_value(br{{ var "id" }})
- MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }})
-{{ end }}
-
- {{ set "id" "0" }}
- {{ set "ofs" "0" }}
- {{ set "bufofs" "0" }} {{/* id * bufoff */}}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "1" }}
- {{ set "ofs" "8" }}
- {{ set "bufofs" "256" }}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "2" }}
- {{ set "ofs" "16" }}
- {{ set "bufofs" "512" }}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "3" }}
- {{ set "ofs" "24" }}
- {{ set "bufofs" "768" }}
- {{ template "decode_2_values_x86" . }}
-
- ADDQ $4, off // off += 2
-
- TESTB DH, DH // any br[i].ofs < 4?
- JNZ end
-
- CMPQ off, $bufoff
- JL main_loop
-end:
- MOVQ 0(SP), BP
-
- MOVB off, ret+56(FP)
- RET
-#undef off
-#undef buffer
-#undef table
-
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
-
-#undef br0
-#undef br1
-#undef br2
-#undef br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
index d47f6644f3..9f3e9f79e2 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
@@ -2,30 +2,40 @@
// +build amd64,!appengine,!noasm,gc
// This file contains the specialisation of Decoder.Decompress4X
-// that uses an asm implementation of its main loop.
+// and Decoder.Decompress1X that use an asm implementation of thir main loops.
package huff0
import (
"errors"
"fmt"
+
+ "github.com/klauspost/compress/internal/cpuinfo"
)
// decompress4x_main_loop_x86 is an x86 assembler implementation
// of Decompress4X when tablelog > 8.
-// go:noescape
-func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
- peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+//go:noescape
+func decompress4x_main_loop_amd64(ctx *decompress4xContext)
// decompress4x_8b_loop_x86 is an x86 assembler implementation
// of Decompress4X when tablelog <= 8 which decodes 4 entries
// per loop.
-// go:noescape
-func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
- peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+//go:noescape
+func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
// fallback8BitSize is the size where using Go version is faster.
const fallback8BitSize = 800
+type decompress4xContext struct {
+ pbr *[4]bitReaderShifted
+ peekBits uint8
+ out *byte
+ dstEvery int
+ tbl *dEntrySingle
+ decoded int
+ limit *byte
+}
+
// Decompress4X will decompress a 4X encoded stream.
// The length of the supplied input must match the end of a block exactly.
// The *capacity* of the dst slice must match the destination size of
@@ -42,6 +52,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
if cap(dst) < fallback8BitSize && use8BitTables {
return d.decompress4X8bit(dst, src)
}
+
var br [4]bitReaderShifted
// Decode "jump table"
start := 6
@@ -71,70 +82,25 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
const tlMask = tlSize - 1
single := d.dt.single[:tlSize]
- // Use temp table to avoid bound checks/append penalty.
- buf := d.buffer()
- var off uint8
var decoded int
- const debug = false
-
- // see: bitReaderShifted.peekBitsFast()
- peekBits := uint8((64 - d.actualTableLog) & 63)
-
- // Decode 2 values from each decoder/loop.
- const bufoff = 256
- for {
- if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
- break
+ if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
+ ctx := decompress4xContext{
+ pbr: &br,
+ peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
+ out: &out[0],
+ dstEvery: dstEvery,
+ tbl: &single[0],
+ limit: &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
}
-
if use8BitTables {
- off = decompress4x_8b_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
+ decompress4x_8b_main_loop_amd64(&ctx)
} else {
- off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
- }
- if debug {
- fmt.Print("DEBUG: ")
- fmt.Printf("off=%d,", off)
- for i := 0; i < 4; i++ {
- fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}",
- i, br[i].bitsRead, br[i].value, br[i].off)
- }
- fmt.Println("")
+ decompress4x_main_loop_amd64(&ctx)
}
- if off != 0 {
- break
- }
-
- if bufoff > dstEvery {
- d.bufs.Put(buf)
- return nil, errors.New("corruption detected: stream overrun 1")
- }
- copy(out, buf[0][:])
- copy(out[dstEvery:], buf[1][:])
- copy(out[dstEvery*2:], buf[2][:])
- copy(out[dstEvery*3:], buf[3][:])
- out = out[bufoff:]
- decoded += bufoff * 4
- // There must at least be 3 buffers left.
- if len(out) < dstEvery*3 {
- d.bufs.Put(buf)
- return nil, errors.New("corruption detected: stream overrun 2")
- }
- }
- if off > 0 {
- ioff := int(off)
- if len(out) < dstEvery*3+ioff {
- d.bufs.Put(buf)
- return nil, errors.New("corruption detected: stream overrun 3")
- }
- copy(out, buf[0][:off])
- copy(out[dstEvery:], buf[1][:off])
- copy(out[dstEvery*2:], buf[2][:off])
- copy(out[dstEvery*3:], buf[3][:off])
- decoded += int(off) * 4
- out = out[off:]
+ decoded = ctx.decoded
+ out = out[decoded/4:]
}
// Decode remaining.
@@ -150,7 +116,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
for bitsLeft > 0 {
br.fill()
if offset >= endsAt {
- d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 4")
}
@@ -164,7 +129,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
offset++
}
if offset != endsAt {
- d.bufs.Put(buf)
return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
}
decoded += offset - dstEvery*i
@@ -173,9 +137,86 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
return nil, err
}
}
- d.bufs.Put(buf)
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
return dst, nil
}
+
+// decompress4x_main_loop_x86 is an x86 assembler implementation
+// of Decompress1X when tablelog > 8.
+//go:noescape
+func decompress1x_main_loop_amd64(ctx *decompress1xContext)
+
+// decompress4x_main_loop_x86 is an x86 with BMI2 assembler implementation
+// of Decompress1X when tablelog > 8.
+//go:noescape
+func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
+
+type decompress1xContext struct {
+ pbr *bitReaderShifted
+ peekBits uint8
+ out *byte
+ outCap int
+ tbl *dEntrySingle
+ decoded int
+}
+
+// Error reported by asm implementations
+const error_max_decoded_size_exeeded = -1
+
+// Decompress1X will decompress a 1X encoded stream.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
+ if len(d.dt.single) == 0 {
+ return nil, errors.New("no table loaded")
+ }
+ var br bitReaderShifted
+ err := br.init(src)
+ if err != nil {
+ return dst, err
+ }
+ maxDecodedSize := cap(dst)
+ dst = dst[:maxDecodedSize]
+
+ const tlSize = 1 << tableLogMax
+ const tlMask = tlSize - 1
+
+ if maxDecodedSize >= 4 {
+ ctx := decompress1xContext{
+ pbr: &br,
+ out: &dst[0],
+ outCap: maxDecodedSize,
+ peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
+ tbl: &d.dt.single[0],
+ }
+
+ if cpuinfo.HasBMI2() {
+ decompress1x_main_loop_bmi2(&ctx)
+ } else {
+ decompress1x_main_loop_amd64(&ctx)
+ }
+ if ctx.decoded == error_max_decoded_size_exeeded {
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+
+ dst = dst[:ctx.decoded]
+ }
+
+ // br < 8, so uint8 is fine
+ bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
+ for bitsLeft > 0 {
+ br.fill()
+ if len(dst) >= maxDecodedSize {
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
+ nBits := uint8(v.entry)
+ br.advance(nBits)
+ bitsLeft -= nBits
+ dst = append(dst, uint8(v.entry>>8))
+ }
+ return dst, br.close()
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
index 2edad3ea5a..dd1a5aecd6 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -1,506 +1,847 @@
-// +build !appengine
-// +build gc
-// +build !noasm
+// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
-#ifdef GOAMD64_v4
-#ifndef GOAMD64_v3
-#define GOAMD64_v3
-#endif
-#endif
+// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_main_loop_amd64(SB), $0-8
+ XORQ DX, DX
-#define bufoff 256 // see decompress.go, we're using [4][256]byte table
-
-// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
-#define off R8
-#define buffer DI
-#define table SI
-
-#define br_bits_read R9
-#define br_value R10
-#define br_offset R11
-#define peek_bits R12
-#define exhausted DX
-
-#define br0 R13
-#define br1 R14
-#define br2 R15
-#define br3 BP
-
- MOVQ BP, 0(SP)
-
- XORQ exhausted, exhausted // exhausted = false
- XORQ off, off // off = 0
-
- MOVBQZX peekBits+32(FP), peek_bits
- MOVQ buf+40(FP), buffer
- MOVQ tbl+48(FP), table
-
- MOVQ pbr0+0(FP), br0
- MOVQ pbr1+8(FP), br1
- MOVQ pbr2+16(FP), br2
- MOVQ pbr3+24(FP), br3
+ // Preload values
+ MOVQ ctx+0(FP), AX
+ MOVBQZX 8(AX), DI
+ MOVQ 16(AX), SI
+ MOVQ 48(AX), BX
+ MOVQ 24(AX), R9
+ MOVQ 32(AX), R10
+ MOVQ (AX), R11
+ // Main loop
main_loop:
+ MOVQ SI, R8
+ CMPQ R8, BX
+ SETGE DL
- // const stream = 0
- // br0.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
- MOVQ bitReaderShifted_value(br0), br_value
- MOVQ bitReaderShifted_off(br0), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill0
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br0), AX
+ // br0.fillFast32()
+ MOVQ 32(R11), R12
+ MOVBQZX 40(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill0
+ MOVQ 24(R11), AX
+ SUBQ $0x20, R13
+ SUBQ $0x04, AX
+ MOVQ (R11), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-
-#endif
-
- ORQ AX, br_value
+ MOVL (AX)(R14*1), R14
+ MOVQ R13, CX
+ SHLQ CL, R14
+ MOVQ AX, 24(R11)
+ ORQ R14, R12
// exhausted = exhausted || (br0.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
- // }
skip_fill0:
-
// val0 := br0.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
+ MOVW (R10)(R14*2), CX
- // br0.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
+ // br0.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
// val1 := br0.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ DI, CX
+ MOVQ R12, R14
+ SHRQ CL, R14
// v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
+ MOVW (R10)(R14*2), CX
// br0.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
// these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 0(buffer)(off*1)
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (R8)
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
- MOVQ br_value, bitReaderShifted_value(br0)
- MOVQ br_offset, bitReaderShifted_off(br0)
+ // update the bitreader structure
+ MOVQ R12, 32(R11)
+ MOVB R13, 40(R11)
+ ADDQ R9, R8
- // const stream = 1
- // br1.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
- MOVQ bitReaderShifted_value(br1), br_value
- MOVQ bitReaderShifted_off(br1), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill1
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br1), AX
+ // br1.fillFast32()
+ MOVQ 80(R11), R12
+ MOVBQZX 88(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill1
+ MOVQ 72(R11), AX
+ SUBQ $0x20, R13
+ SUBQ $0x04, AX
+ MOVQ 48(R11), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-
-#endif
-
- ORQ AX, br_value
+ MOVL (AX)(R14*1), R14
+ MOVQ R13, CX
+ SHLQ CL, R14
+ MOVQ AX, 72(R11)
+ ORQ R14, R12
// exhausted = exhausted || (br1.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
- // }
skip_fill1:
-
// val0 := br1.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
+ MOVW (R10)(R14*2), CX
- // br1.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
+ // br1.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
// val1 := br1.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ DI, CX
+ MOVQ R12, R14
+ SHRQ CL, R14
// v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
+ MOVW (R10)(R14*2), CX
// br1.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
// these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 256(buffer)(off*1)
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (R8)
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
- MOVQ br_value, bitReaderShifted_value(br1)
- MOVQ br_offset, bitReaderShifted_off(br1)
+ // update the bitreader structure
+ MOVQ R12, 80(R11)
+ MOVB R13, 88(R11)
+ ADDQ R9, R8
- // const stream = 2
- // br2.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
- MOVQ bitReaderShifted_value(br2), br_value
- MOVQ bitReaderShifted_off(br2), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill2
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br2), AX
+ // br2.fillFast32()
+ MOVQ 128(R11), R12
+ MOVBQZX 136(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill2
+ MOVQ 120(R11), AX
+ SUBQ $0x20, R13
+ SUBQ $0x04, AX
+ MOVQ 96(R11), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-
-#endif
-
- ORQ AX, br_value
+ MOVL (AX)(R14*1), R14
+ MOVQ R13, CX
+ SHLQ CL, R14
+ MOVQ AX, 120(R11)
+ ORQ R14, R12
// exhausted = exhausted || (br2.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
- // }
skip_fill2:
-
// val0 := br2.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
+ MOVW (R10)(R14*2), CX
- // br2.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
+ // br2.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
// val1 := br2.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ DI, CX
+ MOVQ R12, R14
+ SHRQ CL, R14
// v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
+ MOVW (R10)(R14*2), CX
// br2.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
// these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 512(buffer)(off*1)
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (R8)
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
- MOVQ br_value, bitReaderShifted_value(br2)
- MOVQ br_offset, bitReaderShifted_off(br2)
+ // update the bitreader structure
+ MOVQ R12, 128(R11)
+ MOVB R13, 136(R11)
+ ADDQ R9, R8
- // const stream = 3
- // br3.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
- MOVQ bitReaderShifted_value(br3), br_value
- MOVQ bitReaderShifted_off(br3), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill3
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br3), AX
+ // br3.fillFast32()
+ MOVQ 176(R11), R12
+ MOVBQZX 184(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill3
+ MOVQ 168(R11), AX
+ SUBQ $0x20, R13
+ SUBQ $0x04, AX
+ MOVQ 144(R11), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-
-#endif
-
- ORQ AX, br_value
+ MOVL (AX)(R14*1), R14
+ MOVQ R13, CX
+ SHLQ CL, R14
+ MOVQ AX, 168(R11)
+ ORQ R14, R12
// exhausted = exhausted || (br3.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
- // }
skip_fill3:
-
// val0 := br3.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
// v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
+ MOVW (R10)(R14*2), CX
- // br3.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
+ // br3.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
-
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
// val1 := br3.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-
-#endif
+ MOVQ DI, CX
+ MOVQ R12, R14
+ SHRQ CL, R14
// v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
+ MOVW (R10)(R14*2), CX
// br3.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
// these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, 768(buffer)(off*1)
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (R8)
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
- MOVQ br_value, bitReaderShifted_value(br3)
- MOVQ br_offset, bitReaderShifted_off(br3)
-
- ADDQ $2, off // off += 2
-
- TESTB DH, DH // any br[i].ofs < 4?
- JNZ end
-
- CMPQ off, $bufoff
- JL main_loop
-
-end:
- MOVQ 0(SP), BP
-
- MOVB off, ret+56(FP)
+ // update the bitreader structure
+ MOVQ R12, 176(R11)
+ MOVB R13, 184(R11)
+ ADDQ $0x02, SI
+ TESTB DL, DL
+ JZ main_loop
+ MOVQ ctx+0(FP), AX
+ SUBQ 16(AX), SI
+ SHLQ $0x02, SI
+ MOVQ SI, 40(AX)
RET
-#undef off
-#undef buffer
-#undef table
+// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
+ XORQ DX, DX
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
+ // Preload values
+ MOVQ ctx+0(FP), CX
+ MOVBQZX 8(CX), DI
+ MOVQ 16(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 24(CX), R9
+ MOVQ 32(CX), R10
+ MOVQ (CX), R11
-#undef br0
-#undef br1
-#undef br2
-#undef br3
+ // Main loop
+main_loop:
+ MOVQ BX, R8
+ CMPQ R8, SI
+ SETGE DL
+
+ // br0.fillFast32()
+ MOVQ 32(R11), R12
+ MOVBQZX 40(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill0
+ MOVQ 24(R11), R14
+ SUBQ $0x20, R13
+ SUBQ $0x04, R14
+ MOVQ (R11), R15
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R14)(R15*1), R15
+ MOVQ R13, CX
+ SHLQ CL, R15
+ MOVQ R14, 24(R11)
+ ORQ R15, R12
+
+ // exhausted = exhausted || (br0.off < 4)
+ CMPQ R14, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill0:
+ // val0 := br0.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v0 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br0.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val1 := br0.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v1 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br0.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
+ BSWAPL AX
+
+ // val2 := br0.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v2 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br0.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val3 := br0.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v3 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br0.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (R8)
+
+ // update the bitreader structure
+ MOVQ R12, 32(R11)
+ MOVB R13, 40(R11)
+ ADDQ R9, R8
+
+ // br1.fillFast32()
+ MOVQ 80(R11), R12
+ MOVBQZX 88(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill1
+ MOVQ 72(R11), R14
+ SUBQ $0x20, R13
+ SUBQ $0x04, R14
+ MOVQ 48(R11), R15
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R14)(R15*1), R15
+ MOVQ R13, CX
+ SHLQ CL, R15
+ MOVQ R14, 72(R11)
+ ORQ R15, R12
+
+ // exhausted = exhausted || (br1.off < 4)
+ CMPQ R14, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill1:
+ // val0 := br1.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v0 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br1.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val1 := br1.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v1 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br1.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
+ BSWAPL AX
+
+ // val2 := br1.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v2 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br1.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val3 := br1.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v3 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br1.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (R8)
+
+ // update the bitreader structure
+ MOVQ R12, 80(R11)
+ MOVB R13, 88(R11)
+ ADDQ R9, R8
+
+ // br2.fillFast32()
+ MOVQ 128(R11), R12
+ MOVBQZX 136(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill2
+ MOVQ 120(R11), R14
+ SUBQ $0x20, R13
+ SUBQ $0x04, R14
+ MOVQ 96(R11), R15
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R14)(R15*1), R15
+ MOVQ R13, CX
+ SHLQ CL, R15
+ MOVQ R14, 120(R11)
+ ORQ R15, R12
+
+ // exhausted = exhausted || (br2.off < 4)
+ CMPQ R14, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill2:
+ // val0 := br2.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v0 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br2.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val1 := br2.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v1 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br2.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
+ BSWAPL AX
+
+ // val2 := br2.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v2 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br2.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val3 := br2.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v3 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br2.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (R8)
+
+ // update the bitreader structure
+ MOVQ R12, 128(R11)
+ MOVB R13, 136(R11)
+ ADDQ R9, R8
+
+ // br3.fillFast32()
+ MOVQ 176(R11), R12
+ MOVBQZX 184(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill3
+ MOVQ 168(R11), R14
+ SUBQ $0x20, R13
+ SUBQ $0x04, R14
+ MOVQ 144(R11), R15
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R14)(R15*1), R15
+ MOVQ R13, CX
+ SHLQ CL, R15
+ MOVQ R14, 168(R11)
+ ORQ R15, R12
+
+ // exhausted = exhausted || (br3.off < 4)
+ CMPQ R14, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill3:
+ // val0 := br3.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v0 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br3.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val1 := br3.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v1 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br3.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
+ BSWAPL AX
+
+ // val2 := br3.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v2 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br3.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val3 := br3.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v3 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br3.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (R8)
+
+ // update the bitreader structure
+ MOVQ R12, 176(R11)
+ MOVB R13, 184(R11)
+ ADDQ $0x04, BX
+ TESTB DL, DL
+ JZ main_loop
+ MOVQ ctx+0(FP), AX
+ SUBQ 16(AX), BX
+ SHLQ $0x02, BX
+ MOVQ BX, 40(AX)
+ RET
+
+// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
+TEXT ·decompress1x_main_loop_amd64(SB), $0-8
+ MOVQ ctx+0(FP), CX
+ MOVQ 16(CX), DX
+ MOVQ 24(CX), BX
+ CMPQ BX, $0x04
+ JB error_max_decoded_size_exeeded
+ LEAQ (DX)(BX*1), BX
+ MOVQ (CX), SI
+ MOVQ (SI), R8
+ MOVQ 24(SI), R9
+ MOVQ 32(SI), R10
+ MOVBQZX 40(SI), R11
+ MOVQ 32(CX), SI
+ MOVBQZX 8(CX), DI
+ JMP loop_condition
+
+main_loop:
+ // Check if we have room for 4 bytes in the output buffer
+ LEAQ 4(DX), CX
+ CMPQ CX, BX
+ JGE error_max_decoded_size_exeeded
+
+ // Decode 4 values
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_1_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), R12
+ MOVQ R11, CX
+ SHLQ CL, R12
+ ORQ R12, R10
+
+bitReader_fillFast_1_end:
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ BSWAPL AX
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_2_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), R12
+ MOVQ R11, CX
+ SHLQ CL, R12
+ ORQ R12, R10
+
+bitReader_fillFast_2_end:
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ BSWAPL AX
+
+ // Store the decoded values
+ MOVL AX, (DX)
+ ADDQ $0x04, DX
+
+loop_condition:
+ CMPQ R9, $0x08
+ JGE main_loop
+
+ // Update ctx structure
+ MOVQ ctx+0(FP), AX
+ SUBQ 16(AX), DX
+ MOVQ DX, 40(AX)
+ MOVQ (AX), AX
+ MOVQ R9, 24(AX)
+ MOVQ R10, 32(AX)
+ MOVB R11, 40(AX)
+ RET
+
+ // Report error
+error_max_decoded_size_exeeded:
+ MOVQ ctx+0(FP), AX
+ MOVQ $-1, CX
+ MOVQ CX, 40(AX)
+ RET
+
+// func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
+// Requires: BMI2
+TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
+ MOVQ ctx+0(FP), CX
+ MOVQ 16(CX), DX
+ MOVQ 24(CX), BX
+ CMPQ BX, $0x04
+ JB error_max_decoded_size_exeeded
+ LEAQ (DX)(BX*1), BX
+ MOVQ (CX), SI
+ MOVQ (SI), R8
+ MOVQ 24(SI), R9
+ MOVQ 32(SI), R10
+ MOVBQZX 40(SI), R11
+ MOVQ 32(CX), SI
+ MOVBQZX 8(CX), DI
+ JMP loop_condition
+
+main_loop:
+ // Check if we have room for 4 bytes in the output buffer
+ LEAQ 4(DX), CX
+ CMPQ CX, BX
+ JGE error_max_decoded_size_exeeded
+
+ // Decode 4 values
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_1_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), CX
+ SHLXQ R11, CX, CX
+ ORQ CX, R10
+
+bitReader_fillFast_1_end:
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ BSWAPL AX
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_2_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), CX
+ SHLXQ R11, CX, CX
+ ORQ CX, R10
+
+bitReader_fillFast_2_end:
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ BSWAPL AX
+
+ // Store the decoded values
+ MOVL AX, (DX)
+ ADDQ $0x04, DX
+
+loop_condition:
+ CMPQ R9, $0x08
+ JGE main_loop
+
+ // Update ctx structure
+ MOVQ ctx+0(FP), AX
+ SUBQ 16(AX), DX
+ MOVQ DX, 40(AX)
+ MOVQ (AX), AX
+ MOVQ R9, 24(AX)
+ MOVQ R10, 32(AX)
+ MOVB R11, 40(AX)
+ RET
+
+ // Report error
+error_max_decoded_size_exeeded:
+ MOVQ ctx+0(FP), AX
+ MOVQ $-1, CX
+ MOVQ CX, 40(AX)
+ RET
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
deleted file mode 100644
index 330d86ae15..0000000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
+++ /dev/null
@@ -1,195 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#ifdef GOAMD64_v4
-#ifndef GOAMD64_v3
-#define GOAMD64_v3
-#endif
-#endif
-
-#define bufoff 256 // see decompress.go, we're using [4][256]byte table
-
-//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
-#define off R8
-#define buffer DI
-#define table SI
-
-#define br_bits_read R9
-#define br_value R10
-#define br_offset R11
-#define peek_bits R12
-#define exhausted DX
-
-#define br0 R13
-#define br1 R14
-#define br2 R15
-#define br3 BP
-
- MOVQ BP, 0(SP)
-
- XORQ exhausted, exhausted // exhausted = false
- XORQ off, off // off = 0
-
- MOVBQZX peekBits+32(FP), peek_bits
- MOVQ buf+40(FP), buffer
- MOVQ tbl+48(FP), table
-
- MOVQ pbr0+0(FP), br0
- MOVQ pbr1+8(FP), br1
- MOVQ pbr2+16(FP), br2
- MOVQ pbr3+24(FP), br3
-
-main_loop:
-{{ define "decode_2_values_x86" }}
- // const stream = {{ var "id" }}
- // br{{ var "id"}}.fillFast()
- MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
- MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value
- MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset
-
- // We must have at least 2 * max tablelog left
- CMPQ br_bits_read, $64-22
- JBE skip_fill{{ var "id" }}
-
- SUBQ $32, br_bits_read // b.bitsRead -= 32
- SUBQ $4, br_offset // b.off -= 4
-
- // v := b.in[b.off-4 : b.off]
- // v = v[:4]
- // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- MOVQ bitReaderShifted_in(br{{ var "id" }}), AX
-
- // b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
- SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-#else
- MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
- MOVQ br_bits_read, CX
- SHLQ CL, AX
-#endif
-
- ORQ AX, br_value
-
- // exhausted = exhausted || (br{{ var "id"}}.off < 4)
- CMPQ br_offset, $4
- SETLT DL
- ORB DL, DH
- // }
-skip_fill{{ var "id" }}:
-
- // val0 := br{{ var "id"}}.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-#else
- MOVQ br_value, AX
- MOVQ peek_bits, CX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-#endif
-
- // v0 := table[val0&mask]
- MOVW 0(table)(AX*2), AX // AX - v0
-
- // br{{ var "id"}}.advance(uint8(v0.entry))
- MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
-
-
-#ifdef GOAMD64_v3
- SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-#else
- // val1 := br{{ var "id"}}.peekTopBits(peekBits)
- MOVQ peek_bits, CX
- MOVQ br_value, AX
- SHRQ CL, AX // AX = (value >> peek_bits) & mask
-#endif
-
- // v1 := table[val1&mask]
- MOVW 0(table)(AX*2), AX // AX - v1
-
- // br{{ var "id"}}.advance(uint8(v1.entry))
- MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
- MOVBQZX AL, CX
- SHLXQ AX, br_value, br_value // value <<= n
-#else
- MOVBQZX AL, CX
- SHLQ CL, br_value // value <<= n
-#endif
-
- ADDQ CX, br_bits_read // bits_read += n
-
-
- // these two writes get coalesced
- // buf[stream][off] = uint8(v0.entry >> 8)
- // buf[stream][off+1] = uint8(v1.entry >> 8)
- MOVW BX, {{ var "bufofs" }}(buffer)(off*1)
-
- // update the bitrader reader structure
- MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
- MOVQ br_value, bitReaderShifted_value(br{{ var "id" }})
- MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }})
-{{ end }}
-
- {{ set "id" "0" }}
- {{ set "ofs" "0" }}
- {{ set "bufofs" "0" }} {{/* id * bufoff */}}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "1" }}
- {{ set "ofs" "8" }}
- {{ set "bufofs" "256" }}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "2" }}
- {{ set "ofs" "16" }}
- {{ set "bufofs" "512" }}
- {{ template "decode_2_values_x86" . }}
-
- {{ set "id" "3" }}
- {{ set "ofs" "24" }}
- {{ set "bufofs" "768" }}
- {{ template "decode_2_values_x86" . }}
-
- ADDQ $2, off // off += 2
-
- TESTB DH, DH // any br[i].ofs < 4?
- JNZ end
-
- CMPQ off, $bufoff
- JL main_loop
-end:
- MOVQ 0(SP), BP
-
- MOVB off, ret+56(FP)
- RET
-#undef off
-#undef buffer
-#undef table
-
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
-
-#undef br0
-#undef br1
-#undef br2
-#undef br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
index 126b4d68a9..4f6f37cb2c 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
@@ -191,3 +191,105 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
}
return dst, nil
}
+
+// Decompress1X will decompress a 1X encoded stream.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
+ if len(d.dt.single) == 0 {
+ return nil, errors.New("no table loaded")
+ }
+ if use8BitTables && d.actualTableLog <= 8 {
+ return d.decompress1X8Bit(dst, src)
+ }
+ var br bitReaderShifted
+ err := br.init(src)
+ if err != nil {
+ return dst, err
+ }
+ maxDecodedSize := cap(dst)
+ dst = dst[:0]
+
+ // Avoid bounds check by always having full sized table.
+ const tlSize = 1 << tableLogMax
+ const tlMask = tlSize - 1
+ dt := d.dt.single[:tlSize]
+
+ // Use temp table to avoid bound checks/append penalty.
+ bufs := d.buffer()
+ buf := &bufs[0]
+ var off uint8
+
+ for br.off >= 8 {
+ br.fillFast()
+ v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+0] = uint8(v.entry >> 8)
+
+ v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+1] = uint8(v.entry >> 8)
+
+ // Refill
+ br.fillFast()
+
+ v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+2] = uint8(v.entry >> 8)
+
+ v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+3] = uint8(v.entry >> 8)
+
+ off += 4
+ if off == 0 {
+ if len(dst)+256 > maxDecodedSize {
+ br.close()
+ d.bufs.Put(bufs)
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:]...)
+ }
+ }
+
+ if len(dst)+int(off) > maxDecodedSize {
+ d.bufs.Put(bufs)
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:off]...)
+
+ // br < 8, so uint8 is fine
+ bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
+ for bitsLeft > 0 {
+ br.fill()
+ if false && br.bitsRead >= 32 {
+ if br.off >= 4 {
+ v := br.in[br.off-4:]
+ v = v[:4]
+ low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ br.value = (br.value << 32) | uint64(low)
+ br.bitsRead -= 32
+ br.off -= 4
+ } else {
+ for br.off > 0 {
+ br.value = (br.value << 8) | uint64(br.in[br.off-1])
+ br.bitsRead -= 8
+ br.off--
+ }
+ }
+ }
+ if len(dst) >= maxDecodedSize {
+ d.bufs.Put(bufs)
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
+ nBits := uint8(v.entry)
+ br.advance(nBits)
+ bitsLeft -= nBits
+ dst = append(dst, uint8(v.entry>>8))
+ }
+ d.bufs.Put(bufs)
+ return dst, br.close()
+}
diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go
new file mode 100644
index 0000000000..3954c51219
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go
@@ -0,0 +1,34 @@
+// Package cpuinfo gives runtime info about the current CPU.
+//
+// This is a very limited module meant for use internally
+// in this project. For more versatile solution check
+// https://github.com/klauspost/cpuid.
+package cpuinfo
+
+// HasBMI1 checks whether an x86 CPU supports the BMI1 extension.
+func HasBMI1() bool {
+ return hasBMI1
+}
+
+// HasBMI2 checks whether an x86 CPU supports the BMI2 extension.
+func HasBMI2() bool {
+ return hasBMI2
+}
+
+// DisableBMI2 will disable BMI2, for testing purposes.
+// Call returned function to restore previous state.
+func DisableBMI2() func() {
+ old := hasBMI2
+ hasBMI2 = false
+ return func() {
+ hasBMI2 = old
+ }
+}
+
+// HasBMI checks whether an x86 CPU supports both BMI1 and BMI2 extensions.
+func HasBMI() bool {
+ return HasBMI1() && HasBMI2()
+}
+
+var hasBMI1 bool
+var hasBMI2 bool
diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go
new file mode 100644
index 0000000000..e802579c4f
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go
@@ -0,0 +1,11 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+package cpuinfo
+
+// go:noescape
+func x86extensions() (bmi1, bmi2 bool)
+
+func init() {
+ hasBMI1, hasBMI2 = x86extensions()
+}
diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s
new file mode 100644
index 0000000000..4465fbe9e9
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s
@@ -0,0 +1,36 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+TEXT ·x86extensions(SB), NOSPLIT, $0
+ // 1. determine max EAX value
+ XORQ AX, AX
+ CPUID
+
+ CMPQ AX, $7
+ JB unsupported
+
+ // 2. EAX = 7, ECX = 0 --- see Table 3-8 "Information Returned by CPUID Instruction"
+ MOVQ $7, AX
+ MOVQ $0, CX
+ CPUID
+
+ BTQ $3, BX // bit 3 = BMI1
+ SETCS AL
+
+ BTQ $8, BX // bit 8 = BMI2
+ SETCS AH
+
+ MOVB AL, bmi1+0(FP)
+ MOVB AH, bmi2+1(FP)
+ RET
+
+unsupported:
+ XORQ AX, AX
+ MOVB AL, bmi1+0(FP)
+ MOVB AL, bmi2+1(FP)
+ RET
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md
index e3445ac194..beb7fa8720 100644
--- a/vendor/github.com/klauspost/compress/zstd/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/README.md
@@ -386,47 +386,31 @@ In practice this means that concurrency is often limited to utilizing about 3 co
### Benchmarks
-These are some examples of performance compared to [datadog cgo library](https://github.com/DataDog/zstd).
-
The first two are streaming decodes and the last are smaller inputs.
-
+
+Running on AMD Ryzen 9 3950X 16-Core Processor. AMD64 assembly used.
+
```
-BenchmarkDecoderSilesia-8 3 385000067 ns/op 550.51 MB/s 5498 B/op 8 allocs/op
-BenchmarkDecoderSilesiaCgo-8 6 197666567 ns/op 1072.25 MB/s 270672 B/op 8 allocs/op
+BenchmarkDecoderSilesia-32 5 206878840 ns/op 1024.50 MB/s 49808 B/op 43 allocs/op
+BenchmarkDecoderEnwik9-32 1 1271809000 ns/op 786.28 MB/s 72048 B/op 52 allocs/op
-BenchmarkDecoderEnwik9-8 1 2027001600 ns/op 493.34 MB/s 10496 B/op 18 allocs/op
-BenchmarkDecoderEnwik9Cgo-8 2 979499200 ns/op 1020.93 MB/s 270672 B/op 8 allocs/op
+Concurrent blocks, performance:
-Concurrent performance:
-
-BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16 28915 42469 ns/op 4340.07 MB/s 114 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16 116505 9965 ns/op 11900.16 MB/s 16 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16 8952 134272 ns/op 3588.70 MB/s 915 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16 11820 102538 ns/op 4161.90 MB/s 594 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16 34782 34184 ns/op 3661.88 MB/s 60 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16 27712 43447 ns/op 3500.58 MB/s 99 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16 62826 18750 ns/op 21845.10 MB/s 104 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16 631545 1794 ns/op 57078.74 MB/s 2 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16 1690140 712 ns/op 172938.13 MB/s 1 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16 10432 113593 ns/op 6180.73 MB/s 1143 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/html.zst-16 113206 10671 ns/op 9596.27 MB/s 15 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16 1530615 779 ns/op 5229.49 MB/s 0 B/op 0 allocs/op
-
-BenchmarkDecoder_DecodeAllParallelCgo/kppkn.gtb.zst-16 65217 16192 ns/op 11383.34 MB/s 46 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/geo.protodata.zst-16 292671 4039 ns/op 29363.19 MB/s 6 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/plrabn12.txt.zst-16 26314 46021 ns/op 10470.43 MB/s 293 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/lcet10.txt.zst-16 33897 34900 ns/op 12227.96 MB/s 205 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/asyoulik.txt.zst-16 104348 11433 ns/op 10949.01 MB/s 20 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/alice29.txt.zst-16 75949 15510 ns/op 9805.60 MB/s 32 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/html_x_4.zst-16 173910 6756 ns/op 60624.29 MB/s 37 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/paper-100k.pdf.zst-16 923076 1339 ns/op 76474.87 MB/s 1 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/fireworks.jpeg.zst-16 922920 1351 ns/op 91102.57 MB/s 2 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/urls.10K.zst-16 27649 43618 ns/op 16096.19 MB/s 407 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/html.zst-16 279073 4160 ns/op 24614.18 MB/s 6 B/op 0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/comp-data.bin.zst-16 749938 1579 ns/op 2581.71 MB/s 0 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32 67356 17857 ns/op 10321.96 MB/s 22.48 pct 102 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32 266656 4421 ns/op 26823.21 MB/s 11.89 pct 19 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32 20992 56842 ns/op 8477.17 MB/s 39.90 pct 754 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32 27456 43932 ns/op 9714.01 MB/s 33.27 pct 524 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32 78432 15047 ns/op 8319.15 MB/s 40.34 pct 66 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32 65800 18436 ns/op 8249.63 MB/s 37.75 pct 88 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32 102993 11523 ns/op 35546.09 MB/s 3.637 pct 143 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32 1000000 1070 ns/op 95720.98 MB/s 80.53 pct 3 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32 749802 1752 ns/op 70272.35 MB/s 100.0 pct 5 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32 22640 52934 ns/op 13263.37 MB/s 26.25 pct 1014 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html.zst-32 226412 5232 ns/op 19572.27 MB/s 14.49 pct 20 B/op 0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32 923041 1276 ns/op 3194.71 MB/s 31.26 pct 0 B/op 0 allocs/op
```
-This reflects the performance around May 2020, but this may be out of date.
+This reflects the performance around May 2022, but this may be out of date.
## Zstd inside ZIP files
diff --git a/vendor/github.com/klauspost/compress/zstd/bitreader.go b/vendor/github.com/klauspost/compress/zstd/bitreader.go
index d7cd15ba29..97299d499c 100644
--- a/vendor/github.com/klauspost/compress/zstd/bitreader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitreader.go
@@ -63,13 +63,6 @@ func (b *bitReader) get32BitsFast(n uint8) uint32 {
return v
}
-func (b *bitReader) get16BitsFast(n uint8) uint16 {
- const regMask = 64 - 1
- v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
- b.bitsRead += n
- return v
-}
-
// fillFast() will make sure at least 32 bits are available.
// There must be at least 4 bytes available.
func (b *bitReader) fillFast() {
diff --git a/vendor/github.com/klauspost/compress/zstd/bitwriter.go b/vendor/github.com/klauspost/compress/zstd/bitwriter.go
index b366182850..78b3c61be3 100644
--- a/vendor/github.com/klauspost/compress/zstd/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitwriter.go
@@ -5,8 +5,6 @@
package zstd
-import "fmt"
-
// bitWriter will write bits.
// First bit will be LSB of the first byte of output.
type bitWriter struct {
@@ -73,80 +71,6 @@ func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
b.nBits += bits
}
-// flush will flush all pending full bytes.
-// There will be at least 56 bits available for writing when this has been called.
-// Using flush32 is faster, but leaves less space for writing.
-func (b *bitWriter) flush() {
- v := b.nBits >> 3
- switch v {
- case 0:
- case 1:
- b.out = append(b.out,
- byte(b.bitContainer),
- )
- case 2:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- )
- case 3:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- )
- case 4:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- )
- case 5:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- )
- case 6:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- )
- case 7:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- byte(b.bitContainer>>48),
- )
- case 8:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- byte(b.bitContainer>>48),
- byte(b.bitContainer>>56),
- )
- default:
- panic(fmt.Errorf("bits (%d) > 64", b.nBits))
- }
- b.bitContainer >>= v << 3
- b.nBits &= 7
-}
-
// flush32 will flush out, so there are at least 32 bits available for writing.
func (b *bitWriter) flush32() {
if b.nBits < 32 {
diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go
index 7d567a54a0..7eed729be2 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@@ -5,9 +5,14 @@
package zstd
import (
+ "bytes"
+ "encoding/binary"
"errors"
"fmt"
"io"
+ "io/ioutil"
+ "os"
+ "path/filepath"
"sync"
"github.com/klauspost/compress/huff0"
@@ -38,14 +43,14 @@ const (
// maxCompressedBlockSize is the biggest allowed compressed block size (128KB)
maxCompressedBlockSize = 128 << 10
+ compressedBlockOverAlloc = 16
+ maxCompressedBlockSizeAlloc = 128<<10 + compressedBlockOverAlloc
+
// Maximum possible block size (all Raw+Uncompressed).
maxBlockSize = (1 << 21) - 1
- // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#literals_section_header
- maxCompressedLiteralSize = 1 << 18
- maxRLELiteralSize = 1 << 20
- maxMatchLen = 131074
- maxSequences = 0x7f00 + 0xffff
+ maxMatchLen = 131074
+ maxSequences = 0x7f00 + 0xffff
// We support slightly less than the reference decoder to be able to
// use ints on 32 bit archs.
@@ -97,7 +102,6 @@ type blockDec struct {
// Block is RLE, this is the size.
RLESize uint32
- tmp [4]byte
Type blockType
@@ -136,7 +140,7 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
b.Type = blockType((bh >> 1) & 3)
// find size.
cSize := int(bh >> 3)
- maxSize := maxBlockSize
+ maxSize := maxCompressedBlockSizeAlloc
switch b.Type {
case blockTypeReserved:
return ErrReservedBlockType
@@ -157,9 +161,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
println("Data size on stream:", cSize)
}
b.RLESize = 0
- maxSize = maxCompressedBlockSize
+ maxSize = maxCompressedBlockSizeAlloc
if windowSize < maxCompressedBlockSize && b.lowMem {
- maxSize = int(windowSize)
+ maxSize = int(windowSize) + compressedBlockOverAlloc
}
if cSize > maxCompressedBlockSize || uint64(cSize) > b.WindowSize {
if debugDecoder {
@@ -190,9 +194,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
// Read block data.
if cap(b.dataStorage) < cSize {
if b.lowMem || cSize > maxCompressedBlockSize {
- b.dataStorage = make([]byte, 0, cSize)
+ b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc)
} else {
- b.dataStorage = make([]byte, 0, maxCompressedBlockSize)
+ b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc)
}
}
if cap(b.dst) <= maxSize {
@@ -360,14 +364,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
}
if cap(b.literalBuf) < litRegenSize {
if b.lowMem {
- b.literalBuf = make([]byte, litRegenSize)
+ b.literalBuf = make([]byte, litRegenSize, litRegenSize+compressedBlockOverAlloc)
} else {
- if litRegenSize > maxCompressedLiteralSize {
- // Exceptional
- b.literalBuf = make([]byte, litRegenSize)
- } else {
- b.literalBuf = make([]byte, litRegenSize, maxCompressedLiteralSize)
- }
+ b.literalBuf = make([]byte, litRegenSize, maxCompressedBlockSize+compressedBlockOverAlloc)
}
}
literals = b.literalBuf[:litRegenSize]
@@ -397,14 +396,14 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
// Ensure we have space to store it.
if cap(b.literalBuf) < litRegenSize {
if b.lowMem {
- b.literalBuf = make([]byte, 0, litRegenSize)
+ b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
} else {
- b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
+ b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
}
}
var err error
// Use our out buffer.
- huff.MaxDecodedSize = maxCompressedBlockSize
+ huff.MaxDecodedSize = litRegenSize
if fourStreams {
literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
} else {
@@ -429,9 +428,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
// Ensure we have space to store it.
if cap(b.literalBuf) < litRegenSize {
if b.lowMem {
- b.literalBuf = make([]byte, 0, litRegenSize)
+ b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
} else {
- b.literalBuf = make([]byte, 0, maxCompressedBlockSize)
+ b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
}
}
huff := hist.huffTree
@@ -448,7 +447,7 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
return in, err
}
hist.huffTree = huff
- huff.MaxDecodedSize = maxCompressedBlockSize
+ huff.MaxDecodedSize = litRegenSize
// Use our out buffer.
if fourStreams {
literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
@@ -463,6 +462,8 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
if len(literals) != litRegenSize {
return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
}
+ // Re-cap to get extra size.
+ literals = b.literalBuf[:len(literals)]
if debugDecoder {
printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize)
}
@@ -486,10 +487,15 @@ func (b *blockDec) decodeCompressed(hist *history) error {
b.dst = append(b.dst, hist.decoders.literals...)
return nil
}
- err = hist.decoders.decodeSync(hist)
+ before := len(hist.decoders.out)
+ err = hist.decoders.decodeSync(hist.b[hist.ignoreBuffer:])
if err != nil {
return err
}
+ if hist.decoders.maxSyncLen > 0 {
+ hist.decoders.maxSyncLen += uint64(before)
+ hist.decoders.maxSyncLen -= uint64(len(hist.decoders.out))
+ }
b.dst = hist.decoders.out
hist.recentOffsets = hist.decoders.prevOffset
return nil
@@ -632,6 +638,22 @@ func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
println("initializing sequences:", err)
return err
}
+ // Extract blocks...
+ if false && hist.dict == nil {
+ fatalErr := func(err error) {
+ if err != nil {
+ panic(err)
+ }
+ }
+ fn := fmt.Sprintf("n-%d-lits-%d-prev-%d-%d-%d-win-%d.blk", hist.decoders.nSeqs, len(hist.decoders.literals), hist.recentOffsets[0], hist.recentOffsets[1], hist.recentOffsets[2], hist.windowSize)
+ var buf bytes.Buffer
+ fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.litLengths.fse))
+ fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse))
+ fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse))
+ buf.Write(in)
+ ioutil.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
+ }
+
return nil
}
@@ -650,6 +672,7 @@ func (b *blockDec) decodeSequences(hist *history) error {
}
hist.decoders.windowSize = hist.windowSize
hist.decoders.prevOffset = hist.recentOffsets
+
err := hist.decoders.decode(b.sequence)
hist.recentOffsets = hist.decoders.prevOffset
return err
diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
index b80191e4b1..2ad02070d7 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
@@ -23,7 +23,7 @@ type byteBuffer interface {
readByte() (byte, error)
// Skip n bytes.
- skipN(n int) error
+ skipN(n int64) error
}
// in-memory buffer
@@ -52,10 +52,6 @@ func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) {
return r, nil
}
-func (b *byteBuf) remain() []byte {
- return *b
-}
-
func (b *byteBuf) readByte() (byte, error) {
bb := *b
if len(bb) < 1 {
@@ -66,9 +62,12 @@ func (b *byteBuf) readByte() (byte, error) {
return r, nil
}
-func (b *byteBuf) skipN(n int) error {
+func (b *byteBuf) skipN(n int64) error {
bb := *b
- if len(bb) < n {
+ if n < 0 {
+ return fmt.Errorf("negative skip (%d) requested", n)
+ }
+ if int64(len(bb)) < n {
return io.ErrUnexpectedEOF
}
*b = bb[n:]
@@ -124,9 +123,9 @@ func (r *readerWrapper) readByte() (byte, error) {
return r.tmp[0], nil
}
-func (r *readerWrapper) skipN(n int) error {
- n2, err := io.CopyN(ioutil.Discard, r.r, int64(n))
- if n2 != int64(n) {
+func (r *readerWrapper) skipN(n int64) error {
+ n2, err := io.CopyN(ioutil.Discard, r.r, n)
+ if n2 != n {
err = io.ErrUnexpectedEOF
}
return err
diff --git a/vendor/github.com/klauspost/compress/zstd/bytereader.go b/vendor/github.com/klauspost/compress/zstd/bytereader.go
index 2c4fca17fa..0e59a242d8 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytereader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytereader.go
@@ -13,12 +13,6 @@ type byteReader struct {
off int
}
-// init will initialize the reader and set the input.
-func (b *byteReader) init(in []byte) {
- b.b = in
- b.off = 0
-}
-
// advance the stream b n bytes.
func (b *byteReader) advance(n uint) {
b.off += int(n)
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index 9fcdaac1dc..d212f4737f 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -347,18 +347,23 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
}
frame.history.setDict(&dict)
}
-
- if frame.FrameContentSize != fcsUnknown && frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
- return dst, ErrDecoderSizeExceeded
+ if frame.WindowSize > d.o.maxWindowSize {
+ if debugDecoder {
+ println("window size exceeded:", frame.WindowSize, ">", d.o.maxWindowSize)
+ }
+ return dst, ErrWindowSizeExceeded
}
- if frame.FrameContentSize < 1<<30 {
- // Never preallocate more than 1 GB up front.
+ if frame.FrameContentSize != fcsUnknown {
+ if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
+ return dst, ErrDecoderSizeExceeded
+ }
if cap(dst)-len(dst) < int(frame.FrameContentSize) {
- dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize))
+ dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize)+compressedBlockOverAlloc)
copy(dst2, dst)
dst = dst2
}
}
+
if cap(dst) == 0 {
// Allocate len(input) * 2 by default if nothing is provided
// and we didn't get frame content size.
@@ -437,7 +442,7 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
}
- if len(next.b) > 0 {
+ if !d.o.ignoreChecksum && len(next.b) > 0 {
n, err := d.current.crc.Write(next.b)
if err == nil {
if n != len(next.b) {
@@ -449,7 +454,7 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
got := d.current.crc.Sum64()
var tmp [4]byte
binary.LittleEndian.PutUint32(tmp[:], uint32(got))
- if !bytes.Equal(tmp[:], next.d.checkCRC) && !ignoreCRC {
+ if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) {
if debugDecoder {
println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)")
}
@@ -533,9 +538,15 @@ func (d *Decoder) nextBlockSync() (ok bool) {
// Update/Check CRC
if d.frame.HasCheckSum {
- d.frame.crc.Write(d.current.b)
+ if !d.o.ignoreChecksum {
+ d.frame.crc.Write(d.current.b)
+ }
if d.current.d.Last {
- d.current.err = d.frame.checkCRC()
+ if !d.o.ignoreChecksum {
+ d.current.err = d.frame.checkCRC()
+ } else {
+ d.current.err = d.frame.consumeCRC()
+ }
if d.current.err != nil {
println("CRC error:", d.current.err)
return false
@@ -629,60 +640,18 @@ func (d *Decoder) startSyncDecoder(r io.Reader) error {
// Create Decoder:
// ASYNC:
-// Spawn 4 go routines.
-// 0: Read frames and decode blocks.
-// 1: Decode block and literals. Receives hufftree and seqdecs, returns seqdecs and huff tree.
-// 2: Wait for recentOffsets if needed. Decode sequences, send recentOffsets.
-// 3: Wait for stream history, execute sequences, send stream history.
+// Spawn 3 go routines.
+// 0: Read frames and decode block literals.
+// 1: Decode sequences.
+// 2: Execute sequences, send to output.
func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output chan decodeOutput) {
defer d.streamWg.Done()
br := readerWrapper{r: r}
- var seqPrepare = make(chan *blockDec, d.o.concurrent)
var seqDecode = make(chan *blockDec, d.o.concurrent)
var seqExecute = make(chan *blockDec, d.o.concurrent)
- // Async 1: Prepare blocks...
- go func() {
- var hist history
- var hasErr bool
- for block := range seqPrepare {
- if hasErr {
- if block != nil {
- seqDecode <- block
- }
- continue
- }
- if block.async.newHist != nil {
- if debugDecoder {
- println("Async 1: new history")
- }
- hist.reset()
- if block.async.newHist.dict != nil {
- hist.setDict(block.async.newHist.dict)
- }
- }
- if block.err != nil || block.Type != blockTypeCompressed {
- hasErr = block.err != nil
- seqDecode <- block
- continue
- }
-
- remain, err := block.decodeLiterals(block.data, &hist)
- block.err = err
- hasErr = block.err != nil
- if err == nil {
- block.async.literals = hist.decoders.literals
- block.async.seqData = remain
- } else if debugDecoder {
- println("decodeLiterals error:", err)
- }
- seqDecode <- block
- }
- close(seqDecode)
- }()
-
- // Async 2: Decode sequences...
+ // Async 1: Decode sequences...
go func() {
var hist history
var hasErr bool
@@ -696,7 +665,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
}
if block.async.newHist != nil {
if debugDecoder {
- println("Async 2: new history, recent:", block.async.newHist.recentOffsets)
+ println("Async 1: new history, recent:", block.async.newHist.recentOffsets)
}
hist.decoders = block.async.newHist.decoders
hist.recentOffsets = block.async.newHist.recentOffsets
@@ -750,7 +719,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
}
if block.async.newHist != nil {
if debugDecoder {
- println("Async 3: new history")
+ println("Async 2: new history")
}
hist.windowSize = block.async.newHist.windowSize
hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer
@@ -837,6 +806,33 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
decodeStream:
for {
+ var hist history
+ var hasErr bool
+
+ decodeBlock := func(block *blockDec) {
+ if hasErr {
+ if block != nil {
+ seqDecode <- block
+ }
+ return
+ }
+ if block.err != nil || block.Type != blockTypeCompressed {
+ hasErr = block.err != nil
+ seqDecode <- block
+ return
+ }
+
+ remain, err := block.decodeLiterals(block.data, &hist)
+ block.err = err
+ hasErr = block.err != nil
+ if err == nil {
+ block.async.literals = hist.decoders.literals
+ block.async.seqData = remain
+ } else if debugDecoder {
+ println("decodeLiterals error:", err)
+ }
+ seqDecode <- block
+ }
frame := d.frame
if debugDecoder {
println("New frame...")
@@ -863,7 +859,7 @@ decodeStream:
case <-ctx.Done():
case dec := <-d.decoders:
dec.sendErr(err)
- seqPrepare <- dec
+ decodeBlock(dec)
}
break decodeStream
}
@@ -883,6 +879,10 @@ decodeStream:
if debugDecoder {
println("Alloc History:", h.allocFrameBuffer)
}
+ hist.reset()
+ if h.dict != nil {
+ hist.setDict(h.dict)
+ }
dec.async.newHist = &h
dec.async.fcs = frame.FrameContentSize
historySent = true
@@ -909,7 +909,7 @@ decodeStream:
}
err = dec.err
last := dec.Last
- seqPrepare <- dec
+ decodeBlock(dec)
if err != nil {
break decodeStream
}
@@ -918,7 +918,7 @@ decodeStream:
}
}
}
- close(seqPrepare)
+ close(seqDecode)
wg.Wait()
d.frame.history.b = frameHistCache
}
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
index fd05c9bb01..c70e6fa0f7 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
@@ -19,6 +19,7 @@ type decoderOptions struct {
maxDecodedSize uint64
maxWindowSize uint64
dicts []dict
+ ignoreChecksum bool
}
func (o *decoderOptions) setDefault() {
@@ -31,7 +32,7 @@ func (o *decoderOptions) setDefault() {
if o.concurrent > 4 {
o.concurrent = 4
}
- o.maxDecodedSize = 1 << 63
+ o.maxDecodedSize = 64 << 30
}
// WithDecoderLowmem will set whether to use a lower amount of memory,
@@ -66,7 +67,7 @@ func WithDecoderConcurrency(n int) DOption {
// WithDecoderMaxMemory allows to set a maximum decoded size for in-memory
// non-streaming operations or maximum window size for streaming operations.
// This can be used to control memory usage of potentially hostile content.
-// Maximum and default is 1 << 63 bytes.
+// Maximum is 1 << 63 bytes. Default is 64GiB.
func WithDecoderMaxMemory(n uint64) DOption {
return func(o *decoderOptions) error {
if n == 0 {
@@ -112,3 +113,11 @@ func WithDecoderMaxWindow(size uint64) DOption {
return nil
}
}
+
+// IgnoreChecksum allows to forcibly ignore checksum checking.
+func IgnoreChecksum(b bool) DOption {
+ return func(o *decoderOptions) error {
+ o.ignoreChecksum = b
+ return nil
+ }
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_better.go b/vendor/github.com/klauspost/compress/zstd/enc_better.go
index 602c05ee0c..c769f6941d 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_better.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_better.go
@@ -156,8 +156,8 @@ encodeLoop:
panic("offset0 was 0")
}
- nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+ nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
@@ -518,8 +518,8 @@ encodeLoop:
}
// Store this, since we have it.
- nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+ nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
// We have at least 4 byte match.
// No need to check backwards. We come straight from a match
@@ -674,8 +674,8 @@ encodeLoop:
panic("offset0 was 0")
}
- nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+ nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
@@ -1047,8 +1047,8 @@ encodeLoop:
}
// Store this, since we have it.
- nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+ nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
// We have at least 4 byte match.
// No need to check backwards. We come straight from a match
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
index d6b3104240..7ff0c64fa3 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
@@ -127,8 +127,8 @@ encodeLoop:
panic("offset0 was 0")
}
- nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+ nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
@@ -439,8 +439,8 @@ encodeLoop:
var t int32
for {
- nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+ nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
@@ -785,8 +785,8 @@ encodeLoop:
panic("offset0 was 0")
}
- nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+ nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
@@ -969,7 +969,7 @@ encodeLoop:
te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
longHash1 := hashLen(cv0, dFastLongTableBits, dFastLongLen)
- longHash2 := hashLen(cv0, dFastLongTableBits, dFastLongLen)
+ longHash2 := hashLen(cv1, dFastLongTableBits, dFastLongLen)
e.longTable[longHash1] = te0
e.longTable[longHash2] = te1
e.markLongShardDirty(longHash1)
@@ -1002,8 +1002,8 @@ encodeLoop:
}
// Store this, since we have it.
- nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+ nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
// We have at least 4 byte match.
// No need to check backwards. We come straight from a match
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go
index dcc987a7cb..7aaaedb23e 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@@ -528,8 +528,8 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
// If a non-single block is needed the encoder will reset again.
e.encoders <- enc
}()
- // Use single segments when above minimum window and below 1MB.
- single := len(src) < 1<<20 && len(src) > MinWindowSize
+ // Use single segments when above minimum window and below window size.
+ single := len(src) <= e.o.windowSize && len(src) > MinWindowSize
if e.o.single != nil {
single = *e.o.single
}
@@ -551,7 +551,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
}
// If we can do everything in one block, prefer that.
- if len(src) <= maxCompressedBlockSize {
+ if len(src) <= e.o.blockSize {
enc.Reset(e.o.dict, true)
// Slightly faster with no history and everything in one block.
if e.o.crc {
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
index 44d8dbd199..a7c5e1aac4 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
@@ -283,7 +283,7 @@ func WithNoEntropyCompression(b bool) EOption {
// a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range.
// For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB.
// This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations.
-// If this is not specified, block encodes will automatically choose this based on the input size.
+// If this is not specified, block encodes will automatically choose this based on the input size and the window size.
// This setting has no effect on streamed encodes.
func WithSingleSegment(b bool) EOption {
return func(o *encoderOptions) error {
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index 11089d2232..9568a4ba31 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -106,7 +106,7 @@ func (d *frameDec) reset(br byteBuffer) error {
}
n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
println("Skipping frame with", n, "bytes.")
- err = br.skipN(int(n))
+ err = br.skipN(int64(n))
if err != nil {
if debugDecoder {
println("Reading discarded frame", err)
@@ -231,20 +231,27 @@ func (d *frameDec) reset(br byteBuffer) error {
d.crc.Reset()
}
+ if d.WindowSize > d.o.maxWindowSize {
+ if debugDecoder {
+ printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
+ }
+ return ErrWindowSizeExceeded
+ }
+
if d.WindowSize == 0 && d.SingleSegment {
// We may not need window in this case.
d.WindowSize = d.FrameContentSize
if d.WindowSize < MinWindowSize {
d.WindowSize = MinWindowSize
}
+ if d.WindowSize > d.o.maxDecodedSize {
+ if debugDecoder {
+ printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
+ }
+ return ErrDecoderSizeExceeded
+ }
}
- if d.WindowSize > uint64(d.o.maxWindowSize) {
- if debugDecoder {
- printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
- }
- return ErrWindowSizeExceeded
- }
// The minimum Window_Size is 1 KB.
if d.WindowSize < MinWindowSize {
if debugDecoder {
@@ -253,10 +260,11 @@ func (d *frameDec) reset(br byteBuffer) error {
return ErrWindowSizeTooSmall
}
d.history.windowSize = int(d.WindowSize)
- if d.o.lowMem && d.history.windowSize < maxBlockSize {
+ if !d.o.lowMem || d.history.windowSize < maxBlockSize {
+ // Alloc 2x window size if not low-mem, or very small window size.
d.history.allocFrameBuffer = d.history.windowSize * 2
- // TODO: Maybe use FrameContent size
} else {
+ // Alloc with one additional block
d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
}
@@ -290,13 +298,6 @@ func (d *frameDec) checkCRC() error {
if !d.HasCheckSum {
return nil
}
- var tmp [4]byte
- got := d.crc.Sum64()
- // Flip to match file order.
- tmp[0] = byte(got >> 0)
- tmp[1] = byte(got >> 8)
- tmp[2] = byte(got >> 16)
- tmp[3] = byte(got >> 24)
// We can overwrite upper tmp now
want, err := d.rawInput.readSmall(4)
@@ -305,7 +306,19 @@ func (d *frameDec) checkCRC() error {
return err
}
- if !bytes.Equal(tmp[:], want) && !ignoreCRC {
+ if d.o.ignoreChecksum {
+ return nil
+ }
+
+ var tmp [4]byte
+ got := d.crc.Sum64()
+ // Flip to match file order.
+ tmp[0] = byte(got >> 0)
+ tmp[1] = byte(got >> 8)
+ tmp[2] = byte(got >> 16)
+ tmp[3] = byte(got >> 24)
+
+ if !bytes.Equal(tmp[:], want) {
if debugDecoder {
println("CRC Check Failed:", tmp[:], "!=", want)
}
@@ -317,6 +330,19 @@ func (d *frameDec) checkCRC() error {
return nil
}
+// consumeCRC reads the checksum data if the frame has one.
+func (d *frameDec) consumeCRC() error {
+ if d.HasCheckSum {
+ _, err := d.rawInput.readSmall(4)
+ if err != nil {
+ println("CRC missing?", err)
+ return err
+ }
+ }
+
+ return nil
+}
+
// runDecoder will create a sync decoder that will decode a block of data.
func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
saved := d.history.b
@@ -326,6 +352,19 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
d.history.ignoreBuffer = len(dst)
// Store input length, so we only check new data.
crcStart := len(dst)
+ d.history.decoders.maxSyncLen = 0
+ if d.FrameContentSize != fcsUnknown {
+ d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
+ if d.history.decoders.maxSyncLen > d.o.maxDecodedSize {
+ return dst, ErrDecoderSizeExceeded
+ }
+ if uint64(cap(dst)) < d.history.decoders.maxSyncLen {
+ // Alloc for output
+ dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc)
+ copy(dst2, dst)
+ dst = dst2
+ }
+ }
var err error
for {
err = dec.reset(d.rawInput, d.WindowSize)
@@ -360,13 +399,17 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
if d.FrameContentSize != fcsUnknown && uint64(len(d.history.b)-crcStart) != d.FrameContentSize {
err = ErrFrameSizeMismatch
} else if d.HasCheckSum {
- var n int
- n, err = d.crc.Write(dst[crcStart:])
- if err == nil {
- if n != len(dst)-crcStart {
- err = io.ErrShortWrite
- } else {
- err = d.checkCRC()
+ if d.o.ignoreChecksum {
+ err = d.consumeCRC()
+ } else {
+ var n int
+ n, err = d.crc.Write(dst[crcStart:])
+ if err == nil {
+ if n != len(dst)-crcStart {
+ err = io.ErrShortWrite
+ } else {
+ err = d.checkCRC()
+ }
}
}
}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
index bb3d4fd6c3..2f8860a722 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
@@ -5,8 +5,10 @@
package zstd
import (
+ "encoding/binary"
"errors"
"fmt"
+ "io"
)
const (
@@ -178,10 +180,32 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
return fmt.Errorf("corruption detected (total %d != %d)", gotTotal, 1<> 3)
- // println(s.norm[:s.symbolLen], s.symbolLen)
return s.buildDtable()
}
+func (s *fseDecoder) mustReadFrom(r io.Reader) {
+ fatalErr := func(err error) {
+ if err != nil {
+ panic(err)
+ }
+ }
+ // dt [maxTablesize]decSymbol // Decompression table.
+ // symbolLen uint16 // Length of active part of the symbol table.
+ // actualTableLog uint8 // Selected tablelog.
+ // maxBits uint8 // Maximum number of additional bits
+ // // used for table creation to avoid allocations.
+ // stateTable [256]uint16
+ // norm [maxSymbolValue + 1]int16
+ // preDefined bool
+ fatalErr(binary.Read(r, binary.LittleEndian, &s.dt))
+ fatalErr(binary.Read(r, binary.LittleEndian, &s.symbolLen))
+ fatalErr(binary.Read(r, binary.LittleEndian, &s.actualTableLog))
+ fatalErr(binary.Read(r, binary.LittleEndian, &s.maxBits))
+ fatalErr(binary.Read(r, binary.LittleEndian, &s.stateTable))
+ fatalErr(binary.Read(r, binary.LittleEndian, &s.norm))
+ fatalErr(binary.Read(r, binary.LittleEndian, &s.preDefined))
+}
+
// decSymbol contains information about a state entry,
// Including the state offset base, the output symbol and
// the number of bits to read for the low part of the destination state.
@@ -204,18 +228,10 @@ func (d decSymbol) newState() uint16 {
return uint16(d >> 16)
}
-func (d decSymbol) baseline() uint32 {
- return uint32(d >> 32)
-}
-
func (d decSymbol) baselineInt() int {
return int(d >> 32)
}
-func (d *decSymbol) set(nbits, addBits uint8, newState uint16, baseline uint32) {
- *d = decSymbol(nbits) | (decSymbol(addBits) << 8) | (decSymbol(newState) << 16) | (decSymbol(baseline) << 32)
-}
-
func (d *decSymbol) setNBits(nBits uint8) {
const mask = 0xffffffffffffff00
*d = (*d & mask) | decSymbol(nBits)
@@ -231,11 +247,6 @@ func (d *decSymbol) setNewState(state uint16) {
*d = (*d & mask) | decSymbol(state)<<16
}
-func (d *decSymbol) setBaseline(baseline uint32) {
- const mask = 0xffffffff
- *d = (*d & mask) | decSymbol(baseline)<<32
-}
-
func (d *decSymbol) setExt(addBits uint8, baseline uint32) {
const mask = 0xffff00ff
*d = (*d & mask) | (decSymbol(addBits) << 8) | (decSymbol(baseline) << 32)
@@ -257,68 +268,6 @@ func (s *fseDecoder) setRLE(symbol decSymbol) {
s.dt[0] = symbol
}
-// buildDtable will build the decoding table.
-func (s *fseDecoder) buildDtable() error {
- tableSize := uint32(1 << s.actualTableLog)
- highThreshold := tableSize - 1
- symbolNext := s.stateTable[:256]
-
- // Init, lay down lowprob symbols
- {
- for i, v := range s.norm[:s.symbolLen] {
- if v == -1 {
- s.dt[highThreshold].setAddBits(uint8(i))
- highThreshold--
- symbolNext[i] = 1
- } else {
- symbolNext[i] = uint16(v)
- }
- }
- }
- // Spread symbols
- {
- tableMask := tableSize - 1
- step := tableStep(tableSize)
- position := uint32(0)
- for ss, v := range s.norm[:s.symbolLen] {
- for i := 0; i < int(v); i++ {
- s.dt[position].setAddBits(uint8(ss))
- position = (position + step) & tableMask
- for position > highThreshold {
- // lowprob area
- position = (position + step) & tableMask
- }
- }
- }
- if position != 0 {
- // position must reach all cells once, otherwise normalizedCounter is incorrect
- return errors.New("corrupted input (position != 0)")
- }
- }
-
- // Build Decoding table
- {
- tableSize := uint16(1 << s.actualTableLog)
- for u, v := range s.dt[:tableSize] {
- symbol := v.addBits()
- nextState := symbolNext[symbol]
- symbolNext[symbol] = nextState + 1
- nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
- s.dt[u&maxTableMask].setNBits(nBits)
- newState := (nextState << nBits) - tableSize
- if newState > tableSize {
- return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
- }
- if newState == uint16(u) && nBits == 0 {
- // Seems weird that this is possible with nbits > 0.
- return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
- }
- s.dt[u&maxTableMask].setNewState(newState)
- }
- }
- return nil
-}
-
// transform will transform the decoder table into a table usable for
// decoding without having to apply the transformation while decoding.
// The state will contain the base value and the number of bits to read.
@@ -352,34 +301,7 @@ func (s *fseState) init(br *bitReader, tableLog uint8, dt []decSymbol) {
s.state = dt[br.getBits(tableLog)]
}
-// next returns the current symbol and sets the next state.
-// At least tablelog bits must be available in the bit reader.
-func (s *fseState) next(br *bitReader) {
- lowBits := uint16(br.getBits(s.state.nbBits()))
- s.state = s.dt[s.state.newState()+lowBits]
-}
-
-// finished returns true if all bits have been read from the bitstream
-// and the next state would require reading bits from the input.
-func (s *fseState) finished(br *bitReader) bool {
- return br.finished() && s.state.nbBits() > 0
-}
-
-// final returns the current state symbol without decoding the next.
-func (s *fseState) final() (int, uint8) {
- return s.state.baselineInt(), s.state.addBits()
-}
-
// final returns the current state symbol without decoding the next.
func (s decSymbol) final() (int, uint8) {
return s.baselineInt(), s.addBits()
}
-
-// nextFast returns the next symbol and sets the next state.
-// This can only be used if no symbols are 0 bits.
-// At least tablelog bits must be available in the bit reader.
-func (s *fseState) nextFast(br *bitReader) (uint32, uint8) {
- lowBits := br.get16BitsFast(s.state.nbBits())
- s.state = s.dt[s.state.newState()+lowBits]
- return s.state.baseline(), s.state.addBits()
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
new file mode 100644
index 0000000000..c881d28d88
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
@@ -0,0 +1,64 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+package zstd
+
+import (
+ "fmt"
+)
+
+type buildDtableAsmContext struct {
+ // inputs
+ stateTable *uint16
+ norm *int16
+ dt *uint64
+
+ // outputs --- set by the procedure in the case of error;
+ // for interpretation please see the error handling part below
+ errParam1 uint64
+ errParam2 uint64
+}
+
+// buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable.
+// Function returns non-zero exit code on error.
+// go:noescape
+func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
+
+// please keep in sync with _generate/gen_fse.go
+const (
+ errorCorruptedNormalizedCounter = 1
+ errorNewStateTooBig = 2
+ errorNewStateNoBits = 3
+)
+
+// buildDtable will build the decoding table.
+func (s *fseDecoder) buildDtable() error {
+ ctx := buildDtableAsmContext{
+ stateTable: &s.stateTable[0],
+ norm: &s.norm[0],
+ dt: (*uint64)(&s.dt[0]),
+ }
+ code := buildDtable_asm(s, &ctx)
+
+ if code != 0 {
+ switch code {
+ case errorCorruptedNormalizedCounter:
+ position := ctx.errParam1
+ return fmt.Errorf("corrupted input (position=%d, expected 0)", position)
+
+ case errorNewStateTooBig:
+ newState := decSymbol(ctx.errParam1)
+ size := ctx.errParam2
+ return fmt.Errorf("newState (%d) outside table size (%d)", newState, size)
+
+ case errorNewStateNoBits:
+ newState := decSymbol(ctx.errParam1)
+ oldState := decSymbol(ctx.errParam2)
+ return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, oldState)
+
+ default:
+ return fmt.Errorf("buildDtable_asm returned unhandled nonzero code = %d", code)
+ }
+ }
+ return nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s
new file mode 100644
index 0000000000..da32b4420e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s
@@ -0,0 +1,127 @@
+// Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT.
+
+//go:build !appengine && !noasm && gc && !noasm
+// +build !appengine,!noasm,gc,!noasm
+
+// func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
+TEXT ·buildDtable_asm(SB), $0-24
+ MOVQ ctx+8(FP), CX
+ MOVQ s+0(FP), DI
+
+ // Load values
+ MOVBQZX 4098(DI), DX
+ XORQ AX, AX
+ BTSQ DX, AX
+ MOVQ (CX), BX
+ MOVQ 16(CX), SI
+ LEAQ -1(AX), R8
+ MOVQ 8(CX), CX
+ MOVWQZX 4096(DI), DI
+
+ // End load values
+ // Init, lay down lowprob symbols
+ XORQ R9, R9
+ JMP init_main_loop_condition
+
+init_main_loop:
+ MOVWQSX (CX)(R9*2), R10
+ CMPW R10, $-1
+ JNE do_not_update_high_threshold
+ MOVB R9, 1(SI)(R8*8)
+ DECQ R8
+ MOVQ $0x0000000000000001, R10
+
+do_not_update_high_threshold:
+ MOVW R10, (BX)(R9*2)
+ INCQ R9
+
+init_main_loop_condition:
+ CMPQ R9, DI
+ JL init_main_loop
+
+ // Spread symbols
+ // Calculate table step
+ MOVQ AX, R9
+ SHRQ $0x01, R9
+ MOVQ AX, R10
+ SHRQ $0x03, R10
+ LEAQ 3(R9)(R10*1), R9
+
+ // Fill add bits values
+ LEAQ -1(AX), R10
+ XORQ R11, R11
+ XORQ R12, R12
+ JMP spread_main_loop_condition
+
+spread_main_loop:
+ XORQ R13, R13
+ MOVWQSX (CX)(R12*2), R14
+ JMP spread_inner_loop_condition
+
+spread_inner_loop:
+ MOVB R12, 1(SI)(R11*8)
+
+adjust_position:
+ ADDQ R9, R11
+ ANDQ R10, R11
+ CMPQ R11, R8
+ JG adjust_position
+ INCQ R13
+
+spread_inner_loop_condition:
+ CMPQ R13, R14
+ JL spread_inner_loop
+ INCQ R12
+
+spread_main_loop_condition:
+ CMPQ R12, DI
+ JL spread_main_loop
+ TESTQ R11, R11
+ JZ spread_check_ok
+ MOVQ ctx+8(FP), AX
+ MOVQ R11, 24(AX)
+ MOVQ $+1, ret+16(FP)
+ RET
+
+spread_check_ok:
+ // Build Decoding table
+ XORQ DI, DI
+
+build_table_main_table:
+ MOVBQZX 1(SI)(DI*8), CX
+ MOVWQZX (BX)(CX*2), R8
+ LEAQ 1(R8), R9
+ MOVW R9, (BX)(CX*2)
+ MOVQ R8, R9
+ BSRQ R9, R9
+ MOVQ DX, CX
+ SUBQ R9, CX
+ SHLQ CL, R8
+ SUBQ AX, R8
+ MOVB CL, (SI)(DI*8)
+ MOVW R8, 2(SI)(DI*8)
+ CMPQ R8, AX
+ JLE build_table_check1_ok
+ MOVQ ctx+8(FP), CX
+ MOVQ R8, 24(CX)
+ MOVQ AX, 32(CX)
+ MOVQ $+2, ret+16(FP)
+ RET
+
+build_table_check1_ok:
+ TESTB CL, CL
+ JNZ build_table_check2_ok
+ CMPW R8, DI
+ JNE build_table_check2_ok
+ MOVQ ctx+8(FP), AX
+ MOVQ R8, 24(AX)
+ MOVQ DI, 32(AX)
+ MOVQ $+3, ret+16(FP)
+ RET
+
+build_table_check2_ok:
+ INCQ DI
+ CMPQ DI, AX
+ JL build_table_main_table
+ MOVQ $+0, ret+16(FP)
+ RET
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go
new file mode 100644
index 0000000000..332e51fe44
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go
@@ -0,0 +1,72 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+package zstd
+
+import (
+ "errors"
+ "fmt"
+)
+
+// buildDtable will build the decoding table.
+func (s *fseDecoder) buildDtable() error {
+ tableSize := uint32(1 << s.actualTableLog)
+ highThreshold := tableSize - 1
+ symbolNext := s.stateTable[:256]
+
+ // Init, lay down lowprob symbols
+ {
+ for i, v := range s.norm[:s.symbolLen] {
+ if v == -1 {
+ s.dt[highThreshold].setAddBits(uint8(i))
+ highThreshold--
+ symbolNext[i] = 1
+ } else {
+ symbolNext[i] = uint16(v)
+ }
+ }
+ }
+
+ // Spread symbols
+ {
+ tableMask := tableSize - 1
+ step := tableStep(tableSize)
+ position := uint32(0)
+ for ss, v := range s.norm[:s.symbolLen] {
+ for i := 0; i < int(v); i++ {
+ s.dt[position].setAddBits(uint8(ss))
+ position = (position + step) & tableMask
+ for position > highThreshold {
+ // lowprob area
+ position = (position + step) & tableMask
+ }
+ }
+ }
+ if position != 0 {
+ // position must reach all cells once, otherwise normalizedCounter is incorrect
+ return errors.New("corrupted input (position != 0)")
+ }
+ }
+
+ // Build Decoding table
+ {
+ tableSize := uint16(1 << s.actualTableLog)
+ for u, v := range s.dt[:tableSize] {
+ symbol := v.addBits()
+ nextState := symbolNext[symbol]
+ symbolNext[symbol] = nextState + 1
+ nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
+ s.dt[u&maxTableMask].setNBits(nBits)
+ newState := (nextState << nBits) - tableSize
+ if newState > tableSize {
+ return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
+ }
+ if newState == uint16(u) && nBits == 0 {
+ // Seems weird that this is possible with nbits > 0.
+ return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
+ }
+ s.dt[u&maxTableMask].setNewState(newState)
+ }
+ }
+ return nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
index 5442061b18..ab26326a8f 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
@@ -76,21 +76,6 @@ func (s *fseEncoder) HistogramFinished(maxSymbol uint8, maxCount int) {
s.clearCount = maxCount != 0
}
-// prepare will prepare and allocate scratch tables used for both compression and decompression.
-func (s *fseEncoder) prepare() (*fseEncoder, error) {
- if s == nil {
- s = &fseEncoder{}
- }
- s.useRLE = false
- if s.clearCount && s.maxCount == 0 {
- for i := range s.count {
- s.count[i] = 0
- }
- s.clearCount = false
- }
- return s, nil
-}
-
// allocCtable will allocate tables needed for compression.
// If existing tables a re big enough, they are simply re-used.
func (s *fseEncoder) allocCtable() {
@@ -709,14 +694,6 @@ func (c *cState) init(bw *bitWriter, ct *cTable, first symbolTransform) {
c.state = c.stateTable[lu]
}
-// encode the output symbol provided and write it to the bitstream.
-func (c *cState) encode(symbolTT symbolTransform) {
- nbBitsOut := (uint32(c.state) + symbolTT.deltaNbBits) >> 16
- dstState := int32(c.state>>(nbBitsOut&15)) + int32(symbolTT.deltaFindState)
- c.bw.addBits16NC(c.state, uint8(nbBitsOut))
- c.state = c.stateTable[dstState]
-}
-
// flush will write the tablelog to the output and flush the remaining full bytes.
func (c *cState) flush(tableLog uint8) {
c.bw.flush32()
diff --git a/vendor/github.com/klauspost/compress/zstd/fuzz.go b/vendor/github.com/klauspost/compress/zstd/fuzz.go
deleted file mode 100644
index 7f2210e053..0000000000
--- a/vendor/github.com/klauspost/compress/zstd/fuzz.go
+++ /dev/null
@@ -1,11 +0,0 @@
-//go:build ignorecrc
-// +build ignorecrc
-
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-// ignoreCRC can be used for fuzz testing to ignore CRC values...
-const ignoreCRC = true
diff --git a/vendor/github.com/klauspost/compress/zstd/fuzz_none.go b/vendor/github.com/klauspost/compress/zstd/fuzz_none.go
deleted file mode 100644
index 6811c68a89..0000000000
--- a/vendor/github.com/klauspost/compress/zstd/fuzz_none.go
+++ /dev/null
@@ -1,11 +0,0 @@
-//go:build !ignorecrc
-// +build !ignorecrc
-
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-// ignoreCRC can be used for fuzz testing to ignore CRC values...
-const ignoreCRC = false
diff --git a/vendor/github.com/klauspost/compress/zstd/hash.go b/vendor/github.com/klauspost/compress/zstd/hash.go
index cf33f29a1b..5d73c21ebd 100644
--- a/vendor/github.com/klauspost/compress/zstd/hash.go
+++ b/vendor/github.com/klauspost/compress/zstd/hash.go
@@ -33,9 +33,3 @@ func hashLen(u uint64, length, mls uint8) uint32 {
return (uint32(u) * prime4bytes) >> (32 - length)
}
}
-
-// hash3 returns the hash of the lower 3 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <32.
-func hash3(u uint32, h uint8) uint32 {
- return ((u << (32 - 24)) * prime3bytes) >> ((32 - h) & 31)
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go
index 819f1461b7..df04472030 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go
@@ -73,6 +73,7 @@ type sequenceDecs struct {
seqSize int
windowSize int
maxBits uint8
+ maxSyncLen uint64
}
// initialize all 3 decoders from the stream input.
@@ -98,153 +99,13 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) erro
return nil
}
-// decode sequences from the stream with the provided history.
-func (s *sequenceDecs) decode(seqs []seqVals) error {
- br := s.br
-
- // Grab full sizes tables, to avoid bounds checks.
- llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
- llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
- s.seqSize = 0
- litRemain := len(s.literals)
- maxBlockSize := maxCompressedBlockSize
- if s.windowSize < maxBlockSize {
- maxBlockSize = s.windowSize
- }
- for i := range seqs {
- var ll, mo, ml int
- if br.off > 4+((maxOffsetBits+16+16)>>3) {
- // inlined function:
- // ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
-
- // Final will not read from stream.
- var llB, mlB, moB uint8
- ll, llB = llState.final()
- ml, mlB = mlState.final()
- mo, moB = ofState.final()
-
- // extra bits are stored in reverse order.
- br.fillFast()
- mo += br.getBits(moB)
- if s.maxBits > 32 {
- br.fillFast()
- }
- ml += br.getBits(mlB)
- ll += br.getBits(llB)
-
- if moB > 1 {
- s.prevOffset[2] = s.prevOffset[1]
- s.prevOffset[1] = s.prevOffset[0]
- s.prevOffset[0] = mo
- } else {
- // mo = s.adjustOffset(mo, ll, moB)
- // Inlined for rather big speedup
- if ll == 0 {
- // There is an exception though, when current sequence's literals_length = 0.
- // In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
- // an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
- mo++
- }
-
- if mo == 0 {
- mo = s.prevOffset[0]
- } else {
- var temp int
- if mo == 3 {
- temp = s.prevOffset[0] - 1
- } else {
- temp = s.prevOffset[mo]
- }
-
- if temp == 0 {
- // 0 is not valid; input is corrupted; force offset to 1
- println("WARNING: temp was 0")
- temp = 1
- }
-
- if mo != 1 {
- s.prevOffset[2] = s.prevOffset[1]
- }
- s.prevOffset[1] = s.prevOffset[0]
- s.prevOffset[0] = temp
- mo = temp
- }
- }
- br.fillFast()
- } else {
- if br.overread() {
- if debugDecoder {
- printf("reading sequence %d, exceeded available data\n", i)
- }
- return io.ErrUnexpectedEOF
- }
- ll, mo, ml = s.next(br, llState, mlState, ofState)
- br.fill()
- }
-
- if debugSequences {
- println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
- }
- // Evaluate.
- // We might be doing this async, so do it early.
- if mo == 0 && ml > 0 {
- return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
- }
- if ml > maxMatchLen {
- return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
- }
- s.seqSize += ll + ml
- if s.seqSize > maxBlockSize {
- return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
- }
- litRemain -= ll
- if litRemain < 0 {
- return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll)
- }
- seqs[i] = seqVals{
- ll: ll,
- ml: ml,
- mo: mo,
- }
- if i == len(seqs)-1 {
- // This is the last sequence, so we shouldn't update state.
- break
- }
-
- // Manually inlined, ~ 5-20% faster
- // Update all 3 states at once. Approx 20% faster.
- nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
- if nBits == 0 {
- llState = llTable[llState.newState()&maxTableMask]
- mlState = mlTable[mlState.newState()&maxTableMask]
- ofState = ofTable[ofState.newState()&maxTableMask]
- } else {
- bits := br.get32BitsFast(nBits)
- lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
- llState = llTable[(llState.newState()+lowBits)&maxTableMask]
-
- lowBits = uint16(bits >> (ofState.nbBits() & 31))
- lowBits &= bitMask[mlState.nbBits()&15]
- mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
-
- lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
- ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
- }
- }
- s.seqSize += litRemain
- if s.seqSize > maxBlockSize {
- return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
- }
- err := br.close()
- if err != nil {
- printf("Closing sequences: %v, %+v\n", err, *br)
- }
- return err
-}
-
// execute will execute the decoded sequence with the provided history.
// The sequence must be evaluated before being sent.
func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
+ if len(s.dict) == 0 {
+ return s.executeSimple(seqs, hist)
+ }
+
// Ensure we have enough output size...
if len(s.out)+s.seqSize > cap(s.out) {
addBytes := s.seqSize + len(s.out)
@@ -327,6 +188,7 @@ func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
}
}
}
+
// Add final literals
copy(out[t:], s.literals)
if debugDecoder {
@@ -341,14 +203,18 @@ func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
}
// decode sequences from the stream with the provided history.
-func (s *sequenceDecs) decodeSync(history *history) error {
+func (s *sequenceDecs) decodeSync(hist []byte) error {
+ supported, err := s.decodeSyncSimple(hist)
+ if supported {
+ return err
+ }
+
br := s.br
seqs := s.nSeqs
startSize := len(s.out)
// Grab full sizes tables, to avoid bounds checks.
llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
- hist := history.b[history.ignoreBuffer:]
out := s.out
maxBlockSize := maxCompressedBlockSize
if s.windowSize < maxBlockSize {
@@ -433,7 +299,7 @@ func (s *sequenceDecs) decodeSync(history *history) error {
}
size := ll + ml + len(out)
if size-startSize > maxBlockSize {
- return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
+ return fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
}
if size > cap(out) {
// Not enough size, which can happen under high volume block streaming conditions
@@ -463,13 +329,13 @@ func (s *sequenceDecs) decodeSync(history *history) error {
if mo > len(out)+len(hist) || mo > s.windowSize {
if len(s.dict) == 0 {
- return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist))
+ return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
}
// we may be in dictionary.
dictO := len(s.dict) - (mo - (len(out) + len(hist)))
if dictO < 0 || dictO >= len(s.dict) {
- return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist))
+ return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
}
end := dictO + ml
if end > len(s.dict) {
@@ -530,6 +396,7 @@ func (s *sequenceDecs) decodeSync(history *history) error {
ofState = ofTable[ofState.newState()&maxTableMask]
} else {
bits := br.get32BitsFast(nBits)
+
lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
llState = llTable[(llState.newState()+lowBits)&maxTableMask]
@@ -543,8 +410,8 @@ func (s *sequenceDecs) decodeSync(history *history) error {
}
// Check if space for literals
- if len(s.literals)+len(s.out)-startSize > maxBlockSize {
- return fmt.Errorf("output (%d) bigger than max block size (%d)", len(s.out), maxBlockSize)
+ if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize {
+ return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
}
// Add final literals
@@ -552,16 +419,6 @@ func (s *sequenceDecs) decodeSync(history *history) error {
return br.close()
}
-// update states, at least 27 bits must be available.
-func (s *sequenceDecs) update(br *bitReader) {
- // Max 8 bits
- s.litLengths.state.next(br)
- // Max 9 bits
- s.matchLengths.state.next(br)
- // Max 8 bits
- s.offsets.state.next(br)
-}
-
var bitMask [16]uint16
func init() {
@@ -570,87 +427,6 @@ func init() {
}
}
-// update states, at least 27 bits must be available.
-func (s *sequenceDecs) updateAlt(br *bitReader) {
- // Update all 3 states at once. Approx 20% faster.
- a, b, c := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
-
- nBits := a.nbBits() + b.nbBits() + c.nbBits()
- if nBits == 0 {
- s.litLengths.state.state = s.litLengths.state.dt[a.newState()]
- s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()]
- s.offsets.state.state = s.offsets.state.dt[c.newState()]
- return
- }
- bits := br.get32BitsFast(nBits)
- lowBits := uint16(bits >> ((c.nbBits() + b.nbBits()) & 31))
- s.litLengths.state.state = s.litLengths.state.dt[a.newState()+lowBits]
-
- lowBits = uint16(bits >> (c.nbBits() & 31))
- lowBits &= bitMask[b.nbBits()&15]
- s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()+lowBits]
-
- lowBits = uint16(bits) & bitMask[c.nbBits()&15]
- s.offsets.state.state = s.offsets.state.dt[c.newState()+lowBits]
-}
-
-// nextFast will return new states when there are at least 4 unused bytes left on the stream when done.
-func (s *sequenceDecs) nextFast(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) {
- // Final will not read from stream.
- ll, llB := llState.final()
- ml, mlB := mlState.final()
- mo, moB := ofState.final()
-
- // extra bits are stored in reverse order.
- br.fillFast()
- mo += br.getBits(moB)
- if s.maxBits > 32 {
- br.fillFast()
- }
- ml += br.getBits(mlB)
- ll += br.getBits(llB)
-
- if moB > 1 {
- s.prevOffset[2] = s.prevOffset[1]
- s.prevOffset[1] = s.prevOffset[0]
- s.prevOffset[0] = mo
- return
- }
- // mo = s.adjustOffset(mo, ll, moB)
- // Inlined for rather big speedup
- if ll == 0 {
- // There is an exception though, when current sequence's literals_length = 0.
- // In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
- // an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
- mo++
- }
-
- if mo == 0 {
- mo = s.prevOffset[0]
- return
- }
- var temp int
- if mo == 3 {
- temp = s.prevOffset[0] - 1
- } else {
- temp = s.prevOffset[mo]
- }
-
- if temp == 0 {
- // 0 is not valid; input is corrupted; force offset to 1
- println("temp was 0")
- temp = 1
- }
-
- if mo != 1 {
- s.prevOffset[2] = s.prevOffset[1]
- }
- s.prevOffset[1] = s.prevOffset[0]
- s.prevOffset[0] = temp
- mo = temp
- return
-}
-
func (s *sequenceDecs) next(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) {
// Final will not read from stream.
ll, llB := llState.final()
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
new file mode 100644
index 0000000000..7598c1018b
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
@@ -0,0 +1,368 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+package zstd
+
+import (
+ "fmt"
+
+ "github.com/klauspost/compress/internal/cpuinfo"
+)
+
+type decodeSyncAsmContext struct {
+ llTable []decSymbol
+ mlTable []decSymbol
+ ofTable []decSymbol
+ llState uint64
+ mlState uint64
+ ofState uint64
+ iteration int
+ litRemain int
+ out []byte
+ outPosition int
+ literals []byte
+ litPosition int
+ history []byte
+ windowSize int
+ ll int // set on error (not for all errors, please refer to _generate/gen.go)
+ ml int // set on error (not for all errors, please refer to _generate/gen.go)
+ mo int // set on error (not for all errors, please refer to _generate/gen.go)
+}
+
+// sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_safe_amd64 does the same as above, but does not write more than output buffer.
+//go:noescape
+func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_safe_bmi2 does the same as above, but does not write more than output buffer.
+//go:noescape
+func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// decode sequences from the stream with the provided history but without a dictionary.
+func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
+ if len(s.dict) > 0 {
+ return false, nil
+ }
+ if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize {
+ return false, nil
+ }
+
+ // FIXME: Using unsafe memory copies leads to rare, random crashes
+ // with fuzz testing. It is therefore disabled for now.
+ const useSafe = true
+ /*
+ useSafe := false
+ if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
+ useSafe = true
+ }
+ if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
+ useSafe = true
+ }
+ if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
+ useSafe = true
+ }
+ */
+
+ br := s.br
+
+ maxBlockSize := maxCompressedBlockSize
+ if s.windowSize < maxBlockSize {
+ maxBlockSize = s.windowSize
+ }
+
+ ctx := decodeSyncAsmContext{
+ llTable: s.litLengths.fse.dt[:maxTablesize],
+ mlTable: s.matchLengths.fse.dt[:maxTablesize],
+ ofTable: s.offsets.fse.dt[:maxTablesize],
+ llState: uint64(s.litLengths.state.state),
+ mlState: uint64(s.matchLengths.state.state),
+ ofState: uint64(s.offsets.state.state),
+ iteration: s.nSeqs - 1,
+ litRemain: len(s.literals),
+ out: s.out,
+ outPosition: len(s.out),
+ literals: s.literals,
+ windowSize: s.windowSize,
+ history: hist,
+ }
+
+ s.seqSize = 0
+ startSize := len(s.out)
+
+ var errCode int
+ if cpuinfo.HasBMI2() {
+ if useSafe {
+ errCode = sequenceDecs_decodeSync_safe_bmi2(s, br, &ctx)
+ } else {
+ errCode = sequenceDecs_decodeSync_bmi2(s, br, &ctx)
+ }
+ } else {
+ if useSafe {
+ errCode = sequenceDecs_decodeSync_safe_amd64(s, br, &ctx)
+ } else {
+ errCode = sequenceDecs_decodeSync_amd64(s, br, &ctx)
+ }
+ }
+ switch errCode {
+ case noError:
+ break
+
+ case errorMatchLenOfsMismatch:
+ return true, fmt.Errorf("zero matchoff and matchlen (%d) > 0", ctx.ml)
+
+ case errorMatchLenTooBig:
+ return true, fmt.Errorf("match len (%d) bigger than max allowed length", ctx.ml)
+
+ case errorMatchOffTooBig:
+ return true, fmt.Errorf("match offset (%d) bigger than current history (%d)",
+ ctx.mo, ctx.outPosition+len(hist)-startSize)
+
+ case errorNotEnoughLiterals:
+ return true, fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available",
+ ctx.ll, ctx.litRemain+ctx.ll)
+
+ case errorNotEnoughSpace:
+ size := ctx.outPosition + ctx.ll + ctx.ml
+ if debugDecoder {
+ println("msl:", s.maxSyncLen, "cap", cap(s.out), "bef:", startSize, "sz:", size-startSize, "mbs:", maxBlockSize, "outsz:", cap(s.out)-startSize)
+ }
+ return true, fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
+
+ default:
+ return true, fmt.Errorf("sequenceDecs_decode returned erronous code %d", errCode)
+ }
+
+ s.seqSize += ctx.litRemain
+ if s.seqSize > maxBlockSize {
+ return true, fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+ }
+ err := br.close()
+ if err != nil {
+ printf("Closing sequences: %v, %+v\n", err, *br)
+ return true, err
+ }
+
+ s.literals = s.literals[ctx.litPosition:]
+ t := ctx.outPosition
+ s.out = s.out[:t]
+
+ // Add final literals
+ s.out = append(s.out, s.literals...)
+ if debugDecoder {
+ t += len(s.literals)
+ if t != len(s.out) {
+ panic(fmt.Errorf("length mismatch, want %d, got %d", len(s.out), t))
+ }
+ }
+
+ return true, nil
+}
+
+// --------------------------------------------------------------------------------
+
+type decodeAsmContext struct {
+ llTable []decSymbol
+ mlTable []decSymbol
+ ofTable []decSymbol
+ llState uint64
+ mlState uint64
+ ofState uint64
+ iteration int
+ seqs []seqVals
+ litRemain int
+}
+
+const noError = 0
+
+// error reported when mo == 0 && ml > 0
+const errorMatchLenOfsMismatch = 1
+
+// error reported when ml > maxMatchLen
+const errorMatchLenTooBig = 2
+
+// error reported when mo > available history or mo > s.windowSize
+const errorMatchOffTooBig = 3
+
+// error reported when the sum of literal lengths exeeceds the literal buffer size
+const errorNotEnoughLiterals = 4
+
+// error reported when capacity of `out` is too small
+const errorNotEnoughSpace = 5
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// decode sequences from the stream without the provided history.
+func (s *sequenceDecs) decode(seqs []seqVals) error {
+ br := s.br
+
+ maxBlockSize := maxCompressedBlockSize
+ if s.windowSize < maxBlockSize {
+ maxBlockSize = s.windowSize
+ }
+
+ ctx := decodeAsmContext{
+ llTable: s.litLengths.fse.dt[:maxTablesize],
+ mlTable: s.matchLengths.fse.dt[:maxTablesize],
+ ofTable: s.offsets.fse.dt[:maxTablesize],
+ llState: uint64(s.litLengths.state.state),
+ mlState: uint64(s.matchLengths.state.state),
+ ofState: uint64(s.offsets.state.state),
+ seqs: seqs,
+ iteration: len(seqs) - 1,
+ litRemain: len(s.literals),
+ }
+
+ s.seqSize = 0
+ lte56bits := s.maxBits+s.offsets.fse.actualTableLog+s.matchLengths.fse.actualTableLog+s.litLengths.fse.actualTableLog <= 56
+ var errCode int
+ if cpuinfo.HasBMI2() {
+ if lte56bits {
+ errCode = sequenceDecs_decode_56_bmi2(s, br, &ctx)
+ } else {
+ errCode = sequenceDecs_decode_bmi2(s, br, &ctx)
+ }
+ } else {
+ if lte56bits {
+ errCode = sequenceDecs_decode_56_amd64(s, br, &ctx)
+ } else {
+ errCode = sequenceDecs_decode_amd64(s, br, &ctx)
+ }
+ }
+ if errCode != 0 {
+ i := len(seqs) - ctx.iteration - 1
+ switch errCode {
+ case errorMatchLenOfsMismatch:
+ ml := ctx.seqs[i].ml
+ return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+
+ case errorMatchLenTooBig:
+ ml := ctx.seqs[i].ml
+ return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
+
+ case errorNotEnoughLiterals:
+ ll := ctx.seqs[i].ll
+ return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, ctx.litRemain+ll)
+ }
+
+ return fmt.Errorf("sequenceDecs_decode_amd64 returned erronous code %d", errCode)
+ }
+
+ if ctx.litRemain < 0 {
+ return fmt.Errorf("literal count is too big: total available %d, total requested %d",
+ len(s.literals), len(s.literals)-ctx.litRemain)
+ }
+
+ s.seqSize += ctx.litRemain
+ if s.seqSize > maxBlockSize {
+ return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+ }
+ err := br.close()
+ if err != nil {
+ printf("Closing sequences: %v, %+v\n", err, *br)
+ }
+ return err
+}
+
+// --------------------------------------------------------------------------------
+
+type executeAsmContext struct {
+ seqs []seqVals
+ seqIndex int
+ out []byte
+ history []byte
+ literals []byte
+ outPosition int
+ litPosition int
+ windowSize int
+}
+
+// sequenceDecs_executeSimple_amd64 implements the main loop of sequenceDecs.executeSimple in x86 asm.
+//
+// Returns false if a match offset is too big.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
+
+// Same as above, but with safe memcopies
+//go:noescape
+func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
+
+// executeSimple handles cases when dictionary is not used.
+func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
+ // Ensure we have enough output size...
+ if len(s.out)+s.seqSize+compressedBlockOverAlloc > cap(s.out) {
+ addBytes := s.seqSize + len(s.out) + compressedBlockOverAlloc
+ s.out = append(s.out, make([]byte, addBytes)...)
+ s.out = s.out[:len(s.out)-addBytes]
+ }
+
+ if debugDecoder {
+ printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
+ }
+
+ var t = len(s.out)
+ out := s.out[:t+s.seqSize]
+
+ ctx := executeAsmContext{
+ seqs: seqs,
+ seqIndex: 0,
+ out: out,
+ history: hist,
+ outPosition: t,
+ litPosition: 0,
+ literals: s.literals,
+ windowSize: s.windowSize,
+ }
+ var ok bool
+ if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
+ ok = sequenceDecs_executeSimple_safe_amd64(&ctx)
+ } else {
+ ok = sequenceDecs_executeSimple_amd64(&ctx)
+ }
+ if !ok {
+ return fmt.Errorf("match offset (%d) bigger than current history (%d)",
+ seqs[ctx.seqIndex].mo, ctx.outPosition+len(hist))
+ }
+ s.literals = s.literals[ctx.litPosition:]
+ t = ctx.outPosition
+
+ // Add final literals
+ copy(out[t:], s.literals)
+ if debugDecoder {
+ t += len(s.literals)
+ if t != len(out) {
+ panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
+ }
+ }
+ s.out = out
+
+ return nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
new file mode 100644
index 0000000000..27e76774ca
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
@@ -0,0 +1,4100 @@
+// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
+
+//go:build !appengine && !noasm && gc && !noasm
+// +build !appengine,!noasm,gc,!noasm
+
+// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: CMOV
+TEXT ·sequenceDecs_decode_amd64(SB), $8-32
+ MOVQ br+8(FP), AX
+ MOVQ 32(AX), DX
+ MOVBQZX 40(AX), BX
+ MOVQ 24(AX), SI
+ MOVQ (AX), AX
+ ADDQ SI, AX
+ MOVQ AX, (SP)
+ MOVQ ctx+16(FP), AX
+ MOVQ 72(AX), DI
+ MOVQ 80(AX), R8
+ MOVQ 88(AX), R9
+ MOVQ 104(AX), R10
+ MOVQ s+0(FP), AX
+ MOVQ 144(AX), R11
+ MOVQ 152(AX), R12
+ MOVQ 160(AX), R13
+
+sequenceDecs_decode_amd64_main_loop:
+ MOVQ (SP), R14
+
+ // Fill bitreader to have enough for the offset and match length.
+ CMPQ SI, $0x08
+ JL sequenceDecs_decode_amd64_fill_byte_by_byte
+ MOVQ BX, AX
+ SHRQ $0x03, AX
+ SUBQ AX, R14
+ MOVQ (R14), DX
+ SUBQ AX, SI
+ ANDQ $0x07, BX
+ JMP sequenceDecs_decode_amd64_fill_end
+
+sequenceDecs_decode_amd64_fill_byte_by_byte:
+ CMPQ SI, $0x00
+ JLE sequenceDecs_decode_amd64_fill_end
+ CMPQ BX, $0x07
+ JLE sequenceDecs_decode_amd64_fill_end
+ SHLQ $0x08, DX
+ SUBQ $0x01, R14
+ SUBQ $0x01, SI
+ SUBQ $0x08, BX
+ MOVBQZX (R14), AX
+ ORQ AX, DX
+ JMP sequenceDecs_decode_amd64_fill_byte_by_byte
+
+sequenceDecs_decode_amd64_fill_end:
+ // Update offset
+ MOVQ R9, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_amd64_of_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_amd64_of_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_amd64_of_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_amd64_of_update_zero:
+ MOVQ AX, 16(R10)
+
+ // Update match length
+ MOVQ R8, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_amd64_ml_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_amd64_ml_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_amd64_ml_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_amd64_ml_update_zero:
+ MOVQ AX, 8(R10)
+
+ // Fill bitreader to have enough for the remaining
+ CMPQ SI, $0x08
+ JL sequenceDecs_decode_amd64_fill_2_byte_by_byte
+ MOVQ BX, AX
+ SHRQ $0x03, AX
+ SUBQ AX, R14
+ MOVQ (R14), DX
+ SUBQ AX, SI
+ ANDQ $0x07, BX
+ JMP sequenceDecs_decode_amd64_fill_2_end
+
+sequenceDecs_decode_amd64_fill_2_byte_by_byte:
+ CMPQ SI, $0x00
+ JLE sequenceDecs_decode_amd64_fill_2_end
+ CMPQ BX, $0x07
+ JLE sequenceDecs_decode_amd64_fill_2_end
+ SHLQ $0x08, DX
+ SUBQ $0x01, R14
+ SUBQ $0x01, SI
+ SUBQ $0x08, BX
+ MOVBQZX (R14), AX
+ ORQ AX, DX
+ JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decode_amd64_fill_2_end:
+ // Update literal length
+ MOVQ DI, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_amd64_ll_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_amd64_ll_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_amd64_ll_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_amd64_ll_update_zero:
+ MOVQ AX, (R10)
+
+ // Fill bitreader for state updates
+ MOVQ R14, (SP)
+ MOVQ R9, AX
+ SHRQ $0x08, AX
+ MOVBQZX AL, AX
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decode_amd64_skip_update
+
+ // Update Literal Length State
+ MOVBQZX DI, R14
+ SHRQ $0x10, DI
+ MOVWQZX DI, DI
+ LEAQ (BX)(R14*1), CX
+ MOVQ DX, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
+ ADDQ R15, DI
+
+ // Load ctx.llTable
+ MOVQ ctx+16(FP), CX
+ MOVQ (CX), CX
+ MOVQ (CX)(DI*8), DI
+
+ // Update Match Length State
+ MOVBQZX R8, R14
+ SHRQ $0x10, R8
+ MOVWQZX R8, R8
+ LEAQ (BX)(R14*1), CX
+ MOVQ DX, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
+ ADDQ R15, R8
+
+ // Load ctx.mlTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 24(CX), CX
+ MOVQ (CX)(R8*8), R8
+
+ // Update Offset State
+ MOVBQZX R9, R14
+ SHRQ $0x10, R9
+ MOVWQZX R9, R9
+ LEAQ (BX)(R14*1), CX
+ MOVQ DX, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
+ ADDQ R15, R9
+
+ // Load ctx.ofTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decode_amd64_skip_update:
+ // Adjust offset
+ MOVQ 16(R10), CX
+ CMPQ AX, $0x01
+ JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
+ MOVQ R12, R13
+ MOVQ R11, R12
+ MOVQ CX, R11
+ JMP sequenceDecs_decode_amd64_after_adjust
+
+sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
+ CMPQ (R10), $0x00000000
+ JNE sequenceDecs_decode_amd64_adjust_offset_maybezero
+ INCQ CX
+ JMP sequenceDecs_decode_amd64_adjust_offset_nonzero
+
+sequenceDecs_decode_amd64_adjust_offset_maybezero:
+ TESTQ CX, CX
+ JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
+ MOVQ R11, CX
+ JMP sequenceDecs_decode_amd64_after_adjust
+
+sequenceDecs_decode_amd64_adjust_offset_nonzero:
+ CMPQ CX, $0x01
+ JB sequenceDecs_decode_amd64_adjust_zero
+ JEQ sequenceDecs_decode_amd64_adjust_one
+ CMPQ CX, $0x02
+ JA sequenceDecs_decode_amd64_adjust_three
+ JMP sequenceDecs_decode_amd64_adjust_two
+
+sequenceDecs_decode_amd64_adjust_zero:
+ MOVQ R11, AX
+ JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_one:
+ MOVQ R12, AX
+ JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_two:
+ MOVQ R13, AX
+ JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_three:
+ LEAQ -1(R11), AX
+
+sequenceDecs_decode_amd64_adjust_test_temp_valid:
+ TESTQ AX, AX
+ JNZ sequenceDecs_decode_amd64_adjust_temp_valid
+ MOVQ $0x00000001, AX
+
+sequenceDecs_decode_amd64_adjust_temp_valid:
+ CMPQ CX, $0x01
+ CMOVQNE R12, R13
+ MOVQ R11, R12
+ MOVQ AX, R11
+ MOVQ AX, CX
+
+sequenceDecs_decode_amd64_after_adjust:
+ MOVQ CX, 16(R10)
+
+ // Check values
+ MOVQ 8(R10), AX
+ MOVQ (R10), R14
+ LEAQ (AX)(R14*1), R15
+ MOVQ s+0(FP), BP
+ ADDQ R15, 256(BP)
+ MOVQ ctx+16(FP), R15
+ SUBQ R14, 128(R15)
+ JS error_not_enough_literals
+ CMPQ AX, $0x00020002
+ JA sequenceDecs_decode_amd64_error_match_len_too_big
+ TESTQ CX, CX
+ JNZ sequenceDecs_decode_amd64_match_len_ofs_ok
+ TESTQ AX, AX
+ JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_amd64_match_len_ofs_ok:
+ ADDQ $0x18, R10
+ MOVQ ctx+16(FP), AX
+ DECQ 96(AX)
+ JNS sequenceDecs_decode_amd64_main_loop
+ MOVQ s+0(FP), AX
+ MOVQ R11, 144(AX)
+ MOVQ R12, 152(AX)
+ MOVQ R13, 160(AX)
+ MOVQ br+8(FP), AX
+ MOVQ DX, 32(AX)
+ MOVB BL, 40(AX)
+ MOVQ SI, 24(AX)
+
+ // Return success
+ MOVQ $0x00000000, ret+24(FP)
+ RET
+
+ // Return with match length error
+sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
+ MOVQ $0x00000001, ret+24(FP)
+ RET
+
+ // Return with match too long error
+sequenceDecs_decode_amd64_error_match_len_too_big:
+ MOVQ $0x00000002, ret+24(FP)
+ RET
+
+ // Return with match offset too long error
+ MOVQ $0x00000003, ret+24(FP)
+ RET
+
+ // Return with not enough literals error
+error_not_enough_literals:
+ MOVQ $0x00000004, ret+24(FP)
+ RET
+
+ // Return with not enough output space error
+ MOVQ $0x00000005, ret+24(FP)
+ RET
+
+// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: CMOV
+TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
+ MOVQ br+8(FP), AX
+ MOVQ 32(AX), DX
+ MOVBQZX 40(AX), BX
+ MOVQ 24(AX), SI
+ MOVQ (AX), AX
+ ADDQ SI, AX
+ MOVQ AX, (SP)
+ MOVQ ctx+16(FP), AX
+ MOVQ 72(AX), DI
+ MOVQ 80(AX), R8
+ MOVQ 88(AX), R9
+ MOVQ 104(AX), R10
+ MOVQ s+0(FP), AX
+ MOVQ 144(AX), R11
+ MOVQ 152(AX), R12
+ MOVQ 160(AX), R13
+
+sequenceDecs_decode_56_amd64_main_loop:
+ MOVQ (SP), R14
+
+ // Fill bitreader to have enough for the offset and match length.
+ CMPQ SI, $0x08
+ JL sequenceDecs_decode_56_amd64_fill_byte_by_byte
+ MOVQ BX, AX
+ SHRQ $0x03, AX
+ SUBQ AX, R14
+ MOVQ (R14), DX
+ SUBQ AX, SI
+ ANDQ $0x07, BX
+ JMP sequenceDecs_decode_56_amd64_fill_end
+
+sequenceDecs_decode_56_amd64_fill_byte_by_byte:
+ CMPQ SI, $0x00
+ JLE sequenceDecs_decode_56_amd64_fill_end
+ CMPQ BX, $0x07
+ JLE sequenceDecs_decode_56_amd64_fill_end
+ SHLQ $0x08, DX
+ SUBQ $0x01, R14
+ SUBQ $0x01, SI
+ SUBQ $0x08, BX
+ MOVBQZX (R14), AX
+ ORQ AX, DX
+ JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte
+
+sequenceDecs_decode_56_amd64_fill_end:
+ // Update offset
+ MOVQ R9, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_56_amd64_of_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_56_amd64_of_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_56_amd64_of_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_56_amd64_of_update_zero:
+ MOVQ AX, 16(R10)
+
+ // Update match length
+ MOVQ R8, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_56_amd64_ml_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_56_amd64_ml_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_56_amd64_ml_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_56_amd64_ml_update_zero:
+ MOVQ AX, 8(R10)
+
+ // Update literal length
+ MOVQ DI, AX
+ MOVQ BX, CX
+ MOVQ DX, R15
+ SHLQ CL, R15
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decode_56_amd64_ll_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decode_56_amd64_ll_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decode_56_amd64_ll_update_zero
+ NEGQ CX
+ SHRQ CL, R15
+ ADDQ R15, AX
+
+sequenceDecs_decode_56_amd64_ll_update_zero:
+ MOVQ AX, (R10)
+
+ // Fill bitreader for state updates
+ MOVQ R14, (SP)
+ MOVQ R9, AX
+ SHRQ $0x08, AX
+ MOVBQZX AL, AX
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decode_56_amd64_skip_update
+
+ // Update Literal Length State
+ MOVBQZX DI, R14
+ SHRQ $0x10, DI
+ MOVWQZX DI, DI
+ LEAQ (BX)(R14*1), CX
+ MOVQ DX, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
+ ADDQ R15, DI
+
+ // Load ctx.llTable
+ MOVQ ctx+16(FP), CX
+ MOVQ (CX), CX
+ MOVQ (CX)(DI*8), DI
+
+ // Update Match Length State
+ MOVBQZX R8, R14
+ SHRQ $0x10, R8
+ MOVWQZX R8, R8
+ LEAQ (BX)(R14*1), CX
+ MOVQ DX, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
+ ADDQ R15, R8
+
+ // Load ctx.mlTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 24(CX), CX
+ MOVQ (CX)(R8*8), R8
+
+ // Update Offset State
+ MOVBQZX R9, R14
+ SHRQ $0x10, R9
+ MOVWQZX R9, R9
+ LEAQ (BX)(R14*1), CX
+ MOVQ DX, R15
+ MOVQ CX, BX
+ ROLQ CL, R15
+ MOVL $0x00000001, BP
+ MOVB R14, CL
+ SHLL CL, BP
+ DECL BP
+ ANDQ BP, R15
+ ADDQ R15, R9
+
+ // Load ctx.ofTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decode_56_amd64_skip_update:
+ // Adjust offset
+ MOVQ 16(R10), CX
+ CMPQ AX, $0x01
+ JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
+ MOVQ R12, R13
+ MOVQ R11, R12
+ MOVQ CX, R11
+ JMP sequenceDecs_decode_56_amd64_after_adjust
+
+sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
+ CMPQ (R10), $0x00000000
+ JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero
+ INCQ CX
+ JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero
+
+sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
+ TESTQ CX, CX
+ JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
+ MOVQ R11, CX
+ JMP sequenceDecs_decode_56_amd64_after_adjust
+
+sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
+ CMPQ CX, $0x01
+ JB sequenceDecs_decode_56_amd64_adjust_zero
+ JEQ sequenceDecs_decode_56_amd64_adjust_one
+ CMPQ CX, $0x02
+ JA sequenceDecs_decode_56_amd64_adjust_three
+ JMP sequenceDecs_decode_56_amd64_adjust_two
+
+sequenceDecs_decode_56_amd64_adjust_zero:
+ MOVQ R11, AX
+ JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_one:
+ MOVQ R12, AX
+ JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_two:
+ MOVQ R13, AX
+ JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_three:
+ LEAQ -1(R11), AX
+
+sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
+ TESTQ AX, AX
+ JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid
+ MOVQ $0x00000001, AX
+
+sequenceDecs_decode_56_amd64_adjust_temp_valid:
+ CMPQ CX, $0x01
+ CMOVQNE R12, R13
+ MOVQ R11, R12
+ MOVQ AX, R11
+ MOVQ AX, CX
+
+sequenceDecs_decode_56_amd64_after_adjust:
+ MOVQ CX, 16(R10)
+
+ // Check values
+ MOVQ 8(R10), AX
+ MOVQ (R10), R14
+ LEAQ (AX)(R14*1), R15
+ MOVQ s+0(FP), BP
+ ADDQ R15, 256(BP)
+ MOVQ ctx+16(FP), R15
+ SUBQ R14, 128(R15)
+ JS error_not_enough_literals
+ CMPQ AX, $0x00020002
+ JA sequenceDecs_decode_56_amd64_error_match_len_too_big
+ TESTQ CX, CX
+ JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok
+ TESTQ AX, AX
+ JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_56_amd64_match_len_ofs_ok:
+ ADDQ $0x18, R10
+ MOVQ ctx+16(FP), AX
+ DECQ 96(AX)
+ JNS sequenceDecs_decode_56_amd64_main_loop
+ MOVQ s+0(FP), AX
+ MOVQ R11, 144(AX)
+ MOVQ R12, 152(AX)
+ MOVQ R13, 160(AX)
+ MOVQ br+8(FP), AX
+ MOVQ DX, 32(AX)
+ MOVB BL, 40(AX)
+ MOVQ SI, 24(AX)
+
+ // Return success
+ MOVQ $0x00000000, ret+24(FP)
+ RET
+
+ // Return with match length error
+sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
+ MOVQ $0x00000001, ret+24(FP)
+ RET
+
+ // Return with match too long error
+sequenceDecs_decode_56_amd64_error_match_len_too_big:
+ MOVQ $0x00000002, ret+24(FP)
+ RET
+
+ // Return with match offset too long error
+ MOVQ $0x00000003, ret+24(FP)
+ RET
+
+ // Return with not enough literals error
+error_not_enough_literals:
+ MOVQ $0x00000004, ret+24(FP)
+ RET
+
+ // Return with not enough output space error
+ MOVQ $0x00000005, ret+24(FP)
+ RET
+
+// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: BMI, BMI2, CMOV
+TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
+ MOVQ br+8(FP), CX
+ MOVQ 32(CX), AX
+ MOVBQZX 40(CX), DX
+ MOVQ 24(CX), BX
+ MOVQ (CX), CX
+ ADDQ BX, CX
+ MOVQ CX, (SP)
+ MOVQ ctx+16(FP), CX
+ MOVQ 72(CX), SI
+ MOVQ 80(CX), DI
+ MOVQ 88(CX), R8
+ MOVQ 104(CX), R9
+ MOVQ s+0(FP), CX
+ MOVQ 144(CX), R10
+ MOVQ 152(CX), R11
+ MOVQ 160(CX), R12
+
+sequenceDecs_decode_bmi2_main_loop:
+ MOVQ (SP), R13
+
+ // Fill bitreader to have enough for the offset and match length.
+ CMPQ BX, $0x08
+ JL sequenceDecs_decode_bmi2_fill_byte_by_byte
+ MOVQ DX, CX
+ SHRQ $0x03, CX
+ SUBQ CX, R13
+ MOVQ (R13), AX
+ SUBQ CX, BX
+ ANDQ $0x07, DX
+ JMP sequenceDecs_decode_bmi2_fill_end
+
+sequenceDecs_decode_bmi2_fill_byte_by_byte:
+ CMPQ BX, $0x00
+ JLE sequenceDecs_decode_bmi2_fill_end
+ CMPQ DX, $0x07
+ JLE sequenceDecs_decode_bmi2_fill_end
+ SHLQ $0x08, AX
+ SUBQ $0x01, R13
+ SUBQ $0x01, BX
+ SUBQ $0x08, DX
+ MOVBQZX (R13), CX
+ ORQ CX, AX
+ JMP sequenceDecs_decode_bmi2_fill_byte_by_byte
+
+sequenceDecs_decode_bmi2_fill_end:
+ // Update offset
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R14
+ MOVQ AX, R15
+ LEAQ (DX)(R14*1), CX
+ ROLQ CL, R15
+ BZHIQ R14, R15, R15
+ MOVQ CX, DX
+ MOVQ R8, CX
+ SHRQ $0x20, CX
+ ADDQ R15, CX
+ MOVQ CX, 16(R9)
+
+ // Update match length
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, DI, R14
+ MOVQ AX, R15
+ LEAQ (DX)(R14*1), CX
+ ROLQ CL, R15
+ BZHIQ R14, R15, R15
+ MOVQ CX, DX
+ MOVQ DI, CX
+ SHRQ $0x20, CX
+ ADDQ R15, CX
+ MOVQ CX, 8(R9)
+
+ // Fill bitreader to have enough for the remaining
+ CMPQ BX, $0x08
+ JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte
+ MOVQ DX, CX
+ SHRQ $0x03, CX
+ SUBQ CX, R13
+ MOVQ (R13), AX
+ SUBQ CX, BX
+ ANDQ $0x07, DX
+ JMP sequenceDecs_decode_bmi2_fill_2_end
+
+sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
+ CMPQ BX, $0x00
+ JLE sequenceDecs_decode_bmi2_fill_2_end
+ CMPQ DX, $0x07
+ JLE sequenceDecs_decode_bmi2_fill_2_end
+ SHLQ $0x08, AX
+ SUBQ $0x01, R13
+ SUBQ $0x01, BX
+ SUBQ $0x08, DX
+ MOVBQZX (R13), CX
+ ORQ CX, AX
+ JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decode_bmi2_fill_2_end:
+ // Update literal length
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, SI, R14
+ MOVQ AX, R15
+ LEAQ (DX)(R14*1), CX
+ ROLQ CL, R15
+ BZHIQ R14, R15, R15
+ MOVQ CX, DX
+ MOVQ SI, CX
+ SHRQ $0x20, CX
+ ADDQ R15, CX
+ MOVQ CX, (R9)
+
+ // Fill bitreader for state updates
+ MOVQ R13, (SP)
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R13
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decode_bmi2_skip_update
+ LEAQ (SI)(DI*1), R14
+ ADDQ R8, R14
+ MOVBQZX R14, R14
+ LEAQ (DX)(R14*1), CX
+ MOVQ AX, R15
+ MOVQ CX, DX
+ ROLQ CL, R15
+ BZHIQ R14, R15, R15
+
+ // Update Offset State
+ BZHIQ R8, R15, CX
+ SHRXQ R8, R15, R15
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, R8, R8
+ ADDQ CX, R8
+
+ // Load ctx.ofTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R8*8), R8
+
+ // Update Match Length State
+ BZHIQ DI, R15, CX
+ SHRXQ DI, R15, R15
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, DI, DI
+ ADDQ CX, DI
+
+ // Load ctx.mlTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 24(CX), CX
+ MOVQ (CX)(DI*8), DI
+
+ // Update Literal Length State
+ BZHIQ SI, R15, CX
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, SI, SI
+ ADDQ CX, SI
+
+ // Load ctx.llTable
+ MOVQ ctx+16(FP), CX
+ MOVQ (CX), CX
+ MOVQ (CX)(SI*8), SI
+
+sequenceDecs_decode_bmi2_skip_update:
+ // Adjust offset
+ MOVQ 16(R9), CX
+ CMPQ R13, $0x01
+ JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
+ MOVQ R11, R12
+ MOVQ R10, R11
+ MOVQ CX, R10
+ JMP sequenceDecs_decode_bmi2_after_adjust
+
+sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
+ CMPQ (R9), $0x00000000
+ JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero
+ INCQ CX
+ JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decode_bmi2_adjust_offset_maybezero:
+ TESTQ CX, CX
+ JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
+ MOVQ R10, CX
+ JMP sequenceDecs_decode_bmi2_after_adjust
+
+sequenceDecs_decode_bmi2_adjust_offset_nonzero:
+ CMPQ CX, $0x01
+ JB sequenceDecs_decode_bmi2_adjust_zero
+ JEQ sequenceDecs_decode_bmi2_adjust_one
+ CMPQ CX, $0x02
+ JA sequenceDecs_decode_bmi2_adjust_three
+ JMP sequenceDecs_decode_bmi2_adjust_two
+
+sequenceDecs_decode_bmi2_adjust_zero:
+ MOVQ R10, R13
+ JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_one:
+ MOVQ R11, R13
+ JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_two:
+ MOVQ R12, R13
+ JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_three:
+ LEAQ -1(R10), R13
+
+sequenceDecs_decode_bmi2_adjust_test_temp_valid:
+ TESTQ R13, R13
+ JNZ sequenceDecs_decode_bmi2_adjust_temp_valid
+ MOVQ $0x00000001, R13
+
+sequenceDecs_decode_bmi2_adjust_temp_valid:
+ CMPQ CX, $0x01
+ CMOVQNE R11, R12
+ MOVQ R10, R11
+ MOVQ R13, R10
+ MOVQ R13, CX
+
+sequenceDecs_decode_bmi2_after_adjust:
+ MOVQ CX, 16(R9)
+
+ // Check values
+ MOVQ 8(R9), R13
+ MOVQ (R9), R14
+ LEAQ (R13)(R14*1), R15
+ MOVQ s+0(FP), BP
+ ADDQ R15, 256(BP)
+ MOVQ ctx+16(FP), R15
+ SUBQ R14, 128(R15)
+ JS error_not_enough_literals
+ CMPQ R13, $0x00020002
+ JA sequenceDecs_decode_bmi2_error_match_len_too_big
+ TESTQ CX, CX
+ JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok
+ TESTQ R13, R13
+ JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_bmi2_match_len_ofs_ok:
+ ADDQ $0x18, R9
+ MOVQ ctx+16(FP), CX
+ DECQ 96(CX)
+ JNS sequenceDecs_decode_bmi2_main_loop
+ MOVQ s+0(FP), CX
+ MOVQ R10, 144(CX)
+ MOVQ R11, 152(CX)
+ MOVQ R12, 160(CX)
+ MOVQ br+8(FP), CX
+ MOVQ AX, 32(CX)
+ MOVB DL, 40(CX)
+ MOVQ BX, 24(CX)
+
+ // Return success
+ MOVQ $0x00000000, ret+24(FP)
+ RET
+
+ // Return with match length error
+sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
+ MOVQ $0x00000001, ret+24(FP)
+ RET
+
+ // Return with match too long error
+sequenceDecs_decode_bmi2_error_match_len_too_big:
+ MOVQ $0x00000002, ret+24(FP)
+ RET
+
+ // Return with match offset too long error
+ MOVQ $0x00000003, ret+24(FP)
+ RET
+
+ // Return with not enough literals error
+error_not_enough_literals:
+ MOVQ $0x00000004, ret+24(FP)
+ RET
+
+ // Return with not enough output space error
+ MOVQ $0x00000005, ret+24(FP)
+ RET
+
+// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: BMI, BMI2, CMOV
+TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
+ MOVQ br+8(FP), CX
+ MOVQ 32(CX), AX
+ MOVBQZX 40(CX), DX
+ MOVQ 24(CX), BX
+ MOVQ (CX), CX
+ ADDQ BX, CX
+ MOVQ CX, (SP)
+ MOVQ ctx+16(FP), CX
+ MOVQ 72(CX), SI
+ MOVQ 80(CX), DI
+ MOVQ 88(CX), R8
+ MOVQ 104(CX), R9
+ MOVQ s+0(FP), CX
+ MOVQ 144(CX), R10
+ MOVQ 152(CX), R11
+ MOVQ 160(CX), R12
+
+sequenceDecs_decode_56_bmi2_main_loop:
+ MOVQ (SP), R13
+
+ // Fill bitreader to have enough for the offset and match length.
+ CMPQ BX, $0x08
+ JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte
+ MOVQ DX, CX
+ SHRQ $0x03, CX
+ SUBQ CX, R13
+ MOVQ (R13), AX
+ SUBQ CX, BX
+ ANDQ $0x07, DX
+ JMP sequenceDecs_decode_56_bmi2_fill_end
+
+sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
+ CMPQ BX, $0x00
+ JLE sequenceDecs_decode_56_bmi2_fill_end
+ CMPQ DX, $0x07
+ JLE sequenceDecs_decode_56_bmi2_fill_end
+ SHLQ $0x08, AX
+ SUBQ $0x01, R13
+ SUBQ $0x01, BX
+ SUBQ $0x08, DX
+ MOVBQZX (R13), CX
+ ORQ CX, AX
+ JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte
+
+sequenceDecs_decode_56_bmi2_fill_end:
+ // Update offset
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R14
+ MOVQ AX, R15
+ LEAQ (DX)(R14*1), CX
+ ROLQ CL, R15
+ BZHIQ R14, R15, R15
+ MOVQ CX, DX
+ MOVQ R8, CX
+ SHRQ $0x20, CX
+ ADDQ R15, CX
+ MOVQ CX, 16(R9)
+
+ // Update match length
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, DI, R14
+ MOVQ AX, R15
+ LEAQ (DX)(R14*1), CX
+ ROLQ CL, R15
+ BZHIQ R14, R15, R15
+ MOVQ CX, DX
+ MOVQ DI, CX
+ SHRQ $0x20, CX
+ ADDQ R15, CX
+ MOVQ CX, 8(R9)
+
+ // Update literal length
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, SI, R14
+ MOVQ AX, R15
+ LEAQ (DX)(R14*1), CX
+ ROLQ CL, R15
+ BZHIQ R14, R15, R15
+ MOVQ CX, DX
+ MOVQ SI, CX
+ SHRQ $0x20, CX
+ ADDQ R15, CX
+ MOVQ CX, (R9)
+
+ // Fill bitreader for state updates
+ MOVQ R13, (SP)
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R13
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decode_56_bmi2_skip_update
+ LEAQ (SI)(DI*1), R14
+ ADDQ R8, R14
+ MOVBQZX R14, R14
+ LEAQ (DX)(R14*1), CX
+ MOVQ AX, R15
+ MOVQ CX, DX
+ ROLQ CL, R15
+ BZHIQ R14, R15, R15
+
+ // Update Offset State
+ BZHIQ R8, R15, CX
+ SHRXQ R8, R15, R15
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, R8, R8
+ ADDQ CX, R8
+
+ // Load ctx.ofTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R8*8), R8
+
+ // Update Match Length State
+ BZHIQ DI, R15, CX
+ SHRXQ DI, R15, R15
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, DI, DI
+ ADDQ CX, DI
+
+ // Load ctx.mlTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 24(CX), CX
+ MOVQ (CX)(DI*8), DI
+
+ // Update Literal Length State
+ BZHIQ SI, R15, CX
+ MOVQ $0x00001010, R14
+ BEXTRQ R14, SI, SI
+ ADDQ CX, SI
+
+ // Load ctx.llTable
+ MOVQ ctx+16(FP), CX
+ MOVQ (CX), CX
+ MOVQ (CX)(SI*8), SI
+
+sequenceDecs_decode_56_bmi2_skip_update:
+ // Adjust offset
+ MOVQ 16(R9), CX
+ CMPQ R13, $0x01
+ JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
+ MOVQ R11, R12
+ MOVQ R10, R11
+ MOVQ CX, R10
+ JMP sequenceDecs_decode_56_bmi2_after_adjust
+
+sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
+ CMPQ (R9), $0x00000000
+ JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
+ INCQ CX
+ JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
+ TESTQ CX, CX
+ JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
+ MOVQ R10, CX
+ JMP sequenceDecs_decode_56_bmi2_after_adjust
+
+sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
+ CMPQ CX, $0x01
+ JB sequenceDecs_decode_56_bmi2_adjust_zero
+ JEQ sequenceDecs_decode_56_bmi2_adjust_one
+ CMPQ CX, $0x02
+ JA sequenceDecs_decode_56_bmi2_adjust_three
+ JMP sequenceDecs_decode_56_bmi2_adjust_two
+
+sequenceDecs_decode_56_bmi2_adjust_zero:
+ MOVQ R10, R13
+ JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_one:
+ MOVQ R11, R13
+ JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_two:
+ MOVQ R12, R13
+ JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_three:
+ LEAQ -1(R10), R13
+
+sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
+ TESTQ R13, R13
+ JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid
+ MOVQ $0x00000001, R13
+
+sequenceDecs_decode_56_bmi2_adjust_temp_valid:
+ CMPQ CX, $0x01
+ CMOVQNE R11, R12
+ MOVQ R10, R11
+ MOVQ R13, R10
+ MOVQ R13, CX
+
+sequenceDecs_decode_56_bmi2_after_adjust:
+ MOVQ CX, 16(R9)
+
+ // Check values
+ MOVQ 8(R9), R13
+ MOVQ (R9), R14
+ LEAQ (R13)(R14*1), R15
+ MOVQ s+0(FP), BP
+ ADDQ R15, 256(BP)
+ MOVQ ctx+16(FP), R15
+ SUBQ R14, 128(R15)
+ JS error_not_enough_literals
+ CMPQ R13, $0x00020002
+ JA sequenceDecs_decode_56_bmi2_error_match_len_too_big
+ TESTQ CX, CX
+ JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok
+ TESTQ R13, R13
+ JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
+ ADDQ $0x18, R9
+ MOVQ ctx+16(FP), CX
+ DECQ 96(CX)
+ JNS sequenceDecs_decode_56_bmi2_main_loop
+ MOVQ s+0(FP), CX
+ MOVQ R10, 144(CX)
+ MOVQ R11, 152(CX)
+ MOVQ R12, 160(CX)
+ MOVQ br+8(FP), CX
+ MOVQ AX, 32(CX)
+ MOVB DL, 40(CX)
+ MOVQ BX, 24(CX)
+
+ // Return success
+ MOVQ $0x00000000, ret+24(FP)
+ RET
+
+ // Return with match length error
+sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
+ MOVQ $0x00000001, ret+24(FP)
+ RET
+
+ // Return with match too long error
+sequenceDecs_decode_56_bmi2_error_match_len_too_big:
+ MOVQ $0x00000002, ret+24(FP)
+ RET
+
+ // Return with match offset too long error
+ MOVQ $0x00000003, ret+24(FP)
+ RET
+
+ // Return with not enough literals error
+error_not_enough_literals:
+ MOVQ $0x00000004, ret+24(FP)
+ RET
+
+ // Return with not enough output space error
+ MOVQ $0x00000005, ret+24(FP)
+ RET
+
+// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
+// Requires: SSE
+TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
+ MOVQ ctx+0(FP), R10
+ MOVQ 8(R10), CX
+ TESTQ CX, CX
+ JZ empty_seqs
+ MOVQ (R10), AX
+ MOVQ 24(R10), DX
+ MOVQ 32(R10), BX
+ MOVQ 80(R10), SI
+ MOVQ 104(R10), DI
+ MOVQ 120(R10), R8
+ MOVQ 56(R10), R9
+ MOVQ 64(R10), R10
+ ADDQ R10, R9
+
+ // seqsBase += 24 * seqIndex
+ LEAQ (DX)(DX*2), R11
+ SHLQ $0x03, R11
+ ADDQ R11, AX
+
+ // outBase += outPosition
+ ADDQ DI, BX
+
+main_loop:
+ MOVQ (AX), R11
+ MOVQ 16(AX), R12
+ MOVQ 8(AX), R13
+
+ // Copy literals
+ TESTQ R11, R11
+ JZ check_offset
+ XORQ R14, R14
+
+copy_1:
+ MOVUPS (SI)(R14*1), X0
+ MOVUPS X0, (BX)(R14*1)
+ ADDQ $0x10, R14
+ CMPQ R14, R11
+ JB copy_1
+ ADDQ R11, SI
+ ADDQ R11, BX
+ ADDQ R11, DI
+
+ // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
+check_offset:
+ LEAQ (DI)(R10*1), R11
+ CMPQ R12, R11
+ JG error_match_off_too_big
+ CMPQ R12, R8
+ JG error_match_off_too_big
+
+ // Copy match from history
+ MOVQ R12, R11
+ SUBQ DI, R11
+ JLS copy_match
+ MOVQ R9, R14
+ SUBQ R11, R14
+ CMPQ R13, R11
+ JG copy_all_from_history
+ MOVQ R13, R11
+ SUBQ $0x10, R11
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R14
+ ADDQ $0x10, BX
+ SUBQ $0x10, R11
+ JAE copy_4_loop
+ LEAQ 16(R14)(R11*1), R14
+ LEAQ 16(BX)(R11*1), BX
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), R11
+ MOVB 2(R14), R12
+ MOVW R11, (BX)
+ MOVB R12, 2(BX)
+ ADDQ R13, R14
+ ADDQ R13, BX
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), R11
+ MOVL -4(R14)(R13*1), R12
+ MOVL R11, (BX)
+ MOVL R12, -4(BX)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, BX
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), R11
+ MOVQ -8(R14)(R13*1), R12
+ MOVQ R11, (BX)
+ MOVQ R12, -8(BX)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, BX
+
+copy_4_end:
+ ADDQ R13, DI
+ ADDQ $0x18, AX
+ INCQ DX
+ CMPQ DX, CX
+ JB main_loop
+ JMP loop_finished
+
+copy_all_from_history:
+ MOVQ R11, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R14
+ ADDQ $0x10, BX
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(BX)(R15*1), BX
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ R11, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ R11, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(R11*1), BP
+ MOVB R15, (BX)
+ MOVB BP, -1(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (BX)
+ MOVB BP, 2(BX)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(R11*1), BP
+ MOVL R15, (BX)
+ MOVL BP, -4(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(R11*1), BP
+ MOVQ R15, (BX)
+ MOVQ BP, -8(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+
+copy_5_end:
+ ADDQ R11, DI
+ SUBQ R11, R13
+
+ // Copy match from the current buffer
+copy_match:
+ MOVQ BX, R11
+ SUBQ R12, R11
+
+ // ml <= mo
+ CMPQ R13, R12
+ JA copy_overlapping_match
+
+ // Copy non-overlapping match
+ ADDQ R13, DI
+ MOVQ BX, R12
+ ADDQ R13, BX
+
+copy_2:
+ MOVUPS (R11), X0
+ MOVUPS X0, (R12)
+ ADDQ $0x10, R11
+ ADDQ $0x10, R12
+ SUBQ $0x10, R13
+ JHI copy_2
+ JMP handle_loop
+
+ // Copy overlapping match
+copy_overlapping_match:
+ ADDQ R13, DI
+
+copy_slow_3:
+ MOVB (R11), R12
+ MOVB R12, (BX)
+ INCQ R11
+ INCQ BX
+ DECQ R13
+ JNZ copy_slow_3
+
+handle_loop:
+ ADDQ $0x18, AX
+ INCQ DX
+ CMPQ DX, CX
+ JB main_loop
+
+loop_finished:
+ // Return value
+ MOVB $0x01, ret+8(FP)
+
+ // Update the context
+ MOVQ ctx+0(FP), AX
+ MOVQ DX, 24(AX)
+ MOVQ DI, 104(AX)
+ MOVQ 80(AX), CX
+ SUBQ CX, SI
+ MOVQ SI, 112(AX)
+ RET
+
+error_match_off_too_big:
+ // Return value
+ MOVB $0x00, ret+8(FP)
+
+ // Update the context
+ MOVQ ctx+0(FP), AX
+ MOVQ DX, 24(AX)
+ MOVQ DI, 104(AX)
+ MOVQ 80(AX), CX
+ SUBQ CX, SI
+ MOVQ SI, 112(AX)
+ RET
+
+empty_seqs:
+ // Return value
+ MOVB $0x01, ret+8(FP)
+ RET
+
+// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
+// Requires: SSE
+TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
+ MOVQ ctx+0(FP), R10
+ MOVQ 8(R10), CX
+ TESTQ CX, CX
+ JZ empty_seqs
+ MOVQ (R10), AX
+ MOVQ 24(R10), DX
+ MOVQ 32(R10), BX
+ MOVQ 80(R10), SI
+ MOVQ 104(R10), DI
+ MOVQ 120(R10), R8
+ MOVQ 56(R10), R9
+ MOVQ 64(R10), R10
+ ADDQ R10, R9
+
+ // seqsBase += 24 * seqIndex
+ LEAQ (DX)(DX*2), R11
+ SHLQ $0x03, R11
+ ADDQ R11, AX
+
+ // outBase += outPosition
+ ADDQ DI, BX
+
+main_loop:
+ MOVQ (AX), R11
+ MOVQ 16(AX), R12
+ MOVQ 8(AX), R13
+
+ // Copy literals
+ TESTQ R11, R11
+ JZ check_offset
+ MOVQ R11, R14
+ SUBQ $0x10, R14
+ JB copy_1_small
+
+copy_1_loop:
+ MOVUPS (SI), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, SI
+ ADDQ $0x10, BX
+ SUBQ $0x10, R14
+ JAE copy_1_loop
+ LEAQ 16(SI)(R14*1), SI
+ LEAQ 16(BX)(R14*1), BX
+ MOVUPS -16(SI), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_1_end
+
+copy_1_small:
+ CMPQ R11, $0x03
+ JE copy_1_move_3
+ JB copy_1_move_1or2
+ CMPQ R11, $0x08
+ JB copy_1_move_4through7
+ JMP copy_1_move_8through16
+
+copy_1_move_1or2:
+ MOVB (SI), R14
+ MOVB -1(SI)(R11*1), R15
+ MOVB R14, (BX)
+ MOVB R15, -1(BX)(R11*1)
+ ADDQ R11, SI
+ ADDQ R11, BX
+ JMP copy_1_end
+
+copy_1_move_3:
+ MOVW (SI), R14
+ MOVB 2(SI), R15
+ MOVW R14, (BX)
+ MOVB R15, 2(BX)
+ ADDQ R11, SI
+ ADDQ R11, BX
+ JMP copy_1_end
+
+copy_1_move_4through7:
+ MOVL (SI), R14
+ MOVL -4(SI)(R11*1), R15
+ MOVL R14, (BX)
+ MOVL R15, -4(BX)(R11*1)
+ ADDQ R11, SI
+ ADDQ R11, BX
+ JMP copy_1_end
+
+copy_1_move_8through16:
+ MOVQ (SI), R14
+ MOVQ -8(SI)(R11*1), R15
+ MOVQ R14, (BX)
+ MOVQ R15, -8(BX)(R11*1)
+ ADDQ R11, SI
+ ADDQ R11, BX
+
+copy_1_end:
+ ADDQ R11, DI
+
+ // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
+check_offset:
+ LEAQ (DI)(R10*1), R11
+ CMPQ R12, R11
+ JG error_match_off_too_big
+ CMPQ R12, R8
+ JG error_match_off_too_big
+
+ // Copy match from history
+ MOVQ R12, R11
+ SUBQ DI, R11
+ JLS copy_match
+ MOVQ R9, R14
+ SUBQ R11, R14
+ CMPQ R13, R11
+ JG copy_all_from_history
+ MOVQ R13, R11
+ SUBQ $0x10, R11
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R14
+ ADDQ $0x10, BX
+ SUBQ $0x10, R11
+ JAE copy_4_loop
+ LEAQ 16(R14)(R11*1), R14
+ LEAQ 16(BX)(R11*1), BX
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), R11
+ MOVB 2(R14), R12
+ MOVW R11, (BX)
+ MOVB R12, 2(BX)
+ ADDQ R13, R14
+ ADDQ R13, BX
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), R11
+ MOVL -4(R14)(R13*1), R12
+ MOVL R11, (BX)
+ MOVL R12, -4(BX)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, BX
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), R11
+ MOVQ -8(R14)(R13*1), R12
+ MOVQ R11, (BX)
+ MOVQ R12, -8(BX)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, BX
+
+copy_4_end:
+ ADDQ R13, DI
+ ADDQ $0x18, AX
+ INCQ DX
+ CMPQ DX, CX
+ JB main_loop
+ JMP loop_finished
+
+copy_all_from_history:
+ MOVQ R11, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R14
+ ADDQ $0x10, BX
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(BX)(R15*1), BX
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ R11, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ R11, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(R11*1), BP
+ MOVB R15, (BX)
+ MOVB BP, -1(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (BX)
+ MOVB BP, 2(BX)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(R11*1), BP
+ MOVL R15, (BX)
+ MOVL BP, -4(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(R11*1), BP
+ MOVQ R15, (BX)
+ MOVQ BP, -8(BX)(R11*1)
+ ADDQ R11, R14
+ ADDQ R11, BX
+
+copy_5_end:
+ ADDQ R11, DI
+ SUBQ R11, R13
+
+ // Copy match from the current buffer
+copy_match:
+ MOVQ BX, R11
+ SUBQ R12, R11
+
+ // ml <= mo
+ CMPQ R13, R12
+ JA copy_overlapping_match
+
+ // Copy non-overlapping match
+ ADDQ R13, DI
+ MOVQ R13, R12
+ SUBQ $0x10, R12
+ JB copy_2_small
+
+copy_2_loop:
+ MOVUPS (R11), X0
+ MOVUPS X0, (BX)
+ ADDQ $0x10, R11
+ ADDQ $0x10, BX
+ SUBQ $0x10, R12
+ JAE copy_2_loop
+ LEAQ 16(R11)(R12*1), R11
+ LEAQ 16(BX)(R12*1), BX
+ MOVUPS -16(R11), X0
+ MOVUPS X0, -16(BX)
+ JMP copy_2_end
+
+copy_2_small:
+ CMPQ R13, $0x03
+ JE copy_2_move_3
+ JB copy_2_move_1or2
+ CMPQ R13, $0x08
+ JB copy_2_move_4through7
+ JMP copy_2_move_8through16
+
+copy_2_move_1or2:
+ MOVB (R11), R12
+ MOVB -1(R11)(R13*1), R14
+ MOVB R12, (BX)
+ MOVB R14, -1(BX)(R13*1)
+ ADDQ R13, R11
+ ADDQ R13, BX
+ JMP copy_2_end
+
+copy_2_move_3:
+ MOVW (R11), R12
+ MOVB 2(R11), R14
+ MOVW R12, (BX)
+ MOVB R14, 2(BX)
+ ADDQ R13, R11
+ ADDQ R13, BX
+ JMP copy_2_end
+
+copy_2_move_4through7:
+ MOVL (R11), R12
+ MOVL -4(R11)(R13*1), R14
+ MOVL R12, (BX)
+ MOVL R14, -4(BX)(R13*1)
+ ADDQ R13, R11
+ ADDQ R13, BX
+ JMP copy_2_end
+
+copy_2_move_8through16:
+ MOVQ (R11), R12
+ MOVQ -8(R11)(R13*1), R14
+ MOVQ R12, (BX)
+ MOVQ R14, -8(BX)(R13*1)
+ ADDQ R13, R11
+ ADDQ R13, BX
+
+copy_2_end:
+ JMP handle_loop
+
+ // Copy overlapping match
+copy_overlapping_match:
+ ADDQ R13, DI
+
+copy_slow_3:
+ MOVB (R11), R12
+ MOVB R12, (BX)
+ INCQ R11
+ INCQ BX
+ DECQ R13
+ JNZ copy_slow_3
+
+handle_loop:
+ ADDQ $0x18, AX
+ INCQ DX
+ CMPQ DX, CX
+ JB main_loop
+
+loop_finished:
+ // Return value
+ MOVB $0x01, ret+8(FP)
+
+ // Update the context
+ MOVQ ctx+0(FP), AX
+ MOVQ DX, 24(AX)
+ MOVQ DI, 104(AX)
+ MOVQ 80(AX), CX
+ SUBQ CX, SI
+ MOVQ SI, 112(AX)
+ RET
+
+error_match_off_too_big:
+ // Return value
+ MOVB $0x00, ret+8(FP)
+
+ // Update the context
+ MOVQ ctx+0(FP), AX
+ MOVQ DX, 24(AX)
+ MOVQ DI, 104(AX)
+ MOVQ 80(AX), CX
+ SUBQ CX, SI
+ MOVQ SI, 112(AX)
+ RET
+
+empty_seqs:
+ // Return value
+ MOVB $0x01, ret+8(FP)
+ RET
+
+// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
+ MOVQ br+8(FP), AX
+ MOVQ 32(AX), DX
+ MOVBQZX 40(AX), BX
+ MOVQ 24(AX), SI
+ MOVQ (AX), AX
+ ADDQ SI, AX
+ MOVQ AX, (SP)
+ MOVQ ctx+16(FP), AX
+ MOVQ 72(AX), DI
+ MOVQ 80(AX), R8
+ MOVQ 88(AX), R9
+ XORQ CX, CX
+ MOVQ CX, 8(SP)
+ MOVQ CX, 16(SP)
+ MOVQ CX, 24(SP)
+ MOVQ 112(AX), R10
+ MOVQ 128(AX), CX
+ MOVQ CX, 32(SP)
+ MOVQ 144(AX), R11
+ MOVQ 136(AX), R12
+ MOVQ 200(AX), CX
+ MOVQ CX, 56(SP)
+ MOVQ 176(AX), CX
+ MOVQ CX, 48(SP)
+ MOVQ 184(AX), AX
+ MOVQ AX, 40(SP)
+ MOVQ 40(SP), AX
+ ADDQ AX, 48(SP)
+
+ // Calculate poiter to s.out[cap(s.out)] (a past-end pointer)
+ ADDQ R10, 32(SP)
+
+ // outBase += outPosition
+ ADDQ R12, R10
+
+sequenceDecs_decodeSync_amd64_main_loop:
+ MOVQ (SP), R13
+
+ // Fill bitreader to have enough for the offset and match length.
+ CMPQ SI, $0x08
+ JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte
+ MOVQ BX, AX
+ SHRQ $0x03, AX
+ SUBQ AX, R13
+ MOVQ (R13), DX
+ SUBQ AX, SI
+ ANDQ $0x07, BX
+ JMP sequenceDecs_decodeSync_amd64_fill_end
+
+sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
+ CMPQ SI, $0x00
+ JLE sequenceDecs_decodeSync_amd64_fill_end
+ CMPQ BX, $0x07
+ JLE sequenceDecs_decodeSync_amd64_fill_end
+ SHLQ $0x08, DX
+ SUBQ $0x01, R13
+ SUBQ $0x01, SI
+ SUBQ $0x08, BX
+ MOVBQZX (R13), AX
+ ORQ AX, DX
+ JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte
+
+sequenceDecs_decodeSync_amd64_fill_end:
+ // Update offset
+ MOVQ R9, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_amd64_of_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_amd64_of_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_amd64_of_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_amd64_of_update_zero:
+ MOVQ AX, 8(SP)
+
+ // Update match length
+ MOVQ R8, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_amd64_ml_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_amd64_ml_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_amd64_ml_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_amd64_ml_update_zero:
+ MOVQ AX, 16(SP)
+
+ // Fill bitreader to have enough for the remaining
+ CMPQ SI, $0x08
+ JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
+ MOVQ BX, AX
+ SHRQ $0x03, AX
+ SUBQ AX, R13
+ MOVQ (R13), DX
+ SUBQ AX, SI
+ ANDQ $0x07, BX
+ JMP sequenceDecs_decodeSync_amd64_fill_2_end
+
+sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
+ CMPQ SI, $0x00
+ JLE sequenceDecs_decodeSync_amd64_fill_2_end
+ CMPQ BX, $0x07
+ JLE sequenceDecs_decodeSync_amd64_fill_2_end
+ SHLQ $0x08, DX
+ SUBQ $0x01, R13
+ SUBQ $0x01, SI
+ SUBQ $0x08, BX
+ MOVBQZX (R13), AX
+ ORQ AX, DX
+ JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_amd64_fill_2_end:
+ // Update literal length
+ MOVQ DI, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_amd64_ll_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_amd64_ll_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_amd64_ll_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_amd64_ll_update_zero:
+ MOVQ AX, 24(SP)
+
+ // Fill bitreader for state updates
+ MOVQ R13, (SP)
+ MOVQ R9, AX
+ SHRQ $0x08, AX
+ MOVBQZX AL, AX
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decodeSync_amd64_skip_update
+
+ // Update Literal Length State
+ MOVBQZX DI, R13
+ SHRQ $0x10, DI
+ MOVWQZX DI, DI
+ LEAQ (BX)(R13*1), CX
+ MOVQ DX, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
+ ADDQ R14, DI
+
+ // Load ctx.llTable
+ MOVQ ctx+16(FP), CX
+ MOVQ (CX), CX
+ MOVQ (CX)(DI*8), DI
+
+ // Update Match Length State
+ MOVBQZX R8, R13
+ SHRQ $0x10, R8
+ MOVWQZX R8, R8
+ LEAQ (BX)(R13*1), CX
+ MOVQ DX, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
+ ADDQ R14, R8
+
+ // Load ctx.mlTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 24(CX), CX
+ MOVQ (CX)(R8*8), R8
+
+ // Update Offset State
+ MOVBQZX R9, R13
+ SHRQ $0x10, R9
+ MOVWQZX R9, R9
+ LEAQ (BX)(R13*1), CX
+ MOVQ DX, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
+ ADDQ R14, R9
+
+ // Load ctx.ofTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decodeSync_amd64_skip_update:
+ // Adjust offset
+ MOVQ s+0(FP), CX
+ MOVQ 8(SP), R13
+ CMPQ AX, $0x01
+ JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
+ MOVUPS 144(CX), X0
+ MOVQ R13, 144(CX)
+ MOVUPS X0, 152(CX)
+ JMP sequenceDecs_decodeSync_amd64_after_adjust
+
+sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
+ CMPQ 24(SP), $0x00000000
+ JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
+ INCQ R13
+ JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
+ TESTQ R13, R13
+ JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
+ MOVQ 144(CX), R13
+ JMP sequenceDecs_decodeSync_amd64_after_adjust
+
+sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
+ MOVQ R13, AX
+ XORQ R14, R14
+ MOVQ $-1, R15
+ CMPQ R13, $0x03
+ CMOVQEQ R14, AX
+ CMOVQEQ R15, R14
+ ADDQ 144(CX)(AX*8), R14
+ JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
+ MOVQ $0x00000001, R14
+
+sequenceDecs_decodeSync_amd64_adjust_temp_valid:
+ CMPQ R13, $0x01
+ JZ sequenceDecs_decodeSync_amd64_adjust_skip
+ MOVQ 152(CX), AX
+ MOVQ AX, 160(CX)
+
+sequenceDecs_decodeSync_amd64_adjust_skip:
+ MOVQ 144(CX), AX
+ MOVQ AX, 152(CX)
+ MOVQ R14, 144(CX)
+ MOVQ R14, R13
+
+sequenceDecs_decodeSync_amd64_after_adjust:
+ MOVQ R13, 8(SP)
+
+ // Check values
+ MOVQ 16(SP), AX
+ MOVQ 24(SP), CX
+ LEAQ (AX)(CX*1), R14
+ MOVQ s+0(FP), R15
+ ADDQ R14, 256(R15)
+ MOVQ ctx+16(FP), R14
+ SUBQ CX, 104(R14)
+ JS error_not_enough_literals
+ CMPQ AX, $0x00020002
+ JA sequenceDecs_decodeSync_amd64_error_match_len_too_big
+ TESTQ R13, R13
+ JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok
+ TESTQ AX, AX
+ JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
+ MOVQ 24(SP), AX
+ MOVQ 8(SP), CX
+ MOVQ 16(SP), R13
+
+ // Check if we have enough space in s.out
+ LEAQ (AX)(R13*1), R14
+ ADDQ R10, R14
+ CMPQ R14, 32(SP)
+ JA error_not_enough_space
+
+ // Copy literals
+ TESTQ AX, AX
+ JZ check_offset
+ XORQ R14, R14
+
+copy_1:
+ MOVUPS (R11)(R14*1), X0
+ MOVUPS X0, (R10)(R14*1)
+ ADDQ $0x10, R14
+ CMPQ R14, AX
+ JB copy_1
+ ADDQ AX, R11
+ ADDQ AX, R10
+ ADDQ AX, R12
+
+ // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
+check_offset:
+ MOVQ R12, AX
+ ADDQ 40(SP), AX
+ CMPQ CX, AX
+ JG error_match_off_too_big
+ CMPQ CX, 56(SP)
+ JG error_match_off_too_big
+
+ // Copy match from history
+ MOVQ CX, AX
+ SUBQ R12, AX
+ JLS copy_match
+ MOVQ 48(SP), R14
+ SUBQ AX, R14
+ CMPQ R13, AX
+ JG copy_all_from_history
+ MOVQ R13, AX
+ SUBQ $0x10, AX
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R10
+ SUBQ $0x10, AX
+ JAE copy_4_loop
+ LEAQ 16(R14)(AX*1), R14
+ LEAQ 16(R10)(AX*1), R10
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), AX
+ MOVB 2(R14), CL
+ MOVW AX, (R10)
+ MOVB CL, 2(R10)
+ ADDQ R13, R14
+ ADDQ R13, R10
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), AX
+ MOVL -4(R14)(R13*1), CX
+ MOVL AX, (R10)
+ MOVL CX, -4(R10)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R10
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), AX
+ MOVQ -8(R14)(R13*1), CX
+ MOVQ AX, (R10)
+ MOVQ CX, -8(R10)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R10
+
+copy_4_end:
+ ADDQ R13, R12
+ JMP handle_loop
+ JMP loop_finished
+
+copy_all_from_history:
+ MOVQ AX, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R10
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(R10)(R15*1), R10
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ AX, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ AX, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(AX*1), BP
+ MOVB R15, (R10)
+ MOVB BP, -1(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (R10)
+ MOVB BP, 2(R10)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(AX*1), BP
+ MOVL R15, (R10)
+ MOVL BP, -4(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(AX*1), BP
+ MOVQ R15, (R10)
+ MOVQ BP, -8(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+
+copy_5_end:
+ ADDQ AX, R12
+ SUBQ AX, R13
+
+ // Copy match from the current buffer
+copy_match:
+ MOVQ R10, AX
+ SUBQ CX, AX
+
+ // ml <= mo
+ CMPQ R13, CX
+ JA copy_overlapping_match
+
+ // Copy non-overlapping match
+ ADDQ R13, R12
+ MOVQ R10, CX
+ ADDQ R13, R10
+
+copy_2:
+ MOVUPS (AX), X0
+ MOVUPS X0, (CX)
+ ADDQ $0x10, AX
+ ADDQ $0x10, CX
+ SUBQ $0x10, R13
+ JHI copy_2
+ JMP handle_loop
+
+ // Copy overlapping match
+copy_overlapping_match:
+ ADDQ R13, R12
+
+copy_slow_3:
+ MOVB (AX), CL
+ MOVB CL, (R10)
+ INCQ AX
+ INCQ R10
+ DECQ R13
+ JNZ copy_slow_3
+
+handle_loop:
+ MOVQ ctx+16(FP), AX
+ DECQ 96(AX)
+ JNS sequenceDecs_decodeSync_amd64_main_loop
+
+loop_finished:
+ MOVQ br+8(FP), AX
+ MOVQ DX, 32(AX)
+ MOVB BL, 40(AX)
+ MOVQ SI, 24(AX)
+
+ // Update the context
+ MOVQ ctx+16(FP), AX
+ MOVQ R12, 136(AX)
+ MOVQ 144(AX), CX
+ SUBQ CX, R11
+ MOVQ R11, 168(AX)
+
+ // Return success
+ MOVQ $0x00000000, ret+24(FP)
+ RET
+
+ // Return with match length error
+sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
+ MOVQ 16(SP), AX
+ MOVQ ctx+16(FP), CX
+ MOVQ AX, 216(CX)
+ MOVQ $0x00000001, ret+24(FP)
+ RET
+
+ // Return with match too long error
+sequenceDecs_decodeSync_amd64_error_match_len_too_big:
+ MOVQ ctx+16(FP), AX
+ MOVQ 16(SP), CX
+ MOVQ CX, 216(AX)
+ MOVQ $0x00000002, ret+24(FP)
+ RET
+
+ // Return with match offset too long error
+error_match_off_too_big:
+ MOVQ ctx+16(FP), AX
+ MOVQ 8(SP), CX
+ MOVQ CX, 224(AX)
+ MOVQ R12, 136(AX)
+ MOVQ $0x00000003, ret+24(FP)
+ RET
+
+ // Return with not enough literals error
+error_not_enough_literals:
+ MOVQ ctx+16(FP), AX
+ MOVQ 24(SP), CX
+ MOVQ CX, 208(AX)
+ MOVQ $0x00000004, ret+24(FP)
+ RET
+
+ // Return with not enough output space error
+error_not_enough_space:
+ MOVQ ctx+16(FP), AX
+ MOVQ 24(SP), CX
+ MOVQ CX, 208(AX)
+ MOVQ 16(SP), CX
+ MOVQ CX, 216(AX)
+ MOVQ R12, 136(AX)
+ MOVQ $0x00000005, ret+24(FP)
+ RET
+
+// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: BMI, BMI2, CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
+ MOVQ br+8(FP), CX
+ MOVQ 32(CX), AX
+ MOVBQZX 40(CX), DX
+ MOVQ 24(CX), BX
+ MOVQ (CX), CX
+ ADDQ BX, CX
+ MOVQ CX, (SP)
+ MOVQ ctx+16(FP), CX
+ MOVQ 72(CX), SI
+ MOVQ 80(CX), DI
+ MOVQ 88(CX), R8
+ XORQ R9, R9
+ MOVQ R9, 8(SP)
+ MOVQ R9, 16(SP)
+ MOVQ R9, 24(SP)
+ MOVQ 112(CX), R9
+ MOVQ 128(CX), R10
+ MOVQ R10, 32(SP)
+ MOVQ 144(CX), R10
+ MOVQ 136(CX), R11
+ MOVQ 200(CX), R12
+ MOVQ R12, 56(SP)
+ MOVQ 176(CX), R12
+ MOVQ R12, 48(SP)
+ MOVQ 184(CX), CX
+ MOVQ CX, 40(SP)
+ MOVQ 40(SP), CX
+ ADDQ CX, 48(SP)
+
+ // Calculate poiter to s.out[cap(s.out)] (a past-end pointer)
+ ADDQ R9, 32(SP)
+
+ // outBase += outPosition
+ ADDQ R11, R9
+
+sequenceDecs_decodeSync_bmi2_main_loop:
+ MOVQ (SP), R12
+
+ // Fill bitreader to have enough for the offset and match length.
+ CMPQ BX, $0x08
+ JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
+ MOVQ DX, CX
+ SHRQ $0x03, CX
+ SUBQ CX, R12
+ MOVQ (R12), AX
+ SUBQ CX, BX
+ ANDQ $0x07, DX
+ JMP sequenceDecs_decodeSync_bmi2_fill_end
+
+sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
+ CMPQ BX, $0x00
+ JLE sequenceDecs_decodeSync_bmi2_fill_end
+ CMPQ DX, $0x07
+ JLE sequenceDecs_decodeSync_bmi2_fill_end
+ SHLQ $0x08, AX
+ SUBQ $0x01, R12
+ SUBQ $0x01, BX
+ SUBQ $0x08, DX
+ MOVBQZX (R12), CX
+ ORQ CX, AX
+ JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
+
+sequenceDecs_decodeSync_bmi2_fill_end:
+ // Update offset
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R13
+ MOVQ AX, R14
+ LEAQ (DX)(R13*1), CX
+ ROLQ CL, R14
+ BZHIQ R13, R14, R14
+ MOVQ CX, DX
+ MOVQ R8, CX
+ SHRQ $0x20, CX
+ ADDQ R14, CX
+ MOVQ CX, 8(SP)
+
+ // Update match length
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, DI, R13
+ MOVQ AX, R14
+ LEAQ (DX)(R13*1), CX
+ ROLQ CL, R14
+ BZHIQ R13, R14, R14
+ MOVQ CX, DX
+ MOVQ DI, CX
+ SHRQ $0x20, CX
+ ADDQ R14, CX
+ MOVQ CX, 16(SP)
+
+ // Fill bitreader to have enough for the remaining
+ CMPQ BX, $0x08
+ JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
+ MOVQ DX, CX
+ SHRQ $0x03, CX
+ SUBQ CX, R12
+ MOVQ (R12), AX
+ SUBQ CX, BX
+ ANDQ $0x07, DX
+ JMP sequenceDecs_decodeSync_bmi2_fill_2_end
+
+sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
+ CMPQ BX, $0x00
+ JLE sequenceDecs_decodeSync_bmi2_fill_2_end
+ CMPQ DX, $0x07
+ JLE sequenceDecs_decodeSync_bmi2_fill_2_end
+ SHLQ $0x08, AX
+ SUBQ $0x01, R12
+ SUBQ $0x01, BX
+ SUBQ $0x08, DX
+ MOVBQZX (R12), CX
+ ORQ CX, AX
+ JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_bmi2_fill_2_end:
+ // Update literal length
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, SI, R13
+ MOVQ AX, R14
+ LEAQ (DX)(R13*1), CX
+ ROLQ CL, R14
+ BZHIQ R13, R14, R14
+ MOVQ CX, DX
+ MOVQ SI, CX
+ SHRQ $0x20, CX
+ ADDQ R14, CX
+ MOVQ CX, 24(SP)
+
+ // Fill bitreader for state updates
+ MOVQ R12, (SP)
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R12
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decodeSync_bmi2_skip_update
+ LEAQ (SI)(DI*1), R13
+ ADDQ R8, R13
+ MOVBQZX R13, R13
+ LEAQ (DX)(R13*1), CX
+ MOVQ AX, R14
+ MOVQ CX, DX
+ ROLQ CL, R14
+ BZHIQ R13, R14, R14
+
+ // Update Offset State
+ BZHIQ R8, R14, CX
+ SHRXQ R8, R14, R14
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, R8, R8
+ ADDQ CX, R8
+
+ // Load ctx.ofTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R8*8), R8
+
+ // Update Match Length State
+ BZHIQ DI, R14, CX
+ SHRXQ DI, R14, R14
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, DI, DI
+ ADDQ CX, DI
+
+ // Load ctx.mlTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 24(CX), CX
+ MOVQ (CX)(DI*8), DI
+
+ // Update Literal Length State
+ BZHIQ SI, R14, CX
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, SI, SI
+ ADDQ CX, SI
+
+ // Load ctx.llTable
+ MOVQ ctx+16(FP), CX
+ MOVQ (CX), CX
+ MOVQ (CX)(SI*8), SI
+
+sequenceDecs_decodeSync_bmi2_skip_update:
+ // Adjust offset
+ MOVQ s+0(FP), CX
+ MOVQ 8(SP), R13
+ CMPQ R12, $0x01
+ JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
+ MOVUPS 144(CX), X0
+ MOVQ R13, 144(CX)
+ MOVUPS X0, 152(CX)
+ JMP sequenceDecs_decodeSync_bmi2_after_adjust
+
+sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
+ CMPQ 24(SP), $0x00000000
+ JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
+ INCQ R13
+ JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
+ TESTQ R13, R13
+ JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
+ MOVQ 144(CX), R13
+ JMP sequenceDecs_decodeSync_bmi2_after_adjust
+
+sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
+ MOVQ R13, R12
+ XORQ R14, R14
+ MOVQ $-1, R15
+ CMPQ R13, $0x03
+ CMOVQEQ R14, R12
+ CMOVQEQ R15, R14
+ ADDQ 144(CX)(R12*8), R14
+ JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
+ MOVQ $0x00000001, R14
+
+sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
+ CMPQ R13, $0x01
+ JZ sequenceDecs_decodeSync_bmi2_adjust_skip
+ MOVQ 152(CX), R12
+ MOVQ R12, 160(CX)
+
+sequenceDecs_decodeSync_bmi2_adjust_skip:
+ MOVQ 144(CX), R12
+ MOVQ R12, 152(CX)
+ MOVQ R14, 144(CX)
+ MOVQ R14, R13
+
+sequenceDecs_decodeSync_bmi2_after_adjust:
+ MOVQ R13, 8(SP)
+
+ // Check values
+ MOVQ 16(SP), CX
+ MOVQ 24(SP), R12
+ LEAQ (CX)(R12*1), R14
+ MOVQ s+0(FP), R15
+ ADDQ R14, 256(R15)
+ MOVQ ctx+16(FP), R14
+ SUBQ R12, 104(R14)
+ JS error_not_enough_literals
+ CMPQ CX, $0x00020002
+ JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big
+ TESTQ R13, R13
+ JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
+ TESTQ CX, CX
+ JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
+ MOVQ 24(SP), CX
+ MOVQ 8(SP), R12
+ MOVQ 16(SP), R13
+
+ // Check if we have enough space in s.out
+ LEAQ (CX)(R13*1), R14
+ ADDQ R9, R14
+ CMPQ R14, 32(SP)
+ JA error_not_enough_space
+
+ // Copy literals
+ TESTQ CX, CX
+ JZ check_offset
+ XORQ R14, R14
+
+copy_1:
+ MOVUPS (R10)(R14*1), X0
+ MOVUPS X0, (R9)(R14*1)
+ ADDQ $0x10, R14
+ CMPQ R14, CX
+ JB copy_1
+ ADDQ CX, R10
+ ADDQ CX, R9
+ ADDQ CX, R11
+
+ // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
+check_offset:
+ MOVQ R11, CX
+ ADDQ 40(SP), CX
+ CMPQ R12, CX
+ JG error_match_off_too_big
+ CMPQ R12, 56(SP)
+ JG error_match_off_too_big
+
+ // Copy match from history
+ MOVQ R12, CX
+ SUBQ R11, CX
+ JLS copy_match
+ MOVQ 48(SP), R14
+ SUBQ CX, R14
+ CMPQ R13, CX
+ JG copy_all_from_history
+ MOVQ R13, CX
+ SUBQ $0x10, CX
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R9
+ SUBQ $0x10, CX
+ JAE copy_4_loop
+ LEAQ 16(R14)(CX*1), R14
+ LEAQ 16(R9)(CX*1), R9
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), CX
+ MOVB 2(R14), R12
+ MOVW CX, (R9)
+ MOVB R12, 2(R9)
+ ADDQ R13, R14
+ ADDQ R13, R9
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), CX
+ MOVL -4(R14)(R13*1), R12
+ MOVL CX, (R9)
+ MOVL R12, -4(R9)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R9
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), CX
+ MOVQ -8(R14)(R13*1), R12
+ MOVQ CX, (R9)
+ MOVQ R12, -8(R9)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R9
+
+copy_4_end:
+ ADDQ R13, R11
+ JMP handle_loop
+ JMP loop_finished
+
+copy_all_from_history:
+ MOVQ CX, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R9
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(R9)(R15*1), R9
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ CX, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ CX, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(CX*1), BP
+ MOVB R15, (R9)
+ MOVB BP, -1(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (R9)
+ MOVB BP, 2(R9)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(CX*1), BP
+ MOVL R15, (R9)
+ MOVL BP, -4(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(CX*1), BP
+ MOVQ R15, (R9)
+ MOVQ BP, -8(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+
+copy_5_end:
+ ADDQ CX, R11
+ SUBQ CX, R13
+
+ // Copy match from the current buffer
+copy_match:
+ MOVQ R9, CX
+ SUBQ R12, CX
+
+ // ml <= mo
+ CMPQ R13, R12
+ JA copy_overlapping_match
+
+ // Copy non-overlapping match
+ ADDQ R13, R11
+ MOVQ R9, R12
+ ADDQ R13, R9
+
+copy_2:
+ MOVUPS (CX), X0
+ MOVUPS X0, (R12)
+ ADDQ $0x10, CX
+ ADDQ $0x10, R12
+ SUBQ $0x10, R13
+ JHI copy_2
+ JMP handle_loop
+
+ // Copy overlapping match
+copy_overlapping_match:
+ ADDQ R13, R11
+
+copy_slow_3:
+ MOVB (CX), R12
+ MOVB R12, (R9)
+ INCQ CX
+ INCQ R9
+ DECQ R13
+ JNZ copy_slow_3
+
+handle_loop:
+ MOVQ ctx+16(FP), CX
+ DECQ 96(CX)
+ JNS sequenceDecs_decodeSync_bmi2_main_loop
+
+loop_finished:
+ MOVQ br+8(FP), CX
+ MOVQ AX, 32(CX)
+ MOVB DL, 40(CX)
+ MOVQ BX, 24(CX)
+
+ // Update the context
+ MOVQ ctx+16(FP), AX
+ MOVQ R11, 136(AX)
+ MOVQ 144(AX), CX
+ SUBQ CX, R10
+ MOVQ R10, 168(AX)
+
+ // Return success
+ MOVQ $0x00000000, ret+24(FP)
+ RET
+
+ // Return with match length error
+sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
+ MOVQ 16(SP), AX
+ MOVQ ctx+16(FP), CX
+ MOVQ AX, 216(CX)
+ MOVQ $0x00000001, ret+24(FP)
+ RET
+
+ // Return with match too long error
+sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
+ MOVQ ctx+16(FP), AX
+ MOVQ 16(SP), CX
+ MOVQ CX, 216(AX)
+ MOVQ $0x00000002, ret+24(FP)
+ RET
+
+ // Return with match offset too long error
+error_match_off_too_big:
+ MOVQ ctx+16(FP), AX
+ MOVQ 8(SP), CX
+ MOVQ CX, 224(AX)
+ MOVQ R11, 136(AX)
+ MOVQ $0x00000003, ret+24(FP)
+ RET
+
+ // Return with not enough literals error
+error_not_enough_literals:
+ MOVQ ctx+16(FP), AX
+ MOVQ 24(SP), CX
+ MOVQ CX, 208(AX)
+ MOVQ $0x00000004, ret+24(FP)
+ RET
+
+ // Return with not enough output space error
+error_not_enough_space:
+ MOVQ ctx+16(FP), AX
+ MOVQ 24(SP), CX
+ MOVQ CX, 208(AX)
+ MOVQ 16(SP), CX
+ MOVQ CX, 216(AX)
+ MOVQ R11, 136(AX)
+ MOVQ $0x00000005, ret+24(FP)
+ RET
+
+// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
+ MOVQ br+8(FP), AX
+ MOVQ 32(AX), DX
+ MOVBQZX 40(AX), BX
+ MOVQ 24(AX), SI
+ MOVQ (AX), AX
+ ADDQ SI, AX
+ MOVQ AX, (SP)
+ MOVQ ctx+16(FP), AX
+ MOVQ 72(AX), DI
+ MOVQ 80(AX), R8
+ MOVQ 88(AX), R9
+ XORQ CX, CX
+ MOVQ CX, 8(SP)
+ MOVQ CX, 16(SP)
+ MOVQ CX, 24(SP)
+ MOVQ 112(AX), R10
+ MOVQ 128(AX), CX
+ MOVQ CX, 32(SP)
+ MOVQ 144(AX), R11
+ MOVQ 136(AX), R12
+ MOVQ 200(AX), CX
+ MOVQ CX, 56(SP)
+ MOVQ 176(AX), CX
+ MOVQ CX, 48(SP)
+ MOVQ 184(AX), AX
+ MOVQ AX, 40(SP)
+ MOVQ 40(SP), AX
+ ADDQ AX, 48(SP)
+
+ // Calculate poiter to s.out[cap(s.out)] (a past-end pointer)
+ ADDQ R10, 32(SP)
+
+ // outBase += outPosition
+ ADDQ R12, R10
+
+sequenceDecs_decodeSync_safe_amd64_main_loop:
+ MOVQ (SP), R13
+
+ // Fill bitreader to have enough for the offset and match length.
+ CMPQ SI, $0x08
+ JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
+ MOVQ BX, AX
+ SHRQ $0x03, AX
+ SUBQ AX, R13
+ MOVQ (R13), DX
+ SUBQ AX, SI
+ ANDQ $0x07, BX
+ JMP sequenceDecs_decodeSync_safe_amd64_fill_end
+
+sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
+ CMPQ SI, $0x00
+ JLE sequenceDecs_decodeSync_safe_amd64_fill_end
+ CMPQ BX, $0x07
+ JLE sequenceDecs_decodeSync_safe_amd64_fill_end
+ SHLQ $0x08, DX
+ SUBQ $0x01, R13
+ SUBQ $0x01, SI
+ SUBQ $0x08, BX
+ MOVBQZX (R13), AX
+ ORQ AX, DX
+ JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
+
+sequenceDecs_decodeSync_safe_amd64_fill_end:
+ // Update offset
+ MOVQ R9, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_safe_amd64_of_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_safe_amd64_of_update_zero:
+ MOVQ AX, 8(SP)
+
+ // Update match length
+ MOVQ R8, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
+ MOVQ AX, 16(SP)
+
+ // Fill bitreader to have enough for the remaining
+ CMPQ SI, $0x08
+ JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
+ MOVQ BX, AX
+ SHRQ $0x03, AX
+ SUBQ AX, R13
+ MOVQ (R13), DX
+ SUBQ AX, SI
+ ANDQ $0x07, BX
+ JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end
+
+sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
+ CMPQ SI, $0x00
+ JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
+ CMPQ BX, $0x07
+ JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
+ SHLQ $0x08, DX
+ SUBQ $0x01, R13
+ SUBQ $0x01, SI
+ SUBQ $0x08, BX
+ MOVBQZX (R13), AX
+ ORQ AX, DX
+ JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_safe_amd64_fill_2_end:
+ // Update literal length
+ MOVQ DI, AX
+ MOVQ BX, CX
+ MOVQ DX, R14
+ SHLQ CL, R14
+ MOVB AH, CL
+ SHRQ $0x20, AX
+ TESTQ CX, CX
+ JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
+ ADDQ CX, BX
+ CMPQ BX, $0x40
+ JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
+ CMPQ CX, $0x40
+ JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
+ NEGQ CX
+ SHRQ CL, R14
+ ADDQ R14, AX
+
+sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
+ MOVQ AX, 24(SP)
+
+ // Fill bitreader for state updates
+ MOVQ R13, (SP)
+ MOVQ R9, AX
+ SHRQ $0x08, AX
+ MOVBQZX AL, AX
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decodeSync_safe_amd64_skip_update
+
+ // Update Literal Length State
+ MOVBQZX DI, R13
+ SHRQ $0x10, DI
+ MOVWQZX DI, DI
+ LEAQ (BX)(R13*1), CX
+ MOVQ DX, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
+ ADDQ R14, DI
+
+ // Load ctx.llTable
+ MOVQ ctx+16(FP), CX
+ MOVQ (CX), CX
+ MOVQ (CX)(DI*8), DI
+
+ // Update Match Length State
+ MOVBQZX R8, R13
+ SHRQ $0x10, R8
+ MOVWQZX R8, R8
+ LEAQ (BX)(R13*1), CX
+ MOVQ DX, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
+ ADDQ R14, R8
+
+ // Load ctx.mlTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 24(CX), CX
+ MOVQ (CX)(R8*8), R8
+
+ // Update Offset State
+ MOVBQZX R9, R13
+ SHRQ $0x10, R9
+ MOVWQZX R9, R9
+ LEAQ (BX)(R13*1), CX
+ MOVQ DX, R14
+ MOVQ CX, BX
+ ROLQ CL, R14
+ MOVL $0x00000001, R15
+ MOVB R13, CL
+ SHLL CL, R15
+ DECL R15
+ ANDQ R15, R14
+ ADDQ R14, R9
+
+ // Load ctx.ofTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decodeSync_safe_amd64_skip_update:
+ // Adjust offset
+ MOVQ s+0(FP), CX
+ MOVQ 8(SP), R13
+ CMPQ AX, $0x01
+ JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
+ MOVUPS 144(CX), X0
+ MOVQ R13, 144(CX)
+ MOVUPS X0, 152(CX)
+ JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
+ CMPQ 24(SP), $0x00000000
+ JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
+ INCQ R13
+ JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
+ TESTQ R13, R13
+ JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
+ MOVQ 144(CX), R13
+ JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
+ MOVQ R13, AX
+ XORQ R14, R14
+ MOVQ $-1, R15
+ CMPQ R13, $0x03
+ CMOVQEQ R14, AX
+ CMOVQEQ R15, R14
+ ADDQ 144(CX)(AX*8), R14
+ JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
+ MOVQ $0x00000001, R14
+
+sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
+ CMPQ R13, $0x01
+ JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip
+ MOVQ 152(CX), AX
+ MOVQ AX, 160(CX)
+
+sequenceDecs_decodeSync_safe_amd64_adjust_skip:
+ MOVQ 144(CX), AX
+ MOVQ AX, 152(CX)
+ MOVQ R14, 144(CX)
+ MOVQ R14, R13
+
+sequenceDecs_decodeSync_safe_amd64_after_adjust:
+ MOVQ R13, 8(SP)
+
+ // Check values
+ MOVQ 16(SP), AX
+ MOVQ 24(SP), CX
+ LEAQ (AX)(CX*1), R14
+ MOVQ s+0(FP), R15
+ ADDQ R14, 256(R15)
+ MOVQ ctx+16(FP), R14
+ SUBQ CX, 104(R14)
+ JS error_not_enough_literals
+ CMPQ AX, $0x00020002
+ JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
+ TESTQ R13, R13
+ JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
+ TESTQ AX, AX
+ JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
+ MOVQ 24(SP), AX
+ MOVQ 8(SP), CX
+ MOVQ 16(SP), R13
+
+ // Check if we have enough space in s.out
+ LEAQ (AX)(R13*1), R14
+ ADDQ R10, R14
+ CMPQ R14, 32(SP)
+ JA error_not_enough_space
+
+ // Copy literals
+ TESTQ AX, AX
+ JZ check_offset
+ MOVQ AX, R14
+ SUBQ $0x10, R14
+ JB copy_1_small
+
+copy_1_loop:
+ MOVUPS (R11), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R11
+ ADDQ $0x10, R10
+ SUBQ $0x10, R14
+ JAE copy_1_loop
+ LEAQ 16(R11)(R14*1), R11
+ LEAQ 16(R10)(R14*1), R10
+ MOVUPS -16(R11), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_1_end
+
+copy_1_small:
+ CMPQ AX, $0x03
+ JE copy_1_move_3
+ JB copy_1_move_1or2
+ CMPQ AX, $0x08
+ JB copy_1_move_4through7
+ JMP copy_1_move_8through16
+
+copy_1_move_1or2:
+ MOVB (R11), R14
+ MOVB -1(R11)(AX*1), R15
+ MOVB R14, (R10)
+ MOVB R15, -1(R10)(AX*1)
+ ADDQ AX, R11
+ ADDQ AX, R10
+ JMP copy_1_end
+
+copy_1_move_3:
+ MOVW (R11), R14
+ MOVB 2(R11), R15
+ MOVW R14, (R10)
+ MOVB R15, 2(R10)
+ ADDQ AX, R11
+ ADDQ AX, R10
+ JMP copy_1_end
+
+copy_1_move_4through7:
+ MOVL (R11), R14
+ MOVL -4(R11)(AX*1), R15
+ MOVL R14, (R10)
+ MOVL R15, -4(R10)(AX*1)
+ ADDQ AX, R11
+ ADDQ AX, R10
+ JMP copy_1_end
+
+copy_1_move_8through16:
+ MOVQ (R11), R14
+ MOVQ -8(R11)(AX*1), R15
+ MOVQ R14, (R10)
+ MOVQ R15, -8(R10)(AX*1)
+ ADDQ AX, R11
+ ADDQ AX, R10
+
+copy_1_end:
+ ADDQ AX, R12
+
+ // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
+check_offset:
+ MOVQ R12, AX
+ ADDQ 40(SP), AX
+ CMPQ CX, AX
+ JG error_match_off_too_big
+ CMPQ CX, 56(SP)
+ JG error_match_off_too_big
+
+ // Copy match from history
+ MOVQ CX, AX
+ SUBQ R12, AX
+ JLS copy_match
+ MOVQ 48(SP), R14
+ SUBQ AX, R14
+ CMPQ R13, AX
+ JG copy_all_from_history
+ MOVQ R13, AX
+ SUBQ $0x10, AX
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R10
+ SUBQ $0x10, AX
+ JAE copy_4_loop
+ LEAQ 16(R14)(AX*1), R14
+ LEAQ 16(R10)(AX*1), R10
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), AX
+ MOVB 2(R14), CL
+ MOVW AX, (R10)
+ MOVB CL, 2(R10)
+ ADDQ R13, R14
+ ADDQ R13, R10
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), AX
+ MOVL -4(R14)(R13*1), CX
+ MOVL AX, (R10)
+ MOVL CX, -4(R10)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R10
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), AX
+ MOVQ -8(R14)(R13*1), CX
+ MOVQ AX, (R10)
+ MOVQ CX, -8(R10)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R10
+
+copy_4_end:
+ ADDQ R13, R12
+ JMP handle_loop
+ JMP loop_finished
+
+copy_all_from_history:
+ MOVQ AX, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R10
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(R10)(R15*1), R10
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ AX, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ AX, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(AX*1), BP
+ MOVB R15, (R10)
+ MOVB BP, -1(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (R10)
+ MOVB BP, 2(R10)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(AX*1), BP
+ MOVL R15, (R10)
+ MOVL BP, -4(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(AX*1), BP
+ MOVQ R15, (R10)
+ MOVQ BP, -8(R10)(AX*1)
+ ADDQ AX, R14
+ ADDQ AX, R10
+
+copy_5_end:
+ ADDQ AX, R12
+ SUBQ AX, R13
+
+ // Copy match from the current buffer
+copy_match:
+ MOVQ R10, AX
+ SUBQ CX, AX
+
+ // ml <= mo
+ CMPQ R13, CX
+ JA copy_overlapping_match
+
+ // Copy non-overlapping match
+ ADDQ R13, R12
+ MOVQ R13, CX
+ SUBQ $0x10, CX
+ JB copy_2_small
+
+copy_2_loop:
+ MOVUPS (AX), X0
+ MOVUPS X0, (R10)
+ ADDQ $0x10, AX
+ ADDQ $0x10, R10
+ SUBQ $0x10, CX
+ JAE copy_2_loop
+ LEAQ 16(AX)(CX*1), AX
+ LEAQ 16(R10)(CX*1), R10
+ MOVUPS -16(AX), X0
+ MOVUPS X0, -16(R10)
+ JMP copy_2_end
+
+copy_2_small:
+ CMPQ R13, $0x03
+ JE copy_2_move_3
+ JB copy_2_move_1or2
+ CMPQ R13, $0x08
+ JB copy_2_move_4through7
+ JMP copy_2_move_8through16
+
+copy_2_move_1or2:
+ MOVB (AX), CL
+ MOVB -1(AX)(R13*1), R14
+ MOVB CL, (R10)
+ MOVB R14, -1(R10)(R13*1)
+ ADDQ R13, AX
+ ADDQ R13, R10
+ JMP copy_2_end
+
+copy_2_move_3:
+ MOVW (AX), CX
+ MOVB 2(AX), R14
+ MOVW CX, (R10)
+ MOVB R14, 2(R10)
+ ADDQ R13, AX
+ ADDQ R13, R10
+ JMP copy_2_end
+
+copy_2_move_4through7:
+ MOVL (AX), CX
+ MOVL -4(AX)(R13*1), R14
+ MOVL CX, (R10)
+ MOVL R14, -4(R10)(R13*1)
+ ADDQ R13, AX
+ ADDQ R13, R10
+ JMP copy_2_end
+
+copy_2_move_8through16:
+ MOVQ (AX), CX
+ MOVQ -8(AX)(R13*1), R14
+ MOVQ CX, (R10)
+ MOVQ R14, -8(R10)(R13*1)
+ ADDQ R13, AX
+ ADDQ R13, R10
+
+copy_2_end:
+ JMP handle_loop
+
+ // Copy overlapping match
+copy_overlapping_match:
+ ADDQ R13, R12
+
+copy_slow_3:
+ MOVB (AX), CL
+ MOVB CL, (R10)
+ INCQ AX
+ INCQ R10
+ DECQ R13
+ JNZ copy_slow_3
+
+handle_loop:
+ MOVQ ctx+16(FP), AX
+ DECQ 96(AX)
+ JNS sequenceDecs_decodeSync_safe_amd64_main_loop
+
+loop_finished:
+ MOVQ br+8(FP), AX
+ MOVQ DX, 32(AX)
+ MOVB BL, 40(AX)
+ MOVQ SI, 24(AX)
+
+ // Update the context
+ MOVQ ctx+16(FP), AX
+ MOVQ R12, 136(AX)
+ MOVQ 144(AX), CX
+ SUBQ CX, R11
+ MOVQ R11, 168(AX)
+
+ // Return success
+ MOVQ $0x00000000, ret+24(FP)
+ RET
+
+ // Return with match length error
+sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
+ MOVQ 16(SP), AX
+ MOVQ ctx+16(FP), CX
+ MOVQ AX, 216(CX)
+ MOVQ $0x00000001, ret+24(FP)
+ RET
+
+ // Return with match too long error
+sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
+ MOVQ ctx+16(FP), AX
+ MOVQ 16(SP), CX
+ MOVQ CX, 216(AX)
+ MOVQ $0x00000002, ret+24(FP)
+ RET
+
+ // Return with match offset too long error
+error_match_off_too_big:
+ MOVQ ctx+16(FP), AX
+ MOVQ 8(SP), CX
+ MOVQ CX, 224(AX)
+ MOVQ R12, 136(AX)
+ MOVQ $0x00000003, ret+24(FP)
+ RET
+
+ // Return with not enough literals error
+error_not_enough_literals:
+ MOVQ ctx+16(FP), AX
+ MOVQ 24(SP), CX
+ MOVQ CX, 208(AX)
+ MOVQ $0x00000004, ret+24(FP)
+ RET
+
+ // Return with not enough output space error
+error_not_enough_space:
+ MOVQ ctx+16(FP), AX
+ MOVQ 24(SP), CX
+ MOVQ CX, 208(AX)
+ MOVQ 16(SP), CX
+ MOVQ CX, 216(AX)
+ MOVQ R12, 136(AX)
+ MOVQ $0x00000005, ret+24(FP)
+ RET
+
+// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: BMI, BMI2, CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
+ MOVQ br+8(FP), CX
+ MOVQ 32(CX), AX
+ MOVBQZX 40(CX), DX
+ MOVQ 24(CX), BX
+ MOVQ (CX), CX
+ ADDQ BX, CX
+ MOVQ CX, (SP)
+ MOVQ ctx+16(FP), CX
+ MOVQ 72(CX), SI
+ MOVQ 80(CX), DI
+ MOVQ 88(CX), R8
+ XORQ R9, R9
+ MOVQ R9, 8(SP)
+ MOVQ R9, 16(SP)
+ MOVQ R9, 24(SP)
+ MOVQ 112(CX), R9
+ MOVQ 128(CX), R10
+ MOVQ R10, 32(SP)
+ MOVQ 144(CX), R10
+ MOVQ 136(CX), R11
+ MOVQ 200(CX), R12
+ MOVQ R12, 56(SP)
+ MOVQ 176(CX), R12
+ MOVQ R12, 48(SP)
+ MOVQ 184(CX), CX
+ MOVQ CX, 40(SP)
+ MOVQ 40(SP), CX
+ ADDQ CX, 48(SP)
+
+ // Calculate poiter to s.out[cap(s.out)] (a past-end pointer)
+ ADDQ R9, 32(SP)
+
+ // outBase += outPosition
+ ADDQ R11, R9
+
+sequenceDecs_decodeSync_safe_bmi2_main_loop:
+ MOVQ (SP), R12
+
+ // Fill bitreader to have enough for the offset and match length.
+ CMPQ BX, $0x08
+ JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
+ MOVQ DX, CX
+ SHRQ $0x03, CX
+ SUBQ CX, R12
+ MOVQ (R12), AX
+ SUBQ CX, BX
+ ANDQ $0x07, DX
+ JMP sequenceDecs_decodeSync_safe_bmi2_fill_end
+
+sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
+ CMPQ BX, $0x00
+ JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
+ CMPQ DX, $0x07
+ JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
+ SHLQ $0x08, AX
+ SUBQ $0x01, R12
+ SUBQ $0x01, BX
+ SUBQ $0x08, DX
+ MOVBQZX (R12), CX
+ ORQ CX, AX
+ JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
+
+sequenceDecs_decodeSync_safe_bmi2_fill_end:
+ // Update offset
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R13
+ MOVQ AX, R14
+ LEAQ (DX)(R13*1), CX
+ ROLQ CL, R14
+ BZHIQ R13, R14, R14
+ MOVQ CX, DX
+ MOVQ R8, CX
+ SHRQ $0x20, CX
+ ADDQ R14, CX
+ MOVQ CX, 8(SP)
+
+ // Update match length
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, DI, R13
+ MOVQ AX, R14
+ LEAQ (DX)(R13*1), CX
+ ROLQ CL, R14
+ BZHIQ R13, R14, R14
+ MOVQ CX, DX
+ MOVQ DI, CX
+ SHRQ $0x20, CX
+ ADDQ R14, CX
+ MOVQ CX, 16(SP)
+
+ // Fill bitreader to have enough for the remaining
+ CMPQ BX, $0x08
+ JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
+ MOVQ DX, CX
+ SHRQ $0x03, CX
+ SUBQ CX, R12
+ MOVQ (R12), AX
+ SUBQ CX, BX
+ ANDQ $0x07, DX
+ JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+
+sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
+ CMPQ BX, $0x00
+ JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+ CMPQ DX, $0x07
+ JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+ SHLQ $0x08, AX
+ SUBQ $0x01, R12
+ SUBQ $0x01, BX
+ SUBQ $0x08, DX
+ MOVBQZX (R12), CX
+ ORQ CX, AX
+ JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
+ // Update literal length
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, SI, R13
+ MOVQ AX, R14
+ LEAQ (DX)(R13*1), CX
+ ROLQ CL, R14
+ BZHIQ R13, R14, R14
+ MOVQ CX, DX
+ MOVQ SI, CX
+ SHRQ $0x20, CX
+ ADDQ R14, CX
+ MOVQ CX, 24(SP)
+
+ // Fill bitreader for state updates
+ MOVQ R12, (SP)
+ MOVQ $0x00000808, CX
+ BEXTRQ CX, R8, R12
+ MOVQ ctx+16(FP), CX
+ CMPQ 96(CX), $0x00
+ JZ sequenceDecs_decodeSync_safe_bmi2_skip_update
+ LEAQ (SI)(DI*1), R13
+ ADDQ R8, R13
+ MOVBQZX R13, R13
+ LEAQ (DX)(R13*1), CX
+ MOVQ AX, R14
+ MOVQ CX, DX
+ ROLQ CL, R14
+ BZHIQ R13, R14, R14
+
+ // Update Offset State
+ BZHIQ R8, R14, CX
+ SHRXQ R8, R14, R14
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, R8, R8
+ ADDQ CX, R8
+
+ // Load ctx.ofTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 48(CX), CX
+ MOVQ (CX)(R8*8), R8
+
+ // Update Match Length State
+ BZHIQ DI, R14, CX
+ SHRXQ DI, R14, R14
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, DI, DI
+ ADDQ CX, DI
+
+ // Load ctx.mlTable
+ MOVQ ctx+16(FP), CX
+ MOVQ 24(CX), CX
+ MOVQ (CX)(DI*8), DI
+
+ // Update Literal Length State
+ BZHIQ SI, R14, CX
+ MOVQ $0x00001010, R13
+ BEXTRQ R13, SI, SI
+ ADDQ CX, SI
+
+ // Load ctx.llTable
+ MOVQ ctx+16(FP), CX
+ MOVQ (CX), CX
+ MOVQ (CX)(SI*8), SI
+
+sequenceDecs_decodeSync_safe_bmi2_skip_update:
+ // Adjust offset
+ MOVQ s+0(FP), CX
+ MOVQ 8(SP), R13
+ CMPQ R12, $0x01
+ JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
+ MOVUPS 144(CX), X0
+ MOVQ R13, 144(CX)
+ MOVUPS X0, 152(CX)
+ JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
+ CMPQ 24(SP), $0x00000000
+ JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
+ INCQ R13
+ JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
+ TESTQ R13, R13
+ JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
+ MOVQ 144(CX), R13
+ JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
+ MOVQ R13, R12
+ XORQ R14, R14
+ MOVQ $-1, R15
+ CMPQ R13, $0x03
+ CMOVQEQ R14, R12
+ CMOVQEQ R15, R14
+ ADDQ 144(CX)(R12*8), R14
+ JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
+ MOVQ $0x00000001, R14
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
+ CMPQ R13, $0x01
+ JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip
+ MOVQ 152(CX), R12
+ MOVQ R12, 160(CX)
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
+ MOVQ 144(CX), R12
+ MOVQ R12, 152(CX)
+ MOVQ R14, 144(CX)
+ MOVQ R14, R13
+
+sequenceDecs_decodeSync_safe_bmi2_after_adjust:
+ MOVQ R13, 8(SP)
+
+ // Check values
+ MOVQ 16(SP), CX
+ MOVQ 24(SP), R12
+ LEAQ (CX)(R12*1), R14
+ MOVQ s+0(FP), R15
+ ADDQ R14, 256(R15)
+ MOVQ ctx+16(FP), R14
+ SUBQ R12, 104(R14)
+ JS error_not_enough_literals
+ CMPQ CX, $0x00020002
+ JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
+ TESTQ R13, R13
+ JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
+ TESTQ CX, CX
+ JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
+ MOVQ 24(SP), CX
+ MOVQ 8(SP), R12
+ MOVQ 16(SP), R13
+
+ // Check if we have enough space in s.out
+ LEAQ (CX)(R13*1), R14
+ ADDQ R9, R14
+ CMPQ R14, 32(SP)
+ JA error_not_enough_space
+
+ // Copy literals
+ TESTQ CX, CX
+ JZ check_offset
+ MOVQ CX, R14
+ SUBQ $0x10, R14
+ JB copy_1_small
+
+copy_1_loop:
+ MOVUPS (R10), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R10
+ ADDQ $0x10, R9
+ SUBQ $0x10, R14
+ JAE copy_1_loop
+ LEAQ 16(R10)(R14*1), R10
+ LEAQ 16(R9)(R14*1), R9
+ MOVUPS -16(R10), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_1_end
+
+copy_1_small:
+ CMPQ CX, $0x03
+ JE copy_1_move_3
+ JB copy_1_move_1or2
+ CMPQ CX, $0x08
+ JB copy_1_move_4through7
+ JMP copy_1_move_8through16
+
+copy_1_move_1or2:
+ MOVB (R10), R14
+ MOVB -1(R10)(CX*1), R15
+ MOVB R14, (R9)
+ MOVB R15, -1(R9)(CX*1)
+ ADDQ CX, R10
+ ADDQ CX, R9
+ JMP copy_1_end
+
+copy_1_move_3:
+ MOVW (R10), R14
+ MOVB 2(R10), R15
+ MOVW R14, (R9)
+ MOVB R15, 2(R9)
+ ADDQ CX, R10
+ ADDQ CX, R9
+ JMP copy_1_end
+
+copy_1_move_4through7:
+ MOVL (R10), R14
+ MOVL -4(R10)(CX*1), R15
+ MOVL R14, (R9)
+ MOVL R15, -4(R9)(CX*1)
+ ADDQ CX, R10
+ ADDQ CX, R9
+ JMP copy_1_end
+
+copy_1_move_8through16:
+ MOVQ (R10), R14
+ MOVQ -8(R10)(CX*1), R15
+ MOVQ R14, (R9)
+ MOVQ R15, -8(R9)(CX*1)
+ ADDQ CX, R10
+ ADDQ CX, R9
+
+copy_1_end:
+ ADDQ CX, R11
+
+ // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
+check_offset:
+ MOVQ R11, CX
+ ADDQ 40(SP), CX
+ CMPQ R12, CX
+ JG error_match_off_too_big
+ CMPQ R12, 56(SP)
+ JG error_match_off_too_big
+
+ // Copy match from history
+ MOVQ R12, CX
+ SUBQ R11, CX
+ JLS copy_match
+ MOVQ 48(SP), R14
+ SUBQ CX, R14
+ CMPQ R13, CX
+ JG copy_all_from_history
+ MOVQ R13, CX
+ SUBQ $0x10, CX
+ JB copy_4_small
+
+copy_4_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R9
+ SUBQ $0x10, CX
+ JAE copy_4_loop
+ LEAQ 16(R14)(CX*1), R14
+ LEAQ 16(R9)(CX*1), R9
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_4_end
+
+copy_4_small:
+ CMPQ R13, $0x03
+ JE copy_4_move_3
+ CMPQ R13, $0x08
+ JB copy_4_move_4through7
+ JMP copy_4_move_8through16
+
+copy_4_move_3:
+ MOVW (R14), CX
+ MOVB 2(R14), R12
+ MOVW CX, (R9)
+ MOVB R12, 2(R9)
+ ADDQ R13, R14
+ ADDQ R13, R9
+ JMP copy_4_end
+
+copy_4_move_4through7:
+ MOVL (R14), CX
+ MOVL -4(R14)(R13*1), R12
+ MOVL CX, (R9)
+ MOVL R12, -4(R9)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R9
+ JMP copy_4_end
+
+copy_4_move_8through16:
+ MOVQ (R14), CX
+ MOVQ -8(R14)(R13*1), R12
+ MOVQ CX, (R9)
+ MOVQ R12, -8(R9)(R13*1)
+ ADDQ R13, R14
+ ADDQ R13, R9
+
+copy_4_end:
+ ADDQ R13, R11
+ JMP handle_loop
+ JMP loop_finished
+
+copy_all_from_history:
+ MOVQ CX, R15
+ SUBQ $0x10, R15
+ JB copy_5_small
+
+copy_5_loop:
+ MOVUPS (R14), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, R14
+ ADDQ $0x10, R9
+ SUBQ $0x10, R15
+ JAE copy_5_loop
+ LEAQ 16(R14)(R15*1), R14
+ LEAQ 16(R9)(R15*1), R9
+ MOVUPS -16(R14), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_5_end
+
+copy_5_small:
+ CMPQ CX, $0x03
+ JE copy_5_move_3
+ JB copy_5_move_1or2
+ CMPQ CX, $0x08
+ JB copy_5_move_4through7
+ JMP copy_5_move_8through16
+
+copy_5_move_1or2:
+ MOVB (R14), R15
+ MOVB -1(R14)(CX*1), BP
+ MOVB R15, (R9)
+ MOVB BP, -1(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_3:
+ MOVW (R14), R15
+ MOVB 2(R14), BP
+ MOVW R15, (R9)
+ MOVB BP, 2(R9)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_4through7:
+ MOVL (R14), R15
+ MOVL -4(R14)(CX*1), BP
+ MOVL R15, (R9)
+ MOVL BP, -4(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+ JMP copy_5_end
+
+copy_5_move_8through16:
+ MOVQ (R14), R15
+ MOVQ -8(R14)(CX*1), BP
+ MOVQ R15, (R9)
+ MOVQ BP, -8(R9)(CX*1)
+ ADDQ CX, R14
+ ADDQ CX, R9
+
+copy_5_end:
+ ADDQ CX, R11
+ SUBQ CX, R13
+
+ // Copy match from the current buffer
+copy_match:
+ MOVQ R9, CX
+ SUBQ R12, CX
+
+ // ml <= mo
+ CMPQ R13, R12
+ JA copy_overlapping_match
+
+ // Copy non-overlapping match
+ ADDQ R13, R11
+ MOVQ R13, R12
+ SUBQ $0x10, R12
+ JB copy_2_small
+
+copy_2_loop:
+ MOVUPS (CX), X0
+ MOVUPS X0, (R9)
+ ADDQ $0x10, CX
+ ADDQ $0x10, R9
+ SUBQ $0x10, R12
+ JAE copy_2_loop
+ LEAQ 16(CX)(R12*1), CX
+ LEAQ 16(R9)(R12*1), R9
+ MOVUPS -16(CX), X0
+ MOVUPS X0, -16(R9)
+ JMP copy_2_end
+
+copy_2_small:
+ CMPQ R13, $0x03
+ JE copy_2_move_3
+ JB copy_2_move_1or2
+ CMPQ R13, $0x08
+ JB copy_2_move_4through7
+ JMP copy_2_move_8through16
+
+copy_2_move_1or2:
+ MOVB (CX), R12
+ MOVB -1(CX)(R13*1), R14
+ MOVB R12, (R9)
+ MOVB R14, -1(R9)(R13*1)
+ ADDQ R13, CX
+ ADDQ R13, R9
+ JMP copy_2_end
+
+copy_2_move_3:
+ MOVW (CX), R12
+ MOVB 2(CX), R14
+ MOVW R12, (R9)
+ MOVB R14, 2(R9)
+ ADDQ R13, CX
+ ADDQ R13, R9
+ JMP copy_2_end
+
+copy_2_move_4through7:
+ MOVL (CX), R12
+ MOVL -4(CX)(R13*1), R14
+ MOVL R12, (R9)
+ MOVL R14, -4(R9)(R13*1)
+ ADDQ R13, CX
+ ADDQ R13, R9
+ JMP copy_2_end
+
+copy_2_move_8through16:
+ MOVQ (CX), R12
+ MOVQ -8(CX)(R13*1), R14
+ MOVQ R12, (R9)
+ MOVQ R14, -8(R9)(R13*1)
+ ADDQ R13, CX
+ ADDQ R13, R9
+
+copy_2_end:
+ JMP handle_loop
+
+ // Copy overlapping match
+copy_overlapping_match:
+ ADDQ R13, R11
+
+copy_slow_3:
+ MOVB (CX), R12
+ MOVB R12, (R9)
+ INCQ CX
+ INCQ R9
+ DECQ R13
+ JNZ copy_slow_3
+
+handle_loop:
+ MOVQ ctx+16(FP), CX
+ DECQ 96(CX)
+ JNS sequenceDecs_decodeSync_safe_bmi2_main_loop
+
+loop_finished:
+ MOVQ br+8(FP), CX
+ MOVQ AX, 32(CX)
+ MOVB DL, 40(CX)
+ MOVQ BX, 24(CX)
+
+ // Update the context
+ MOVQ ctx+16(FP), AX
+ MOVQ R11, 136(AX)
+ MOVQ 144(AX), CX
+ SUBQ CX, R10
+ MOVQ R10, 168(AX)
+
+ // Return success
+ MOVQ $0x00000000, ret+24(FP)
+ RET
+
+ // Return with match length error
+sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
+ MOVQ 16(SP), AX
+ MOVQ ctx+16(FP), CX
+ MOVQ AX, 216(CX)
+ MOVQ $0x00000001, ret+24(FP)
+ RET
+
+ // Return with match too long error
+sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
+ MOVQ ctx+16(FP), AX
+ MOVQ 16(SP), CX
+ MOVQ CX, 216(AX)
+ MOVQ $0x00000002, ret+24(FP)
+ RET
+
+ // Return with match offset too long error
+error_match_off_too_big:
+ MOVQ ctx+16(FP), AX
+ MOVQ 8(SP), CX
+ MOVQ CX, 224(AX)
+ MOVQ R11, 136(AX)
+ MOVQ $0x00000003, ret+24(FP)
+ RET
+
+ // Return with not enough literals error
+error_not_enough_literals:
+ MOVQ ctx+16(FP), AX
+ MOVQ 24(SP), CX
+ MOVQ CX, 208(AX)
+ MOVQ $0x00000004, ret+24(FP)
+ RET
+
+ // Return with not enough output space error
+error_not_enough_space:
+ MOVQ ctx+16(FP), AX
+ MOVQ 24(SP), CX
+ MOVQ CX, 208(AX)
+ MOVQ 16(SP), CX
+ MOVQ CX, 216(AX)
+ MOVQ R11, 136(AX)
+ MOVQ $0x00000005, ret+24(FP)
+ RET
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
new file mode 100644
index 0000000000..c3452bc3a9
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
@@ -0,0 +1,237 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+package zstd
+
+import (
+ "fmt"
+ "io"
+)
+
+// decode sequences from the stream with the provided history but without dictionary.
+func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
+ return false, nil
+}
+
+// decode sequences from the stream without the provided history.
+func (s *sequenceDecs) decode(seqs []seqVals) error {
+ br := s.br
+
+ // Grab full sizes tables, to avoid bounds checks.
+ llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
+ llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
+ s.seqSize = 0
+ litRemain := len(s.literals)
+
+ maxBlockSize := maxCompressedBlockSize
+ if s.windowSize < maxBlockSize {
+ maxBlockSize = s.windowSize
+ }
+ for i := range seqs {
+ var ll, mo, ml int
+ if br.off > 4+((maxOffsetBits+16+16)>>3) {
+ // inlined function:
+ // ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
+
+ // Final will not read from stream.
+ var llB, mlB, moB uint8
+ ll, llB = llState.final()
+ ml, mlB = mlState.final()
+ mo, moB = ofState.final()
+
+ // extra bits are stored in reverse order.
+ br.fillFast()
+ mo += br.getBits(moB)
+ if s.maxBits > 32 {
+ br.fillFast()
+ }
+ ml += br.getBits(mlB)
+ ll += br.getBits(llB)
+
+ if moB > 1 {
+ s.prevOffset[2] = s.prevOffset[1]
+ s.prevOffset[1] = s.prevOffset[0]
+ s.prevOffset[0] = mo
+ } else {
+ // mo = s.adjustOffset(mo, ll, moB)
+ // Inlined for rather big speedup
+ if ll == 0 {
+ // There is an exception though, when current sequence's literals_length = 0.
+ // In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
+ // an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
+ mo++
+ }
+
+ if mo == 0 {
+ mo = s.prevOffset[0]
+ } else {
+ var temp int
+ if mo == 3 {
+ temp = s.prevOffset[0] - 1
+ } else {
+ temp = s.prevOffset[mo]
+ }
+
+ if temp == 0 {
+ // 0 is not valid; input is corrupted; force offset to 1
+ println("WARNING: temp was 0")
+ temp = 1
+ }
+
+ if mo != 1 {
+ s.prevOffset[2] = s.prevOffset[1]
+ }
+ s.prevOffset[1] = s.prevOffset[0]
+ s.prevOffset[0] = temp
+ mo = temp
+ }
+ }
+ br.fillFast()
+ } else {
+ if br.overread() {
+ if debugDecoder {
+ printf("reading sequence %d, exceeded available data\n", i)
+ }
+ return io.ErrUnexpectedEOF
+ }
+ ll, mo, ml = s.next(br, llState, mlState, ofState)
+ br.fill()
+ }
+
+ if debugSequences {
+ println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
+ }
+ // Evaluate.
+ // We might be doing this async, so do it early.
+ if mo == 0 && ml > 0 {
+ return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+ }
+ if ml > maxMatchLen {
+ return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
+ }
+ s.seqSize += ll + ml
+ if s.seqSize > maxBlockSize {
+ return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+ }
+ litRemain -= ll
+ if litRemain < 0 {
+ return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll)
+ }
+ seqs[i] = seqVals{
+ ll: ll,
+ ml: ml,
+ mo: mo,
+ }
+ if i == len(seqs)-1 {
+ // This is the last sequence, so we shouldn't update state.
+ break
+ }
+
+ // Manually inlined, ~ 5-20% faster
+ // Update all 3 states at once. Approx 20% faster.
+ nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
+ if nBits == 0 {
+ llState = llTable[llState.newState()&maxTableMask]
+ mlState = mlTable[mlState.newState()&maxTableMask]
+ ofState = ofTable[ofState.newState()&maxTableMask]
+ } else {
+ bits := br.get32BitsFast(nBits)
+ lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
+ llState = llTable[(llState.newState()+lowBits)&maxTableMask]
+
+ lowBits = uint16(bits >> (ofState.nbBits() & 31))
+ lowBits &= bitMask[mlState.nbBits()&15]
+ mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
+
+ lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
+ ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
+ }
+ }
+ s.seqSize += litRemain
+ if s.seqSize > maxBlockSize {
+ return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+ }
+ err := br.close()
+ if err != nil {
+ printf("Closing sequences: %v, %+v\n", err, *br)
+ }
+ return err
+}
+
+// executeSimple handles cases when a dictionary is not used.
+func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
+ // Ensure we have enough output size...
+ if len(s.out)+s.seqSize > cap(s.out) {
+ addBytes := s.seqSize + len(s.out)
+ s.out = append(s.out, make([]byte, addBytes)...)
+ s.out = s.out[:len(s.out)-addBytes]
+ }
+
+ if debugDecoder {
+ printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
+ }
+
+ var t = len(s.out)
+ out := s.out[:t+s.seqSize]
+
+ for _, seq := range seqs {
+ // Add literals
+ copy(out[t:], s.literals[:seq.ll])
+ t += seq.ll
+ s.literals = s.literals[seq.ll:]
+
+ // Malformed input
+ if seq.mo > t+len(hist) || seq.mo > s.windowSize {
+ return fmt.Errorf("match offset (%d) bigger than current history (%d)", seq.mo, t+len(hist))
+ }
+
+ // Copy from history.
+ if v := seq.mo - t; v > 0 {
+ // v is the start position in history from end.
+ start := len(hist) - v
+ if seq.ml > v {
+ // Some goes into the current block.
+ // Copy remainder of history
+ copy(out[t:], hist[start:])
+ t += v
+ seq.ml -= v
+ } else {
+ copy(out[t:], hist[start:start+seq.ml])
+ t += seq.ml
+ continue
+ }
+ }
+
+ // We must be in the current buffer now
+ if seq.ml > 0 {
+ start := t - seq.mo
+ if seq.ml <= t-start {
+ // No overlap
+ copy(out[t:], out[start:start+seq.ml])
+ t += seq.ml
+ } else {
+ // Overlapping copy
+ // Extend destination slice and copy one byte at the time.
+ src := out[start : start+seq.ml]
+ dst := out[t:]
+ dst = dst[:len(src)]
+ t += len(src)
+ // Destination is the space we just added.
+ for i := range src {
+ dst[i] = src[i]
+ }
+ }
+ }
+ }
+ // Add final literals
+ copy(out[t:], s.literals)
+ if debugDecoder {
+ t += len(s.literals)
+ if t != len(out) {
+ panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
+ }
+ }
+ s.out = out
+
+ return nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/zip.go b/vendor/github.com/klauspost/compress/zstd/zip.go
index ffffcbc254..29c15c8c4e 100644
--- a/vendor/github.com/klauspost/compress/zstd/zip.go
+++ b/vendor/github.com/klauspost/compress/zstd/zip.go
@@ -18,26 +18,44 @@ const ZipMethodWinZip = 93
// See https://pkware.cachefly.net/webdocs/APPNOTE/APPNOTE-6.3.9.TXT
const ZipMethodPKWare = 20
-var zipReaderPool sync.Pool
+// zipReaderPool is the default reader pool.
+var zipReaderPool = sync.Pool{New: func() interface{} {
+ z, err := NewReader(nil, WithDecoderLowmem(true), WithDecoderMaxWindow(128<<20), WithDecoderConcurrency(1))
+ if err != nil {
+ panic(err)
+ }
+ return z
+}}
// newZipReader creates a pooled zip decompressor.
-func newZipReader(r io.Reader) io.ReadCloser {
- dec, ok := zipReaderPool.Get().(*Decoder)
- if ok {
- dec.Reset(r)
- } else {
- d, err := NewReader(r, WithDecoderConcurrency(1), WithDecoderLowmem(true))
- if err != nil {
- panic(err)
- }
- dec = d
+func newZipReader(opts ...DOption) func(r io.Reader) io.ReadCloser {
+ pool := &zipReaderPool
+ if len(opts) > 0 {
+ opts = append([]DOption{WithDecoderLowmem(true), WithDecoderMaxWindow(128 << 20)}, opts...)
+ // Force concurrency 1
+ opts = append(opts, WithDecoderConcurrency(1))
+ // Create our own pool
+ pool = &sync.Pool{}
+ }
+ return func(r io.Reader) io.ReadCloser {
+ dec, ok := pool.Get().(*Decoder)
+ if ok {
+ dec.Reset(r)
+ } else {
+ d, err := NewReader(r, opts...)
+ if err != nil {
+ panic(err)
+ }
+ dec = d
+ }
+ return &pooledZipReader{dec: dec, pool: pool}
}
- return &pooledZipReader{dec: dec}
}
type pooledZipReader struct {
- mu sync.Mutex // guards Close and Read
- dec *Decoder
+ mu sync.Mutex // guards Close and Read
+ pool *sync.Pool
+ dec *Decoder
}
func (r *pooledZipReader) Read(p []byte) (n int, err error) {
@@ -48,8 +66,8 @@ func (r *pooledZipReader) Read(p []byte) (n int, err error) {
}
dec, err := r.dec.Read(p)
if err == io.EOF {
- err = r.dec.Reset(nil)
- zipReaderPool.Put(r.dec)
+ r.dec.Reset(nil)
+ r.pool.Put(r.dec)
r.dec = nil
}
return dec, err
@@ -61,7 +79,7 @@ func (r *pooledZipReader) Close() error {
var err error
if r.dec != nil {
err = r.dec.Reset(nil)
- zipReaderPool.Put(r.dec)
+ r.pool.Put(r.dec)
r.dec = nil
}
return err
@@ -115,6 +133,9 @@ func ZipCompressor(opts ...EOption) func(w io.Writer) (io.WriteCloser, error) {
// ZipDecompressor returns a decompressor that can be registered with zip libraries.
// See ZipCompressor for example.
-func ZipDecompressor() func(r io.Reader) io.ReadCloser {
- return newZipReader
+// Options can be specified. WithDecoderConcurrency(1) is forced,
+// and by default a 128MB maximum decompression window is specified.
+// The window size can be overridden if required.
+func ZipDecompressor(opts ...DOption) func(r io.Reader) io.ReadCloser {
+ return newZipReader(opts...)
}
diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go
index c1c90b4a07..3eb3f1c826 100644
--- a/vendor/github.com/klauspost/compress/zstd/zstd.go
+++ b/vendor/github.com/klauspost/compress/zstd/zstd.go
@@ -110,17 +110,6 @@ func printf(format string, a ...interface{}) {
}
}
-// matchLenFast does matching, but will not match the last up to 7 bytes.
-func matchLenFast(a, b []byte) int {
- endI := len(a) & (math.MaxInt32 - 7)
- for i := 0; i < endI; i += 8 {
- if diff := load64(a, i) ^ load64(b, i); diff != 0 {
- return i + bits.TrailingZeros64(diff)>>3
- }
- }
- return endI
-}
-
// matchLen returns the maximum length.
// a must be the shortest of the two.
// The function also returns whether all bytes matched.
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 680c3fc680..902216658f 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -454,11 +454,12 @@ github.com/ishidawataru/sctp
# github.com/jmespath/go-jmespath v0.3.0
## explicit; go 1.14
github.com/jmespath/go-jmespath
-# github.com/klauspost/compress v1.15.1
-## explicit; go 1.15
+# github.com/klauspost/compress v1.15.9
+## explicit; go 1.16
github.com/klauspost/compress
github.com/klauspost/compress/fse
github.com/klauspost/compress/huff0
+github.com/klauspost/compress/internal/cpuinfo
github.com/klauspost/compress/internal/snapref
github.com/klauspost/compress/zstd
github.com/klauspost/compress/zstd/internal/xxhash