forgejo/vendor/github.com/klauspost/crc32/crc32_s390x.s

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build s390x

#include "textflag.h"

// Vector register range containing CRC-32 constants

#define CONST_PERM_LE2BE        V9
#define CONST_R2R1              V10
#define CONST_R4R3              V11
#define CONST_R5                V12
#define CONST_RU_POLY           V13
#define CONST_CRC_POLY          V14

// The CRC-32 constant block contains reduction constants to fold and
// process particular chunks of the input data stream in parallel.
//
// Note that the constant definitions below are extended in order to compute
// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
// The rightmost doubleword can be 0 to prevent contribution to the result or
// can be multiplied by 1 to perform an XOR without the need for a separate
// VECTOR EXCLUSIVE OR instruction.
//
// The polynomials used are bit-reflected:
//
//            IEEE: P'(x) = 0x0edb88320
//      Castagnoli: P'(x) = 0x082f63b78

// Constant pool for the IEEE polynomial (bit-reflected form).
//
// The pool is laid out as consecutive 16-byte vectors, in the order in
// which vectorizedBody loads them into CONST_PERM_LE2BE..CONST_CRC_POLY
// (V9..V14): the doubleword at the lower offset is the leftmost
// doubleword of the register.

// Offsets 0..15 -> CONST_PERM_LE2BE: LE-to-BE byte permutation mask
DATA ·crcleconskp+0(SB)/8, $0x0f0e0d0c0b0a0908
DATA ·crcleconskp+8(SB)/8, $0x0706050403020100

// Offsets 16..31 -> CONST_R2R1: R2 (leftmost), R1 (rightmost)
DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596
DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4

// Offsets 32..47 -> CONST_R4R3: R4 (leftmost), R3 (rightmost)
DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e
DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0

// Offsets 48..63 -> CONST_R5: zero (leftmost), R5 (rightmost)
DATA ·crcleconskp+48(SB)/8, $0x0000000000000000
DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124

// Offsets 64..79 -> CONST_RU_POLY: zero (leftmost), u' (rightmost)
DATA ·crcleconskp+64(SB)/8, $0x0000000000000000
DATA ·crcleconskp+72(SB)/8, $0x00000001f7011641

// Offsets 80..95 -> CONST_CRC_POLY: zero (leftmost), P'(x) << 1 (rightmost)
DATA ·crcleconskp+80(SB)/8, $0x0000000000000000
DATA ·crcleconskp+88(SB)/8, $0x00000001db710641

// NOTE(review): the symbol is declared as 144 bytes although only the
// first 96 bytes (six vectors) are initialized above; the remainder is
// presumably zero padding. Confirm whether $96 was intended.
GLOBL ·crcleconskp(SB), RODATA, $144

// Constant pool for the Castagnoli polynomial (bit-reflected form).
//
// The pool is laid out as consecutive 16-byte vectors, in the order in
// which vectorizedBody loads them into CONST_PERM_LE2BE..CONST_CRC_POLY
// (V9..V14): the doubleword at the lower offset is the leftmost
// doubleword of the register.

// Offsets 0..15 -> CONST_PERM_LE2BE: LE-to-BE byte permutation mask
DATA ·crccleconskp+0(SB)/8, $0x0f0e0d0c0b0a0908
DATA ·crccleconskp+8(SB)/8, $0x0706050403020100

// Offsets 16..31 -> CONST_R2R1: R2 (leftmost), R1 (rightmost)
DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8
DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02

// Offsets 32..47 -> CONST_R4R3: R4 (leftmost), R3 (rightmost)
DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6
DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe

// Offsets 48..63 -> CONST_R5: zero (leftmost), R5 (rightmost)
DATA ·crccleconskp+48(SB)/8, $0x0000000000000000
DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8

// Offsets 64..79 -> CONST_RU_POLY: zero (leftmost), u' (rightmost)
DATA ·crccleconskp+64(SB)/8, $0x0000000000000000
DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1

// Offsets 80..95 -> CONST_CRC_POLY: zero (leftmost), P'(x) << 1 (rightmost)
DATA ·crccleconskp+80(SB)/8, $0x0000000000000000
DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0

// NOTE(review): the symbol is declared as 144 bytes although only the
// first 96 bytes (six vectors) are initialized above; the remainder is
// presumably zero padding. Confirm whether $96 was intended.
GLOBL ·crccleconskp(SB), RODATA, $144

// func hasVectorFacility() bool
//
// Reports whether vector instructions may be used. Two conditions must
// hold: the facility list stored by STFLE has the 0x40 bit set in byte 16,
// and a byte written into a vector register reads back unchanged.
TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
	// Zero the 24-byte scratch area that will receive the facility list.
	MOVD $fac-24(SP), R1
	XC   $24, 0(R1), 0(R1)

	// STFLE expects (number of doublewords to store - 1) in R0.
	MOVD $2, R0
	WORD $0xB2B01000 // STFLE 0(R1)
	XOR  R0, R0      // put R0 back to zero

	// Inspect byte 16 of the list (first byte of the third doubleword).
	MOVBZ fac16-8(SP), R1
	AND   $0x40, R1
	BEQ   no_vx

	// The facility bit is set; verify that vector instructions are
	// actually enabled by doing a register round-trip through V16.
	VLEIB  $0, $0xF, V16
	VLGVB  $0, V16, R1
	CMPBNE R1, $0xF, no_vx
	MOVB   $1, ret+0(FP) // vector facility usable
	RET

no_vx:
	MOVB $0, ret+0(FP) // vector facility not usable
	RET

// The CRC-32 function(s) use these calling conventions:
//
// Parameters:
//
//      R2:    Initial CRC value, typically ~0; and final CRC (return) value.
//      R3:    Input buffer pointer, performance might be improved if the
//             buffer is on a doubleword boundary.
//      R4:    Length of the buffer, must be 64 bytes or greater.
//
// Register usage:
//
//      R5:     CRC-32 constant pool base pointer.
//      V0:     Initial CRC value and intermediate constants and results.
//      V1..V4: Data for CRC computation.
//      V5..V8: Next data chunks that are fetched from the input buffer.
//
//      V9..V14: CRC-32 constants.

// func vectorizedIEEE(crc uint32, p []byte) uint32
//
// Entry point for the IEEE polynomial. It places the arguments and the
// IEEE constant pool address in R2..R5 and branches to vectorizedBody,
// which stores the result and returns to the caller.
TEXT ·vectorizedIEEE(SB), NOSPLIT, $0
	MOVD  $·crcleconskp(SB), R5 // R5 = IEEE constant pool
	MOVD  p_len+16(FP), R4      // R4 = len(p)
	MOVD  p+8(FP), R3           // R3 = data pointer of p
	MOVWZ crc+0(FP), R2         // R2 = crc, zero-extended

	BR vectorizedBody<>(SB)

// func vectorizedCastagnoli(crc uint32, p []byte) uint32
//
// Entry point for the Castagnoli polynomial. It places the arguments and
// the Castagnoli constant pool address in R2..R5 and branches to
// vectorizedBody, which stores the result and returns to the caller.
TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0
	MOVD  $·crccleconskp(SB), R5 // R5 = Castagnoli constant pool
	MOVD  p_len+16(FP), R4       // R4 = len(p)
	MOVD  p+8(FP), R3            // R3 = data pointer of p
	MOVWZ crc+0(FP), R2          // R2 = crc, zero-extended

	BR vectorizedBody<>(SB)

// vectorizedBody is the shared CRC-32 folding routine reached by a branch
// from vectorizedIEEE and vectorizedCastagnoli. It takes its inputs in
// registers and stores the result through FP into the ret slot of the
// entry point that branched here.
//
// Inputs:
//      R2: crc argument (bitwise inverted on entry)
//      R3: pointer to the input data
//      R4: number of input bytes; values below 64 take the crash path
//      R5: base address of the constant pool loaded into V9..V14
//
// Only whole 16-byte chunks are consumed: once fewer than 16 bytes remain
// (R4 < 16) the leftover bytes are not read by this routine.
//
// Output: the bitwise-inverted 32-bit result is stored to ret + 32(FP).
TEXT vectorizedBody<>(SB), NOSPLIT, $0
	XOR $0xffffffff, R2                         // NOTW R2: invert the incoming CRC
	VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY // load the six constant vectors V9..V14

	// Load the initial CRC value into the rightmost word of V0
	VZERO V0
	VLVGF $3, R2, V0

	// Crash if the input size is less than 64-bytes.
	CMP R4, $64
	BLT crash

	// Load a 64-byte data chunk and XOR with CRC
	VLM 0(R3), V1, V4 // 64-bytes into V1..V4

	// Permute the bytes of each vector with the LE-to-BE mask; the CRC is
	// computed in the bit-reflected domain.
	VPERM V1, V1, CONST_PERM_LE2BE, V1
	VPERM V2, V2, CONST_PERM_LE2BE, V2
	VPERM V3, V3, CONST_PERM_LE2BE, V3
	VPERM V4, V4, CONST_PERM_LE2BE, V4

	VX  V0, V1, V1 // V1 ^= CRC
	ADD $64, R3    // BUF = BUF + 64
	ADD $(-64), R4 // LEN = LEN - 64

	// Check remaining buffer size and jump to proper folding method
	CMP R4, $64
	BLT less_than_64bytes

fold_64bytes_loop:
	// Load the next 64-byte data chunk into V5 to V8
	VLM   0(R3), V5, V8
	VPERM V5, V5, CONST_PERM_LE2BE, V5
	VPERM V6, V6, CONST_PERM_LE2BE, V6
	VPERM V7, V7, CONST_PERM_LE2BE, V7
	VPERM V8, V8, CONST_PERM_LE2BE, V8

	// Perform a GF(2) multiplication of the doublewords in V1 with
	// the reduction constants in CONST_R2R1.  The intermediate result is
	// then folded (accumulated) with the next data chunk in V5 and
	// stored in V1.  Repeat this step for the register contents
	// in V2, V3, and V4 respectively.

	VGFMAG CONST_R2R1, V1, V5, V1
	VGFMAG CONST_R2R1, V2, V6, V2
	VGFMAG CONST_R2R1, V3, V7, V3
	VGFMAG CONST_R2R1, V4, V8, V4

	// Adjust buffer pointer and length for next loop
	ADD $64, R3    // BUF = BUF + 64
	ADD $(-64), R4 // LEN = LEN - 64

	CMP R4, $64
	BGE fold_64bytes_loop

less_than_64bytes:
	// Fold V1 to V4 into a single 128-bit value in V1
	VGFMAG CONST_R4R3, V1, V2, V1
	VGFMAG CONST_R4R3, V1, V3, V1
	VGFMAG CONST_R4R3, V1, V4, V1

	// Check whether at least one more whole 16-byte chunk remains to be
	// folded; otherwise go straight to the final reduction.
	CMP R4, $16
	BLT final_fold

fold_16bytes_loop:
	VL    0(R3), V2                    // Load next data chunk
	VPERM V2, V2, CONST_PERM_LE2BE, V2

	VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk

	// Adjust buffer pointer and size for folding next data chunk
	ADD $16, R3
	ADD $-16, R4

	// Process remaining data chunks
	CMP R4, $16
	BGE fold_16bytes_loop

final_fold:
	// No further input is loaded from here on, so V9 (CONST_PERM_LE2BE)
	// is reused as the shift-count operand of VSRLB.  The count is placed
	// in byte element 7; 0x40 presumably selects a shift by one
	// doubleword (0x20 below is used for a shift by one word).
	VLEIB $7, $0x40, V9

	// Build the multiplier in V0: CONST_R4R3 shifted right, which should
	// leave R4 in the rightmost doubleword, with the leftmost doubleword
	// then set to 1.
	VSRLB V9, CONST_R4R3, V0
	VLEIG $0, $1, V0

	// Multiply V1 by V0.  The doubleword multiplied by 1 is carried over
	// unchanged and XORed with the other product, folding V1 further.
	VGFMG V0, V1, V1

	// Final fold with R5: V2 receives V1 shifted right by one word, V1 is
	// unpacked so that its rightmost doubleword is split into two
	// doubleword elements, and the product with CONST_R5 is XORed with V2.
	VLEIB  $7, $0x20, V9        // Shift by words
	VSRLB  V9, V1, V2           // Store remaining bits in V2
	VUPLLF V1, V1               // Split rightmost doubleword
	VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2

	// The input values to the Barrett reduction are the degree-63 polynomial
	// in V1 (R(x)), degree-32 generator polynomial, and the reduction
	// constant u.  The Barrett reduction result is the CRC value of R(x) mod
	// P(x).
	//
	// The Barrett reduction algorithm is defined as:
	//
	//    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
	//    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
	//    3. C(x)  = R(x) XOR T2(x) mod x^32
	//
	// Note: To compensate the division by x^32, use the vector unpack
	// instruction to move the leftmost word into the leftmost doubleword
	// of the vector register.  The rightmost doubleword is multiplied
	// with zero to not contribute to the intermediate results.

	// T1(x) = floor( R(x) / x^32 ) GF2MUL u
	VUPLLF V1, V2
	VGFMG  CONST_RU_POLY, V2, V2

	// Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY
	// with T1(x) in V2 and XOR the intermediate result, T2(x), with the
	// value in V1.  The final result is read from word element 2 of V2
	// below.

	VUPLLF V2, V2
	VGFMAG CONST_CRC_POLY, V2, V1, V2

done:
	VLGVF $2, V2, R2       // R2 = word element 2 of V2
	XOR   $0xffffffff, R2  // NOTW R2: invert the result
	MOVWZ R2, ret + 32(FP) // store into the entry point's result slot
	RET

crash:
	// Input size is less than 64-bytes: store through R0 to force a
	// fault (presumably R0 holds 0 here, making this a nil store).
	MOVD $0, (R0)
Added all required dependencies 2016-11-03 18:16:01 -04:00			`// Copyright 2016 The Go Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style`
			`// license that can be found in the LICENSE file.`

			`// +build s390x`

			`#include "textflag.h"`

			`// Vector register range containing CRC-32 constants`

			`#define CONST_PERM_LE2BE V9`
			`#define CONST_R2R1 V10`
			`#define CONST_R4R3 V11`
			`#define CONST_R5 V12`
			`#define CONST_RU_POLY V13`
			`#define CONST_CRC_POLY V14`

			`// The CRC-32 constant block contains reduction constants to fold and`
			`// process particular chunks of the input data stream in parallel.`
			`//`
			`// Note that the constant definitions below are extended in order to compute`
			`// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.`
			`// The rightmost doubleword can be 0 to prevent contribution to the result or`
			`// can be multiplied by 1 to perform an XOR without the need for a separate`
			`// VECTOR EXCLUSIVE OR instruction.`
			`//`
			`// The polynomials used are bit-reflected:`
			`//`
			`// IEEE: P'(x) = 0x0edb88320`
			`// Castagnoli: P'(x) = 0x082f63b78`

			`// IEEE polynomial constants`
			`DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask`
			`DATA ·crcleconskp+8(SB)/8, $0x0706050403020100`
			`DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596 // R2`
			`DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4 // R1`
			`DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e // R4`
			`DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0 // R3`
			`DATA ·crcleconskp+48(SB)/8, $0x0000000000000000`
			`DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124 // R5`
			`DATA ·crcleconskp+64(SB)/8, $0x0000000000000000`
			`DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641 // u'`
			`DATA ·crcleconskp+80(SB)/8, $0x0000000000000000`
			`DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641 // P'(x) << 1`

			`GLOBL ·crcleconskp(SB), RODATA, $144`

			`// Castagnoli Polynomial constants`
			`DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask`
			`DATA ·crccleconskp+8(SB)/8, $0x0706050403020100`
			`DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8 // R2`
			`DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02 // R1`
			`DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6 // R4`
			`DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe // R3`
			`DATA ·crccleconskp+48(SB)/8, $0x0000000000000000`
			`DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8 // R5`
			`DATA ·crccleconskp+64(SB)/8, $0x0000000000000000`
			`DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1 // u'`
			`DATA ·crccleconskp+80(SB)/8, $0x0000000000000000`
			`DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1`

			`GLOBL ·crccleconskp(SB), RODATA, $144`

			`// func hasVectorFacility() bool`
			`TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1`
			`MOVD $x-24(SP), R1`
			`XC $24, 0(R1), 0(R1) // clear the storage`
			`MOVD $2, R0 // R0 is the number of double words stored -1`
			`WORD $0xB2B01000 // STFLE 0(R1)`
			`XOR R0, R0 // reset the value of R0`
			`MOVBZ z-8(SP), R1`
			`AND $0x40, R1`
			`BEQ novector`

			`vectorinstalled:`
			`// check if the vector instruction has been enabled`
			`VLEIB $0, $0xF, V16`
			`VLGVB $0, V16, R1`
			`CMPBNE R1, $0xF, novector`
			`MOVB $1, ret+0(FP) // have vx`
			`RET`

			`novector:`
			`MOVB $0, ret+0(FP) // no vx`
			`RET`

			`// The CRC-32 function(s) use these calling conventions:`
			`//`
			`// Parameters:`
			`//`
			`// R2: Initial CRC value, typically ~0; and final CRC (return) value.`
			`// R3: Input buffer pointer, performance might be improved if the`
			`// buffer is on a doubleword boundary.`
			`// R4: Length of the buffer, must be 64 bytes or greater.`
			`//`
			`// Register usage:`
			`//`
			`// R5: CRC-32 constant pool base pointer.`
			`// V0: Initial CRC value and intermediate constants and results.`
			`// V1..V4: Data for CRC computation.`
			`// V5..V8: Next data chunks that are fetched from the input buffer.`
			`//`
			`// V9..V14: CRC-32 constants.`

			`// func vectorizedIEEE(crc uint32, p []byte) uint32`
			`TEXT ·vectorizedIEEE(SB), NOSPLIT, $0`
			`MOVWZ crc+0(FP), R2 // R2 stores the CRC value`
			`MOVD p+8(FP), R3 // data pointer`
			`MOVD p_len+16(FP), R4 // len(p)`

			`MOVD $·crcleconskp(SB), R5`
			`BR vectorizedBody<>(SB)`

			`// func vectorizedCastagnoli(crc uint32, p []byte) uint32`
			`TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0`
			`MOVWZ crc+0(FP), R2 // R2 stores the CRC value`
			`MOVD p+8(FP), R3 // data pointer`
			`MOVD p_len+16(FP), R4 // len(p)`

			`// R5: crc-32 constant pool base pointer, constant is used to reduce crc`
			`MOVD $·crccleconskp(SB), R5`
			`BR vectorizedBody<>(SB)`

			`TEXT vectorizedBody<>(SB), NOSPLIT, $0`
			`XOR $0xffffffff, R2 // NOTW R2`
			`VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY`

			`// Load the initial CRC value into the rightmost word of V0`
			`VZERO V0`
			`VLVGF $3, R2, V0`

			`// Crash if the input size is less than 64-bytes.`
			`CMP R4, $64`
			`BLT crash`

			`// Load a 64-byte data chunk and XOR with CRC`
			`VLM 0(R3), V1, V4 // 64-bytes into V1..V4`

			`// Reflect the data if the CRC operation is in the bit-reflected domain`
			`VPERM V1, V1, CONST_PERM_LE2BE, V1`
			`VPERM V2, V2, CONST_PERM_LE2BE, V2`
			`VPERM V3, V3, CONST_PERM_LE2BE, V3`
			`VPERM V4, V4, CONST_PERM_LE2BE, V4`

			`VX V0, V1, V1 // V1 ^= CRC`
			`ADD $64, R3 // BUF = BUF + 64`
			`ADD $(-64), R4`

			`// Check remaining buffer size and jump to proper folding method`
			`CMP R4, $64`
			`BLT less_than_64bytes`

			`fold_64bytes_loop:`
			`// Load the next 64-byte data chunk into V5 to V8`
			`VLM 0(R3), V5, V8`
			`VPERM V5, V5, CONST_PERM_LE2BE, V5`
			`VPERM V6, V6, CONST_PERM_LE2BE, V6`
			`VPERM V7, V7, CONST_PERM_LE2BE, V7`
			`VPERM V8, V8, CONST_PERM_LE2BE, V8`

			`// Perform a GF(2) multiplication of the doublewords in V1 with`
			`// the reduction constants in CONST_R2R1. The intermediate result is`
			`// then folded (accumulated) with the next data chunk in V5 and`
			`// stored in V1. Repeat this step for the register contents`
			`// in V2, V3, and V4 respectively.`

			`VGFMAG CONST_R2R1, V1, V5, V1`
			`VGFMAG CONST_R2R1, V2, V6, V2`
			`VGFMAG CONST_R2R1, V3, V7, V3`
			`VGFMAG CONST_R2R1, V4, V8, V4`

			`// Adjust buffer pointer and length for next loop`
			`ADD $64, R3 // BUF = BUF + 64`
			`ADD $(-64), R4 // LEN = LEN - 64`

			`CMP R4, $64`
			`BGE fold_64bytes_loop`

			`less_than_64bytes:`
			`// Fold V1 to V4 into a single 128-bit value in V1`
			`VGFMAG CONST_R4R3, V1, V2, V1`
			`VGFMAG CONST_R4R3, V1, V3, V1`
			`VGFMAG CONST_R4R3, V1, V4, V1`

			`// Check whether to continue with 64-bit folding`
			`CMP R4, $16`
			`BLT final_fold`

			`fold_16bytes_loop:`
			`VL 0(R3), V2 // Load next data chunk`
			`VPERM V2, V2, CONST_PERM_LE2BE, V2`

			`VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk`

			`// Adjust buffer pointer and size for folding next data chunk`
			`ADD $16, R3`
			`ADD $-16, R4`

			`// Process remaining data chunks`
			`CMP R4, $16`
			`BGE fold_16bytes_loop`

			`final_fold:`
			`VLEIB $7, $0x40, V9`
			`VSRLB V9, CONST_R4R3, V0`
			`VLEIG $0, $1, V0`

			`VGFMG V0, V1, V1`

			`VLEIB $7, $0x20, V9 // Shift by words`
			`VSRLB V9, V1, V2 // Store remaining bits in V2`
			`VUPLLF V1, V1 // Split rightmost doubleword`
			`VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2`

			`// The input values to the Barrett reduction are the degree-63 polynomial`
			`// in V1 (R(x)), degree-32 generator polynomial, and the reduction`
			`// constant u. The Barrett reduction result is the CRC value of R(x) mod`
			`// P(x).`
			`//`
			`// The Barrett reduction algorithm is defined as:`
			`//`
			`// 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u`
			`// 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)`
			`// 3. C(x) = R(x) XOR T2(x) mod x^32`
			`//`
			`// Note: To compensate the division by x^32, use the vector unpack`
			`// instruction to move the leftmost word into the leftmost doubleword`
			`// of the vector register. The rightmost doubleword is multiplied`
			`// with zero to not contribute to the intermediate results.`

			`// T1(x) = floor( R(x) / x^32 ) GF2MUL u`
			`VUPLLF V1, V2`
			`VGFMG CONST_RU_POLY, V2, V2`

			`// Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY with T1(x) in`
			`// V2 and XOR the intermediate result, T2(x), with the value in V1.`
			`// The final result is in the rightmost word of V2.`

			`VUPLLF V2, V2`
			`VGFMAG CONST_CRC_POLY, V2, V1, V2`

			`done:`
			`VLGVF $2, V2, R2`
			`XOR $0xffffffff, R2 // NOTW R2`
			`MOVWZ R2, ret + 32(FP)`
			`RET`

			`crash:`
			`MOVD $0, (R0) // input size is less than 64-bytes`