forked from toolshed/abra
		
	We were running behind and there were quite a few deprecations to update. The changes were mostly in the upstream copied package and are quite minimal.
		
			
				
	
	
		
			4152 lines
		
	
	
		
			82 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			4152 lines
		
	
	
		
			82 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
 | 
						|
 | 
						|
//go:build !appengine && !noasm && gc
 | 
						|
 | 
						|
// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
//
// Baseline (no BMI2) sequence decoder: per iteration it refills the bit
// reader twice, reads the offset, match-length and literal-length codes,
// optionally advances the three FSE states, applies zstd repeat-offset
// adjustment, and appends one 24-byte {ll, ml, ofs} record at R10.
//
// Register roles (fixed by the generator; offsets into s/br/ctx are emitted
// by gen.go — confirm against the Go struct layouts before editing):
//   DX  = bit buffer          BX = bits already consumed (0..63)
//   R14 = input read pointer  SI = bytes remaining in input
//   DI / R8 / R9 = literal-length / match-length / offset FSE state words
//   R10 = output sequence pointer (advanced by 0x18 per sequence)
//   R11 / R12 / R13 = offset history loaded from s+144/152/160
//                     (presumably s.prevOffset — TODO confirm)
// Return code in ret+24(FP): 0 = ok, 1 = match len/offset mismatch,
// 2 = match length too big, 4 = not enough literals, 6 = overread.
TEXT ·sequenceDecs_decode_amd64(SB), $8-32
	MOVQ    br+8(FP), CX
	MOVQ    24(CX), DX          // bit buffer
	MOVBQZX 40(CX), BX          // bits consumed
	MOVQ    (CX), AX
	MOVQ    32(CX), SI          // bytes remaining
	ADDQ    SI, AX
	MOVQ    AX, (SP)            // (SP) = current read pointer (end of unread input)
	MOVQ    ctx+16(FP), AX
	MOVQ    72(AX), DI
	MOVQ    80(AX), R8
	MOVQ    88(AX), R9
	MOVQ    104(AX), R10
	MOVQ    s+0(FP), AX
	MOVQ    144(AX), R11
	MOVQ    152(AX), R12
	MOVQ    160(AX), R13

sequenceDecs_decode_amd64_main_loop:
	MOVQ (SP), R14

	// Fill bitreader to have enough for the offset and match length.
	CMPQ SI, $0x08
	JL   sequenceDecs_decode_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX              // whole bytes consumed so far
	SUBQ AX, R14
	MOVQ (R14), DX              // reload a full 64-bit window
	SUBQ AX, SI
	ANDQ $0x07, BX              // keep the sub-byte remainder
	JMP  sequenceDecs_decode_amd64_fill_end

sequenceDecs_decode_amd64_fill_byte_by_byte:
	CMPQ    SI, $0x00
	JLE     sequenceDecs_decode_amd64_fill_check_overread
	CMPQ    BX, $0x07
	JLE     sequenceDecs_decode_amd64_fill_end
	SHLQ    $0x08, DX
	SUBQ    $0x01, R14
	SUBQ    $0x01, SI
	SUBQ    $0x08, BX
	MOVBQZX (R14), AX
	ORQ     AX, DX
	JMP     sequenceDecs_decode_amd64_fill_byte_by_byte

sequenceDecs_decode_amd64_fill_check_overread:
	CMPQ BX, $0x40              // consumed > 64 bits with no input left -> overread
	JA   error_overread

sequenceDecs_decode_amd64_fill_end:
	// Update offset
	MOVQ  R9, AX
	MOVQ  BX, CX
	MOVQ  DX, R15
	SHLQ  CL, R15
	MOVB  AH, CL                // CL = bit count for this code (byte 1 of state word)
	SHRQ  $0x20, AX             // AX = baseline (high 32 bits of state word)
	TESTQ CX, CX
	JZ    sequenceDecs_decode_amd64_of_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decode_amd64_of_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decode_amd64_of_update_zero
	NEGQ  CX
	SHRQ  CL, R15               // extract CL top bits of the shifted window
	ADDQ  R15, AX

sequenceDecs_decode_amd64_of_update_zero:
	MOVQ AX, 16(R10)            // seq.ofs

	// Update match length
	MOVQ  R8, AX
	MOVQ  BX, CX
	MOVQ  DX, R15
	SHLQ  CL, R15
	MOVB  AH, CL
	SHRQ  $0x20, AX
	TESTQ CX, CX
	JZ    sequenceDecs_decode_amd64_ml_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decode_amd64_ml_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decode_amd64_ml_update_zero
	NEGQ  CX
	SHRQ  CL, R15
	ADDQ  R15, AX

sequenceDecs_decode_amd64_ml_update_zero:
	MOVQ AX, 8(R10)             // seq.ml

	// Fill bitreader to have enough for the remaining
	CMPQ SI, $0x08
	JL   sequenceDecs_decode_amd64_fill_2_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP  sequenceDecs_decode_amd64_fill_2_end

sequenceDecs_decode_amd64_fill_2_byte_by_byte:
	CMPQ    SI, $0x00
	JLE     sequenceDecs_decode_amd64_fill_2_check_overread
	CMPQ    BX, $0x07
	JLE     sequenceDecs_decode_amd64_fill_2_end
	SHLQ    $0x08, DX
	SUBQ    $0x01, R14
	SUBQ    $0x01, SI
	SUBQ    $0x08, BX
	MOVBQZX (R14), AX
	ORQ     AX, DX
	JMP     sequenceDecs_decode_amd64_fill_2_byte_by_byte

sequenceDecs_decode_amd64_fill_2_check_overread:
	CMPQ BX, $0x40
	JA   error_overread

sequenceDecs_decode_amd64_fill_2_end:
	// Update literal length
	MOVQ  DI, AX
	MOVQ  BX, CX
	MOVQ  DX, R15
	SHLQ  CL, R15
	MOVB  AH, CL
	SHRQ  $0x20, AX
	TESTQ CX, CX
	JZ    sequenceDecs_decode_amd64_ll_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decode_amd64_ll_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decode_amd64_ll_update_zero
	NEGQ  CX
	SHRQ  CL, R15
	ADDQ  R15, AX

sequenceDecs_decode_amd64_ll_update_zero:
	MOVQ AX, (R10)              // seq.ll

	// Fill bitreader for state updates
	MOVQ    R14, (SP)
	MOVQ    R9, AX
	SHRQ    $0x08, AX
	MOVBQZX AL, AX              // AX = offset code bit count (used for repeat-offset test below)
	MOVQ    ctx+16(FP), CX
	CMPQ    96(CX), $0x00       // last sequence? then skip the state advance
	JZ      sequenceDecs_decode_amd64_skip_update

	// Update Literal Length State
	MOVBQZX DI, R14             // R14 = nbBits (byte 0 of state word)
	SHRL    $0x10, DI           // DI = next-state base
	LEAQ    (BX)(R14*1), CX
	MOVQ    DX, R15
	MOVQ    CX, BX
	ROLQ    CL, R15
	MOVL    $0x00000001, BP
	MOVB    R14, CL
	SHLL    CL, BP
	DECL    BP                  // BP = (1<<nbBits)-1 mask
	ANDQ    BP, R15
	ADDQ    R15, DI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R14
	SHRL    $0x10, R8
	LEAQ    (BX)(R14*1), CX
	MOVQ    DX, R15
	MOVQ    CX, BX
	ROLQ    CL, R15
	MOVL    $0x00000001, BP
	MOVB    R14, CL
	SHLL    CL, BP
	DECL    BP
	ANDQ    BP, R15
	ADDQ    R15, R8

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R14
	SHRL    $0x10, R9
	LEAQ    (BX)(R14*1), CX
	MOVQ    DX, R15
	MOVQ    CX, BX
	ROLQ    CL, R15
	MOVL    $0x00000001, BP
	MOVB    R14, CL
	SHLL    CL, BP
	DECL    BP
	ANDQ    BP, R15
	ADDQ    R15, R9

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decode_amd64_skip_update:
	// Adjust offset
	MOVQ 16(R10), CX
	CMPQ AX, $0x01
	JBE  sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
	MOVQ R12, R13               // offset code > 1: literal offset, rotate history
	MOVQ R11, R12
	MOVQ CX, R11
	JMP  sequenceDecs_decode_amd64_after_adjust

sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
	CMPQ (R10), $0x00000000     // ll == 0 shifts the repeat-offset index by one
	JNE  sequenceDecs_decode_amd64_adjust_offset_maybezero
	INCQ CX
	JMP  sequenceDecs_decode_amd64_adjust_offset_nonzero

sequenceDecs_decode_amd64_adjust_offset_maybezero:
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_amd64_adjust_offset_nonzero
	MOVQ  R11, CX               // repeat offset 1: reuse most recent offset
	JMP   sequenceDecs_decode_amd64_after_adjust

sequenceDecs_decode_amd64_adjust_offset_nonzero:
	CMPQ CX, $0x01
	JB   sequenceDecs_decode_amd64_adjust_zero
	JEQ  sequenceDecs_decode_amd64_adjust_one
	CMPQ CX, $0x02
	JA   sequenceDecs_decode_amd64_adjust_three
	JMP  sequenceDecs_decode_amd64_adjust_two

sequenceDecs_decode_amd64_adjust_zero:
	MOVQ R11, AX
	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_one:
	MOVQ R12, AX
	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_two:
	MOVQ R13, AX
	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_three:
	LEAQ -1(R11), AX            // repeat "offset1 - 1" case

sequenceDecs_decode_amd64_adjust_test_temp_valid:
	TESTQ AX, AX
	JNZ   sequenceDecs_decode_amd64_adjust_temp_valid
	MOVQ  $0x00000001, AX       // never allow a zero offset

sequenceDecs_decode_amd64_adjust_temp_valid:
	CMPQ    CX, $0x01
	CMOVQNE R12, R13
	MOVQ    R11, R12
	MOVQ    AX, R11
	MOVQ    AX, CX

sequenceDecs_decode_amd64_after_adjust:
	MOVQ CX, 16(R10)            // store final offset

	// Check values
	MOVQ  8(R10), AX
	MOVQ  (R10), R14
	LEAQ  (AX)(R14*1), R15      // ml + ll
	MOVQ  s+0(FP), BP
	ADDQ  R15, 256(BP)
	MOVQ  ctx+16(FP), R15
	SUBQ  R14, 128(R15)
	JS    error_not_enough_literals
	CMPQ  AX, $0x00020002       // max match length sanity bound
	JA    sequenceDecs_decode_amd64_error_match_len_too_big
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_amd64_match_len_ofs_ok
	TESTQ AX, AX                // ofs == 0 with ml != 0 is invalid
	JNZ   sequenceDecs_decode_amd64_error_match_len_ofs_mismatch

sequenceDecs_decode_amd64_match_len_ofs_ok:
	ADDQ $0x18, R10             // next output record
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS  sequenceDecs_decode_amd64_main_loop
	MOVQ s+0(FP), AX            // done: write back offset history and bit reader
	MOVQ R11, 144(AX)
	MOVQ R12, 152(AX)
	MOVQ R13, 160(AX)
	MOVQ br+8(FP), AX
	MOVQ DX, 24(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 32(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_amd64_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// NOTE(review): unreachable — no label precedes this return and nothing
	// jumps here; kept as generated (DO NOT EDIT output of gen.go).
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET
 | 
						|
 | 
						|
// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
//
// Fast-path variant of sequenceDecs_decode_amd64 for streams where each
// sequence needs at most 56 bits: it performs a single bit-reader refill per
// iteration (no second fill before the literal-length read). Register roles,
// state layout and return codes are identical to sequenceDecs_decode_amd64:
//   DX = bit buffer, BX = bits consumed, R14 = read ptr, SI = bytes left,
//   DI/R8/R9 = ll/ml/of FSE state words, R10 = output record pointer,
//   R11/R12/R13 = offset history from s+144/152/160.
// ret+24(FP): 0 ok, 1 match len/offset mismatch, 2 match len too big,
// 4 not enough literals, 6 overread.
TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
	MOVQ    br+8(FP), CX
	MOVQ    24(CX), DX
	MOVBQZX 40(CX), BX
	MOVQ    (CX), AX
	MOVQ    32(CX), SI
	ADDQ    SI, AX
	MOVQ    AX, (SP)
	MOVQ    ctx+16(FP), AX
	MOVQ    72(AX), DI
	MOVQ    80(AX), R8
	MOVQ    88(AX), R9
	MOVQ    104(AX), R10
	MOVQ    s+0(FP), AX
	MOVQ    144(AX), R11
	MOVQ    152(AX), R12
	MOVQ    160(AX), R13

sequenceDecs_decode_56_amd64_main_loop:
	MOVQ (SP), R14

	// Fill bitreader to have enough for the offset and match length.
	CMPQ SI, $0x08
	JL   sequenceDecs_decode_56_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP  sequenceDecs_decode_56_amd64_fill_end

sequenceDecs_decode_56_amd64_fill_byte_by_byte:
	CMPQ    SI, $0x00
	JLE     sequenceDecs_decode_56_amd64_fill_check_overread
	CMPQ    BX, $0x07
	JLE     sequenceDecs_decode_56_amd64_fill_end
	SHLQ    $0x08, DX
	SUBQ    $0x01, R14
	SUBQ    $0x01, SI
	SUBQ    $0x08, BX
	MOVBQZX (R14), AX
	ORQ     AX, DX
	JMP     sequenceDecs_decode_56_amd64_fill_byte_by_byte

sequenceDecs_decode_56_amd64_fill_check_overread:
	CMPQ BX, $0x40
	JA   error_overread

sequenceDecs_decode_56_amd64_fill_end:
	// Update offset
	MOVQ  R9, AX
	MOVQ  BX, CX
	MOVQ  DX, R15
	SHLQ  CL, R15
	MOVB  AH, CL                // CL = bit count (byte 1 of state word)
	SHRQ  $0x20, AX             // AX = baseline (high 32 bits)
	TESTQ CX, CX
	JZ    sequenceDecs_decode_56_amd64_of_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decode_56_amd64_of_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decode_56_amd64_of_update_zero
	NEGQ  CX
	SHRQ  CL, R15
	ADDQ  R15, AX

sequenceDecs_decode_56_amd64_of_update_zero:
	MOVQ AX, 16(R10)            // seq.ofs

	// Update match length
	MOVQ  R8, AX
	MOVQ  BX, CX
	MOVQ  DX, R15
	SHLQ  CL, R15
	MOVB  AH, CL
	SHRQ  $0x20, AX
	TESTQ CX, CX
	JZ    sequenceDecs_decode_56_amd64_ml_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decode_56_amd64_ml_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decode_56_amd64_ml_update_zero
	NEGQ  CX
	SHRQ  CL, R15
	ADDQ  R15, AX

sequenceDecs_decode_56_amd64_ml_update_zero:
	MOVQ AX, 8(R10)             // seq.ml

	// Update literal length
	// (no second refill here: the <=56-bit guarantee covers all three codes)
	MOVQ  DI, AX
	MOVQ  BX, CX
	MOVQ  DX, R15
	SHLQ  CL, R15
	MOVB  AH, CL
	SHRQ  $0x20, AX
	TESTQ CX, CX
	JZ    sequenceDecs_decode_56_amd64_ll_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decode_56_amd64_ll_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decode_56_amd64_ll_update_zero
	NEGQ  CX
	SHRQ  CL, R15
	ADDQ  R15, AX

sequenceDecs_decode_56_amd64_ll_update_zero:
	MOVQ AX, (R10)              // seq.ll

	// Fill bitreader for state updates
	MOVQ    R14, (SP)
	MOVQ    R9, AX
	SHRQ    $0x08, AX
	MOVBQZX AL, AX              // AX = offset code bit count (for repeat-offset test)
	MOVQ    ctx+16(FP), CX
	CMPQ    96(CX), $0x00       // last sequence? skip state advance
	JZ      sequenceDecs_decode_56_amd64_skip_update

	// Update Literal Length State
	MOVBQZX DI, R14             // nbBits
	SHRL    $0x10, DI           // next-state base
	LEAQ    (BX)(R14*1), CX
	MOVQ    DX, R15
	MOVQ    CX, BX
	ROLQ    CL, R15
	MOVL    $0x00000001, BP
	MOVB    R14, CL
	SHLL    CL, BP
	DECL    BP                  // (1<<nbBits)-1 mask
	ANDQ    BP, R15
	ADDQ    R15, DI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R14
	SHRL    $0x10, R8
	LEAQ    (BX)(R14*1), CX
	MOVQ    DX, R15
	MOVQ    CX, BX
	ROLQ    CL, R15
	MOVL    $0x00000001, BP
	MOVB    R14, CL
	SHLL    CL, BP
	DECL    BP
	ANDQ    BP, R15
	ADDQ    R15, R8

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R14
	SHRL    $0x10, R9
	LEAQ    (BX)(R14*1), CX
	MOVQ    DX, R15
	MOVQ    CX, BX
	ROLQ    CL, R15
	MOVL    $0x00000001, BP
	MOVB    R14, CL
	SHLL    CL, BP
	DECL    BP
	ANDQ    BP, R15
	ADDQ    R15, R9

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decode_56_amd64_skip_update:
	// Adjust offset
	MOVQ 16(R10), CX
	CMPQ AX, $0x01
	JBE  sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
	MOVQ R12, R13               // literal offset: rotate history
	MOVQ R11, R12
	MOVQ CX, R11
	JMP  sequenceDecs_decode_56_amd64_after_adjust

sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
	CMPQ (R10), $0x00000000     // ll == 0 shifts the repeat-offset index
	JNE  sequenceDecs_decode_56_amd64_adjust_offset_maybezero
	INCQ CX
	JMP  sequenceDecs_decode_56_amd64_adjust_offset_nonzero

sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_56_amd64_adjust_offset_nonzero
	MOVQ  R11, CX               // reuse most recent offset
	JMP   sequenceDecs_decode_56_amd64_after_adjust

sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
	CMPQ CX, $0x01
	JB   sequenceDecs_decode_56_amd64_adjust_zero
	JEQ  sequenceDecs_decode_56_amd64_adjust_one
	CMPQ CX, $0x02
	JA   sequenceDecs_decode_56_amd64_adjust_three
	JMP  sequenceDecs_decode_56_amd64_adjust_two

sequenceDecs_decode_56_amd64_adjust_zero:
	MOVQ R11, AX
	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_one:
	MOVQ R12, AX
	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_two:
	MOVQ R13, AX
	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_three:
	LEAQ -1(R11), AX

sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
	TESTQ AX, AX
	JNZ   sequenceDecs_decode_56_amd64_adjust_temp_valid
	MOVQ  $0x00000001, AX       // never allow a zero offset

sequenceDecs_decode_56_amd64_adjust_temp_valid:
	CMPQ    CX, $0x01
	CMOVQNE R12, R13
	MOVQ    R11, R12
	MOVQ    AX, R11
	MOVQ    AX, CX

sequenceDecs_decode_56_amd64_after_adjust:
	MOVQ CX, 16(R10)            // store final offset

	// Check values
	MOVQ  8(R10), AX
	MOVQ  (R10), R14
	LEAQ  (AX)(R14*1), R15      // ml + ll
	MOVQ  s+0(FP), BP
	ADDQ  R15, 256(BP)
	MOVQ  ctx+16(FP), R15
	SUBQ  R14, 128(R15)
	JS    error_not_enough_literals
	CMPQ  AX, $0x00020002
	JA    sequenceDecs_decode_56_amd64_error_match_len_too_big
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_56_amd64_match_len_ofs_ok
	TESTQ AX, AX                // ofs == 0 with ml != 0 is invalid
	JNZ   sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch

sequenceDecs_decode_56_amd64_match_len_ofs_ok:
	ADDQ $0x18, R10
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS  sequenceDecs_decode_56_amd64_main_loop
	MOVQ s+0(FP), AX            // done: write back offset history and bit reader
	MOVQ R11, 144(AX)
	MOVQ R12, 152(AX)
	MOVQ R13, 160(AX)
	MOVQ br+8(FP), AX
	MOVQ DX, 24(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 32(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_56_amd64_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// NOTE(review): unreachable — nothing jumps here; kept as generated.
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET
 | 
						|
 | 
						|
// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
//
// BMI2 variant of sequenceDecs_decode_amd64: uses BEXTR/BZHI/SHRX instead of
// the shift-and-mask sequences, and batches the three FSE state bit reads
// into one rotate. Note the register assignment differs from the baseline:
//   AX  = bit buffer          DX = bits consumed
//   R13 = input read pointer  BX = bytes remaining in input
//   SI / DI / R8 = literal-length / match-length / offset FSE state words
//   R9  = output sequence pointer (24-byte records)
//   R10 / R11 / R12 = offset history from s+144/152/160
// ret+24(FP): 0 ok, 1 match len/offset mismatch, 2 match len too big,
// 4 not enough literals, 6 overread.
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
	MOVQ    br+8(FP), BX
	MOVQ    24(BX), AX          // bit buffer
	MOVBQZX 40(BX), DX          // bits consumed
	MOVQ    (BX), CX
	MOVQ    32(BX), BX          // bytes remaining
	ADDQ    BX, CX
	MOVQ    CX, (SP)            // (SP) = current read pointer
	MOVQ    ctx+16(FP), CX
	MOVQ    72(CX), SI
	MOVQ    80(CX), DI
	MOVQ    88(CX), R8
	MOVQ    104(CX), R9
	MOVQ    s+0(FP), CX
	MOVQ    144(CX), R10
	MOVQ    152(CX), R11
	MOVQ    160(CX), R12

sequenceDecs_decode_bmi2_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	CMPQ BX, $0x08
	JL   sequenceDecs_decode_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX              // whole bytes consumed so far
	SUBQ CX, R13
	MOVQ (R13), AX              // reload a full 64-bit window
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP  sequenceDecs_decode_bmi2_fill_end

sequenceDecs_decode_bmi2_fill_byte_by_byte:
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decode_bmi2_fill_check_overread
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decode_bmi2_fill_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R13
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R13), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decode_bmi2_fill_byte_by_byte

sequenceDecs_decode_bmi2_fill_check_overread:
	CMPQ DX, $0x40
	JA   error_overread

sequenceDecs_decode_bmi2_fill_end:
	// Update offset
	MOVQ   $0x00000808, CX      // BEXTR control: start=8, len=8 -> bit count field
	BEXTRQ CX, R8, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15        // keep low R14 bits
	MOVQ   CX, DX               // bits consumed += bit count
	MOVQ   R8, CX
	SHRQ   $0x20, CX            // baseline (high 32 bits of state word)
	ADDQ   R15, CX
	MOVQ   CX, 16(R9)           // seq.ofs

	// Update match length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, DI, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15
	MOVQ   CX, DX
	MOVQ   DI, CX
	SHRQ   $0x20, CX
	ADDQ   R15, CX
	MOVQ   CX, 8(R9)            // seq.ml

	// Fill bitreader to have enough for the remaining
	CMPQ BX, $0x08
	JL   sequenceDecs_decode_bmi2_fill_2_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP  sequenceDecs_decode_bmi2_fill_2_end

sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decode_bmi2_fill_2_check_overread
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decode_bmi2_fill_2_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R13
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R13), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decode_bmi2_fill_2_byte_by_byte

sequenceDecs_decode_bmi2_fill_2_check_overread:
	CMPQ DX, $0x40
	JA   error_overread

sequenceDecs_decode_bmi2_fill_2_end:
	// Update literal length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, SI, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15
	MOVQ   CX, DX
	MOVQ   SI, CX
	SHRQ   $0x20, CX
	ADDQ   R15, CX
	MOVQ   CX, (R9)             // seq.ll

	// Fill bitreader for state updates
	MOVQ    R13, (SP)
	MOVQ    $0x00000808, CX
	BEXTRQ  CX, R8, R13         // R13 = offset code bit count (for repeat-offset test)
	MOVQ    ctx+16(FP), CX
	CMPQ    96(CX), $0x00       // last sequence? skip state advance
	JZ      sequenceDecs_decode_bmi2_skip_update
	LEAQ    (SI)(DI*1), R14     // total nbBits of all three states (low bytes summed)
	ADDQ    R8, R14
	MOVBQZX R14, R14
	LEAQ    (DX)(R14*1), CX
	MOVQ    AX, R15
	MOVQ    CX, DX
	ROLQ    CL, R15
	BZHIQ   R14, R15, R15       // R15 = all three state-update bit fields, packed

	// Update Offset State
	BZHIQ R8, R15, CX           // low bits belong to the offset state
	SHRXQ R8, R15, R15
	SHRL  $0x10, R8             // next-state base
	ADDQ  CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ DI, R15, CX
	SHRXQ DI, R15, R15
	SHRL  $0x10, DI
	ADDQ  CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ SI, R15, CX
	SHRL  $0x10, SI
	ADDQ  CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decode_bmi2_skip_update:
	// Adjust offset
	MOVQ 16(R9), CX
	CMPQ R13, $0x01
	JBE  sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
	MOVQ R11, R12               // literal offset: rotate history
	MOVQ R10, R11
	MOVQ CX, R10
	JMP  sequenceDecs_decode_bmi2_after_adjust

sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
	CMPQ (R9), $0x00000000      // ll == 0 shifts the repeat-offset index
	JNE  sequenceDecs_decode_bmi2_adjust_offset_maybezero
	INCQ CX
	JMP  sequenceDecs_decode_bmi2_adjust_offset_nonzero

sequenceDecs_decode_bmi2_adjust_offset_maybezero:
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_bmi2_adjust_offset_nonzero
	MOVQ  R10, CX               // reuse most recent offset
	JMP   sequenceDecs_decode_bmi2_after_adjust

sequenceDecs_decode_bmi2_adjust_offset_nonzero:
	CMPQ CX, $0x01
	JB   sequenceDecs_decode_bmi2_adjust_zero
	JEQ  sequenceDecs_decode_bmi2_adjust_one
	CMPQ CX, $0x02
	JA   sequenceDecs_decode_bmi2_adjust_three
	JMP  sequenceDecs_decode_bmi2_adjust_two

sequenceDecs_decode_bmi2_adjust_zero:
	MOVQ R10, R13
	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_one:
	MOVQ R11, R13
	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_two:
	MOVQ R12, R13
	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_three:
	LEAQ -1(R10), R13

sequenceDecs_decode_bmi2_adjust_test_temp_valid:
	TESTQ R13, R13
	JNZ   sequenceDecs_decode_bmi2_adjust_temp_valid
	MOVQ  $0x00000001, R13      // never allow a zero offset

sequenceDecs_decode_bmi2_adjust_temp_valid:
	CMPQ    CX, $0x01
	CMOVQNE R11, R12
	MOVQ    R10, R11
	MOVQ    R13, R10
	MOVQ    R13, CX

sequenceDecs_decode_bmi2_after_adjust:
	MOVQ CX, 16(R9)             // store final offset

	// Check values
	MOVQ  8(R9), R13
	MOVQ  (R9), R14
	LEAQ  (R13)(R14*1), R15     // ml + ll
	MOVQ  s+0(FP), BP
	ADDQ  R15, 256(BP)
	MOVQ  ctx+16(FP), R15
	SUBQ  R14, 128(R15)
	JS    error_not_enough_literals
	CMPQ  R13, $0x00020002
	JA    sequenceDecs_decode_bmi2_error_match_len_too_big
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_bmi2_match_len_ofs_ok
	TESTQ R13, R13              // ofs == 0 with ml != 0 is invalid
	JNZ   sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decode_bmi2_match_len_ofs_ok:
	ADDQ $0x18, R9              // next output record
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS  sequenceDecs_decode_bmi2_main_loop
	MOVQ s+0(FP), CX            // done: write back offset history and bit reader
	MOVQ R10, 144(CX)
	MOVQ R11, 152(CX)
	MOVQ R12, 160(CX)
	MOVQ br+8(FP), CX
	MOVQ AX, 24(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 32(CX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_bmi2_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// NOTE(review): unreachable — nothing jumps here; kept as generated.
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET
 | 
						|
 | 
						|
// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
	MOVQ    br+8(FP), BX
	MOVQ    24(BX), AX
	MOVBQZX 40(BX), DX
	MOVQ    (BX), CX
	MOVQ    32(BX), BX
	ADDQ    BX, CX
	MOVQ    CX, (SP)
	MOVQ    ctx+16(FP), CX
	MOVQ    72(CX), SI
	MOVQ    80(CX), DI
	MOVQ    88(CX), R8
	MOVQ    104(CX), R9
	MOVQ    s+0(FP), CX
	MOVQ    144(CX), R10
	MOVQ    152(CX), R11
	MOVQ    160(CX), R12

sequenceDecs_decode_56_bmi2_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	CMPQ BX, $0x08
	JL   sequenceDecs_decode_56_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP  sequenceDecs_decode_56_bmi2_fill_end

sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decode_56_bmi2_fill_check_overread
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decode_56_bmi2_fill_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R13
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R13), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decode_56_bmi2_fill_byte_by_byte

sequenceDecs_decode_56_bmi2_fill_check_overread:
	CMPQ DX, $0x40
	JA   error_overread

sequenceDecs_decode_56_bmi2_fill_end:
	// Update offset
	MOVQ   $0x00000808, CX
	BEXTRQ CX, R8, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15
	MOVQ   CX, DX
	MOVQ   R8, CX
	SHRQ   $0x20, CX
	ADDQ   R15, CX
	MOVQ   CX, 16(R9)

	// Update match length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, DI, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15
	MOVQ   CX, DX
	MOVQ   DI, CX
	SHRQ   $0x20, CX
	ADDQ   R15, CX
	MOVQ   CX, 8(R9)

	// Update literal length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, SI, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15
	MOVQ   CX, DX
	MOVQ   SI, CX
	SHRQ   $0x20, CX
	ADDQ   R15, CX
	MOVQ   CX, (R9)

	// Fill bitreader for state updates
	MOVQ    R13, (SP)
	MOVQ    $0x00000808, CX
	BEXTRQ  CX, R8, R13
	MOVQ    ctx+16(FP), CX
	CMPQ    96(CX), $0x00
	JZ      sequenceDecs_decode_56_bmi2_skip_update
	LEAQ    (SI)(DI*1), R14
	ADDQ    R8, R14
	MOVBQZX R14, R14
	LEAQ    (DX)(R14*1), CX
	MOVQ    AX, R15
	MOVQ    CX, DX
	ROLQ    CL, R15
	BZHIQ   R14, R15, R15

	// Update Offset State
	BZHIQ R8, R15, CX
	SHRXQ R8, R15, R15
	SHRL  $0x10, R8
	ADDQ  CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ DI, R15, CX
	SHRXQ DI, R15, R15
	SHRL  $0x10, DI
	ADDQ  CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ SI, R15, CX
	SHRL  $0x10, SI
	ADDQ  CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decode_56_bmi2_skip_update:
	// Adjust offset
	MOVQ 16(R9), CX
	CMPQ R13, $0x01
	JBE  sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
	MOVQ R11, R12
	MOVQ R10, R11
	MOVQ CX, R10
	JMP  sequenceDecs_decode_56_bmi2_after_adjust

sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
	CMPQ (R9), $0x00000000
	JNE  sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
	INCQ CX
	JMP  sequenceDecs_decode_56_bmi2_adjust_offset_nonzero

sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
	MOVQ  R10, CX
	JMP   sequenceDecs_decode_56_bmi2_after_adjust

sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
	CMPQ CX, $0x01
	JB   sequenceDecs_decode_56_bmi2_adjust_zero
	JEQ  sequenceDecs_decode_56_bmi2_adjust_one
	CMPQ CX, $0x02
	JA   sequenceDecs_decode_56_bmi2_adjust_three
	JMP  sequenceDecs_decode_56_bmi2_adjust_two

sequenceDecs_decode_56_bmi2_adjust_zero:
	MOVQ R10, R13
	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_one:
	MOVQ R11, R13
	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_two:
	MOVQ R12, R13
	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_three:
	LEAQ -1(R10), R13

sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
	TESTQ R13, R13
	JNZ   sequenceDecs_decode_56_bmi2_adjust_temp_valid
	MOVQ  $0x00000001, R13

sequenceDecs_decode_56_bmi2_adjust_temp_valid:
	CMPQ    CX, $0x01
	CMOVQNE R11, R12
	MOVQ    R10, R11
	MOVQ    R13, R10
	MOVQ    R13, CX

sequenceDecs_decode_56_bmi2_after_adjust:
	MOVQ CX, 16(R9)

	// Check values
	MOVQ  8(R9), R13
	MOVQ  (R9), R14
	LEAQ  (R13)(R14*1), R15
	MOVQ  s+0(FP), BP
	ADDQ  R15, 256(BP)
	MOVQ  ctx+16(FP), R15
	SUBQ  R14, 128(R15)
	JS    error_not_enough_literals
	CMPQ  R13, $0x00020002
	JA    sequenceDecs_decode_56_bmi2_error_match_len_too_big
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_56_bmi2_match_len_ofs_ok
	TESTQ R13, R13
	JNZ   sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
	ADDQ $0x18, R9
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS  sequenceDecs_decode_56_bmi2_main_loop
	MOVQ s+0(FP), CX
	MOVQ R10, 144(CX)
	MOVQ R11, 152(CX)
	MOVQ R12, 160(CX)
	MOVQ br+8(FP), CX
	MOVQ AX, 24(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 32(CX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_56_bmi2_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET
// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
// Requires: SSE
TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
	MOVQ  ctx+0(FP), R10
	MOVQ  8(R10), CX
	TESTQ CX, CX
	JZ    empty_seqs
	MOVQ  (R10), AX
	MOVQ  24(R10), DX
	MOVQ  32(R10), BX
	MOVQ  80(R10), SI
	MOVQ  104(R10), DI
	MOVQ  120(R10), R8
	MOVQ  56(R10), R9
	MOVQ  64(R10), R10
	ADDQ  R10, R9

	// seqsBase += 24 * seqIndex
	LEAQ (DX)(DX*2), R11
	SHLQ $0x03, R11
	ADDQ R11, AX

	// outBase += outPosition
	ADDQ DI, BX

main_loop:
	MOVQ (AX), R11
	MOVQ 16(AX), R12
	MOVQ 8(AX), R13

	// Copy literals
	TESTQ R11, R11
	JZ    check_offset
	XORQ  R14, R14

copy_1:
	MOVUPS (SI)(R14*1), X0
	MOVUPS X0, (BX)(R14*1)
	ADDQ   $0x10, R14
	CMPQ   R14, R11
	JB     copy_1
	ADDQ   R11, SI
	ADDQ   R11, BX
	ADDQ   R11, DI

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
check_offset:
	LEAQ (DI)(R10*1), R11
	CMPQ R12, R11
	JG   error_match_off_too_big
	CMPQ R12, R8
	JG   error_match_off_too_big

	// Copy match from history
	MOVQ R12, R11
	SUBQ DI, R11
	JLS  copy_match
	MOVQ R9, R14
	SUBQ R11, R14
	CMPQ R13, R11
	JG   copy_all_from_history
	MOVQ R13, R11
	SUBQ $0x10, R11
	JB   copy_4_small

copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ   $0x10, R14
	ADDQ   $0x10, BX
	SUBQ   $0x10, R11
	JAE    copy_4_loop
	LEAQ   16(R14)(R11*1), R14
	LEAQ   16(BX)(R11*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP    copy_4_end

copy_4_small:
	CMPQ R13, $0x03
	JE   copy_4_move_3
	CMPQ R13, $0x08
	JB   copy_4_move_4through7
	JMP  copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), R11
	MOVB 2(R14), R12
	MOVW R11, (BX)
	MOVB R12, 2(BX)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP  copy_4_end

copy_4_move_4through7:
	MOVL (R14), R11
	MOVL -4(R14)(R13*1), R12
	MOVL R11, (BX)
	MOVL R12, -4(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP  copy_4_end

copy_4_move_8through16:
	MOVQ (R14), R11
	MOVQ -8(R14)(R13*1), R12
	MOVQ R11, (BX)
	MOVQ R12, -8(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX

copy_4_end:
	ADDQ R13, DI
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB   main_loop
	JMP  loop_finished

copy_all_from_history:
	MOVQ R11, R15
	SUBQ $0x10, R15
	JB   copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ   $0x10, R14
	ADDQ   $0x10, BX
	SUBQ   $0x10, R15
	JAE    copy_5_loop
	LEAQ   16(R14)(R15*1), R14
	LEAQ   16(BX)(R15*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP    copy_5_end

copy_5_small:
	CMPQ R11, $0x03
	JE   copy_5_move_3
	JB   copy_5_move_1or2
	CMPQ R11, $0x08
	JB   copy_5_move_4through7
	JMP  copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(R11*1), BP
	MOVB R15, (BX)
	MOVB BP, -1(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP  copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (BX)
	MOVB BP, 2(BX)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP  copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(R11*1), BP
	MOVL R15, (BX)
	MOVL BP, -4(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP  copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(R11*1), BP
	MOVQ R15, (BX)
	MOVQ BP, -8(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX

copy_5_end:
	ADDQ R11, DI
	SUBQ R11, R13

	// Copy match from the current buffer
copy_match:
	MOVQ BX, R11
	SUBQ R12, R11

	// ml <= mo
	CMPQ R13, R12
	JA   copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, DI
	MOVQ BX, R12
	ADDQ R13, BX

copy_2:
	MOVUPS (R11), X0
	MOVUPS X0, (R12)
	ADDQ   $0x10, R11
	ADDQ   $0x10, R12
	SUBQ   $0x10, R13
	JHI    copy_2
	JMP    handle_loop

	// Copy overlapping match
copy_overlapping_match:
	ADDQ R13, DI

copy_slow_3:
	MOVB (R11), R12
	MOVB R12, (BX)
	INCQ R11
	INCQ BX
	DECQ R13
	JNZ  copy_slow_3

handle_loop:
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB   main_loop

loop_finished:
	// Return value
	MOVB $0x01, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

error_match_off_too_big:
	// Return value
	MOVB $0x00, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

empty_seqs:
	// Return value
	MOVB $0x01, ret+8(FP)
	RET
// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
// Requires: SSE
TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
	MOVQ  ctx+0(FP), R10
	MOVQ  8(R10), CX
	TESTQ CX, CX
	JZ    empty_seqs
	MOVQ  (R10), AX
	MOVQ  24(R10), DX
	MOVQ  32(R10), BX
	MOVQ  80(R10), SI
	MOVQ  104(R10), DI
	MOVQ  120(R10), R8
	MOVQ  56(R10), R9
	MOVQ  64(R10), R10
	ADDQ  R10, R9

	// seqsBase += 24 * seqIndex
	LEAQ (DX)(DX*2), R11
	SHLQ $0x03, R11
	ADDQ R11, AX

	// outBase += outPosition
	ADDQ DI, BX

main_loop:
	MOVQ (AX), R11
	MOVQ 16(AX), R12
	MOVQ 8(AX), R13

	// Copy literals
	TESTQ R11, R11
	JZ    check_offset
	MOVQ  R11, R14
	SUBQ  $0x10, R14
	JB    copy_1_small

copy_1_loop:
	MOVUPS (SI), X0
	MOVUPS X0, (BX)
	ADDQ   $0x10, SI
	ADDQ   $0x10, BX
	SUBQ   $0x10, R14
	JAE    copy_1_loop
	LEAQ   16(SI)(R14*1), SI
	LEAQ   16(BX)(R14*1), BX
	MOVUPS -16(SI), X0
	MOVUPS X0, -16(BX)
	JMP    copy_1_end

copy_1_small:
	CMPQ R11, $0x03
	JE   copy_1_move_3
	JB   copy_1_move_1or2
	CMPQ R11, $0x08
	JB   copy_1_move_4through7
	JMP  copy_1_move_8through16

copy_1_move_1or2:
	MOVB (SI), R14
	MOVB -1(SI)(R11*1), R15
	MOVB R14, (BX)
	MOVB R15, -1(BX)(R11*1)
	ADDQ R11, SI
	ADDQ R11, BX
	JMP  copy_1_end

copy_1_move_3:
	MOVW (SI), R14
	MOVB 2(SI), R15
	MOVW R14, (BX)
	MOVB R15, 2(BX)
	ADDQ R11, SI
	ADDQ R11, BX
	JMP  copy_1_end

copy_1_move_4through7:
	MOVL (SI), R14
	MOVL -4(SI)(R11*1), R15
	MOVL R14, (BX)
	MOVL R15, -4(BX)(R11*1)
	ADDQ R11, SI
	ADDQ R11, BX
	JMP  copy_1_end

copy_1_move_8through16:
	MOVQ (SI), R14
	MOVQ -8(SI)(R11*1), R15
	MOVQ R14, (BX)
	MOVQ R15, -8(BX)(R11*1)
	ADDQ R11, SI
	ADDQ R11, BX

copy_1_end:
	ADDQ R11, DI

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
check_offset:
	LEAQ (DI)(R10*1), R11
	CMPQ R12, R11
	JG   error_match_off_too_big
	CMPQ R12, R8
	JG   error_match_off_too_big

	// Copy match from history
	MOVQ R12, R11
	SUBQ DI, R11
	JLS  copy_match
	MOVQ R9, R14
	SUBQ R11, R14
	CMPQ R13, R11
	JG   copy_all_from_history
	MOVQ R13, R11
	SUBQ $0x10, R11
	JB   copy_4_small

copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ   $0x10, R14
	ADDQ   $0x10, BX
	SUBQ   $0x10, R11
	JAE    copy_4_loop
	LEAQ   16(R14)(R11*1), R14
	LEAQ   16(BX)(R11*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP    copy_4_end

copy_4_small:
	CMPQ R13, $0x03
	JE   copy_4_move_3
	CMPQ R13, $0x08
	JB   copy_4_move_4through7
	JMP  copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), R11
	MOVB 2(R14), R12
	MOVW R11, (BX)
	MOVB R12, 2(BX)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP  copy_4_end

copy_4_move_4through7:
	MOVL (R14), R11
	MOVL -4(R14)(R13*1), R12
	MOVL R11, (BX)
	MOVL R12, -4(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP  copy_4_end

copy_4_move_8through16:
	MOVQ (R14), R11
	MOVQ -8(R14)(R13*1), R12
	MOVQ R11, (BX)
	MOVQ R12, -8(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX

copy_4_end:
	ADDQ R13, DI
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB   main_loop
	JMP  loop_finished

copy_all_from_history:
	MOVQ R11, R15
	SUBQ $0x10, R15
	JB   copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ   $0x10, R14
	ADDQ   $0x10, BX
	SUBQ   $0x10, R15
	JAE    copy_5_loop
	LEAQ   16(R14)(R15*1), R14
	LEAQ   16(BX)(R15*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP    copy_5_end

copy_5_small:
	CMPQ R11, $0x03
	JE   copy_5_move_3
	JB   copy_5_move_1or2
	CMPQ R11, $0x08
	JB   copy_5_move_4through7
	JMP  copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(R11*1), BP
	MOVB R15, (BX)
	MOVB BP, -1(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP  copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (BX)
	MOVB BP, 2(BX)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP  copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(R11*1), BP
	MOVL R15, (BX)
	MOVL BP, -4(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP  copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(R11*1), BP
	MOVQ R15, (BX)
	MOVQ BP, -8(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX

copy_5_end:
	ADDQ R11, DI
	SUBQ R11, R13

	// Copy match from the current buffer
copy_match:
	MOVQ BX, R11
	SUBQ R12, R11

	// ml <= mo
	CMPQ R13, R12
	JA   copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, DI
	MOVQ R13, R12
	SUBQ $0x10, R12
	JB   copy_2_small

copy_2_loop:
	MOVUPS (R11), X0
	MOVUPS X0, (BX)
	ADDQ   $0x10, R11
	ADDQ   $0x10, BX
	SUBQ   $0x10, R12
	JAE    copy_2_loop
	LEAQ   16(R11)(R12*1), R11
	LEAQ   16(BX)(R12*1), BX
	MOVUPS -16(R11), X0
	MOVUPS X0, -16(BX)
	JMP    copy_2_end

copy_2_small:
	CMPQ R13, $0x03
	JE   copy_2_move_3
	JB   copy_2_move_1or2
	CMPQ R13, $0x08
	JB   copy_2_move_4through7
	JMP  copy_2_move_8through16

copy_2_move_1or2:
	MOVB (R11), R12
	MOVB -1(R11)(R13*1), R14
	MOVB R12, (BX)
	MOVB R14, -1(BX)(R13*1)
	ADDQ R13, R11
	ADDQ R13, BX
	JMP  copy_2_end

copy_2_move_3:
	MOVW (R11), R12
	MOVB 2(R11), R14
	MOVW R12, (BX)
	MOVB R14, 2(BX)
	ADDQ R13, R11
	ADDQ R13, BX
	JMP  copy_2_end

copy_2_move_4through7:
	MOVL (R11), R12
	MOVL -4(R11)(R13*1), R14
	MOVL R12, (BX)
	MOVL R14, -4(BX)(R13*1)
	ADDQ R13, R11
	ADDQ R13, BX
	JMP  copy_2_end

copy_2_move_8through16:
	MOVQ (R11), R12
	MOVQ -8(R11)(R13*1), R14
	MOVQ R12, (BX)
	MOVQ R14, -8(BX)(R13*1)
	ADDQ R13, R11
	ADDQ R13, BX

copy_2_end:
	JMP handle_loop

	// Copy overlapping match
copy_overlapping_match:
	ADDQ R13, DI

copy_slow_3:
	MOVB (R11), R12
	MOVB R12, (BX)
	INCQ R11
	INCQ BX
	DECQ R13
	JNZ  copy_slow_3

handle_loop:
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB   main_loop

loop_finished:
	// Return value
	MOVB $0x01, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

error_match_off_too_big:
	// Return value
	MOVB $0x00, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

empty_seqs:
	// Return value
	MOVB $0x01, ret+8(FP)
	RET
// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
 | 
						|
// Requires: CMOV, SSE
 | 
						|
TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
 | 
						|
	MOVQ    br+8(FP), CX
 | 
						|
	MOVQ    24(CX), DX
 | 
						|
	MOVBQZX 40(CX), BX
 | 
						|
	MOVQ    (CX), AX
 | 
						|
	MOVQ    32(CX), SI
 | 
						|
	ADDQ    SI, AX
 | 
						|
	MOVQ    AX, (SP)
 | 
						|
	MOVQ    ctx+16(FP), AX
 | 
						|
	MOVQ    72(AX), DI
 | 
						|
	MOVQ    80(AX), R8
 | 
						|
	MOVQ    88(AX), R9
 | 
						|
	XORQ    CX, CX
 | 
						|
	MOVQ    CX, 8(SP)
 | 
						|
	MOVQ    CX, 16(SP)
 | 
						|
	MOVQ    CX, 24(SP)
 | 
						|
	MOVQ    112(AX), R10
 | 
						|
	MOVQ    128(AX), CX
 | 
						|
	MOVQ    CX, 32(SP)
 | 
						|
	MOVQ    144(AX), R11
 | 
						|
	MOVQ    136(AX), R12
 | 
						|
	MOVQ    200(AX), CX
 | 
						|
	MOVQ    CX, 56(SP)
 | 
						|
	MOVQ    176(AX), CX
 | 
						|
	MOVQ    CX, 48(SP)
 | 
						|
	MOVQ    184(AX), AX
 | 
						|
	MOVQ    AX, 40(SP)
 | 
						|
	MOVQ    40(SP), AX
 | 
						|
	ADDQ    AX, 48(SP)
 | 
						|
 | 
						|
	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
 | 
						|
	ADDQ R10, 32(SP)
 | 
						|
 | 
						|
	// outBase += outPosition
 | 
						|
	ADDQ R12, R10
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_main_loop:
 | 
						|
	MOVQ (SP), R13
 | 
						|
 | 
						|
	// Fill bitreader to have enough for the offset and match length.
 | 
						|
	CMPQ SI, $0x08
 | 
						|
	JL   sequenceDecs_decodeSync_amd64_fill_byte_by_byte
 | 
						|
	MOVQ BX, AX
 | 
						|
	SHRQ $0x03, AX
 | 
						|
	SUBQ AX, R13
 | 
						|
	MOVQ (R13), DX
 | 
						|
	SUBQ AX, SI
 | 
						|
	ANDQ $0x07, BX
 | 
						|
	JMP  sequenceDecs_decodeSync_amd64_fill_end
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
 | 
						|
	CMPQ    SI, $0x00
 | 
						|
	JLE     sequenceDecs_decodeSync_amd64_fill_check_overread
 | 
						|
	CMPQ    BX, $0x07
 | 
						|
	JLE     sequenceDecs_decodeSync_amd64_fill_end
 | 
						|
	SHLQ    $0x08, DX
 | 
						|
	SUBQ    $0x01, R13
 | 
						|
	SUBQ    $0x01, SI
 | 
						|
	SUBQ    $0x08, BX
 | 
						|
	MOVBQZX (R13), AX
 | 
						|
	ORQ     AX, DX
 | 
						|
	JMP     sequenceDecs_decodeSync_amd64_fill_byte_by_byte
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_fill_check_overread:
 | 
						|
	CMPQ BX, $0x40
 | 
						|
	JA   error_overread
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_fill_end:
 | 
						|
	// Update offset
 | 
						|
	MOVQ  R9, AX
 | 
						|
	MOVQ  BX, CX
 | 
						|
	MOVQ  DX, R14
 | 
						|
	SHLQ  CL, R14
 | 
						|
	MOVB  AH, CL
 | 
						|
	SHRQ  $0x20, AX
 | 
						|
	TESTQ CX, CX
 | 
						|
	JZ    sequenceDecs_decodeSync_amd64_of_update_zero
 | 
						|
	ADDQ  CX, BX
 | 
						|
	CMPQ  BX, $0x40
 | 
						|
	JA    sequenceDecs_decodeSync_amd64_of_update_zero
 | 
						|
	CMPQ  CX, $0x40
 | 
						|
	JAE   sequenceDecs_decodeSync_amd64_of_update_zero
 | 
						|
	NEGQ  CX
 | 
						|
	SHRQ  CL, R14
 | 
						|
	ADDQ  R14, AX
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_of_update_zero:
 | 
						|
	MOVQ AX, 8(SP)
 | 
						|
 | 
						|
	// Update match length
 | 
						|
	MOVQ  R8, AX
 | 
						|
	MOVQ  BX, CX
 | 
						|
	MOVQ  DX, R14
 | 
						|
	SHLQ  CL, R14
 | 
						|
	MOVB  AH, CL
 | 
						|
	SHRQ  $0x20, AX
 | 
						|
	TESTQ CX, CX
 | 
						|
	JZ    sequenceDecs_decodeSync_amd64_ml_update_zero
 | 
						|
	ADDQ  CX, BX
 | 
						|
	CMPQ  BX, $0x40
 | 
						|
	JA    sequenceDecs_decodeSync_amd64_ml_update_zero
 | 
						|
	CMPQ  CX, $0x40
 | 
						|
	JAE   sequenceDecs_decodeSync_amd64_ml_update_zero
 | 
						|
	NEGQ  CX
 | 
						|
	SHRQ  CL, R14
 | 
						|
	ADDQ  R14, AX
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_ml_update_zero:
 | 
						|
	MOVQ AX, 16(SP)
 | 
						|
 | 
						|
	// Fill bitreader to have enough for the remaining
 | 
						|
	CMPQ SI, $0x08
 | 
						|
	JL   sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
 | 
						|
	MOVQ BX, AX
 | 
						|
	SHRQ $0x03, AX
 | 
						|
	SUBQ AX, R13
 | 
						|
	MOVQ (R13), DX
 | 
						|
	SUBQ AX, SI
 | 
						|
	ANDQ $0x07, BX
 | 
						|
	JMP  sequenceDecs_decodeSync_amd64_fill_2_end
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
 | 
						|
	CMPQ    SI, $0x00
 | 
						|
	JLE     sequenceDecs_decodeSync_amd64_fill_2_check_overread
 | 
						|
	CMPQ    BX, $0x07
 | 
						|
	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
 | 
						|
	SHLQ    $0x08, DX
 | 
						|
	SUBQ    $0x01, R13
 | 
						|
	SUBQ    $0x01, SI
 | 
						|
	SUBQ    $0x08, BX
 | 
						|
	MOVBQZX (R13), AX
 | 
						|
	ORQ     AX, DX
 | 
						|
	JMP     sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_fill_2_check_overread:
 | 
						|
	CMPQ BX, $0x40
 | 
						|
	JA   error_overread
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_fill_2_end:
 | 
						|
	// Update literal length
 | 
						|
	MOVQ  DI, AX
 | 
						|
	MOVQ  BX, CX
 | 
						|
	MOVQ  DX, R14
 | 
						|
	SHLQ  CL, R14
 | 
						|
	MOVB  AH, CL
 | 
						|
	SHRQ  $0x20, AX
 | 
						|
	TESTQ CX, CX
 | 
						|
	JZ    sequenceDecs_decodeSync_amd64_ll_update_zero
 | 
						|
	ADDQ  CX, BX
 | 
						|
	CMPQ  BX, $0x40
 | 
						|
	JA    sequenceDecs_decodeSync_amd64_ll_update_zero
 | 
						|
	CMPQ  CX, $0x40
 | 
						|
	JAE   sequenceDecs_decodeSync_amd64_ll_update_zero
 | 
						|
	NEGQ  CX
 | 
						|
	SHRQ  CL, R14
 | 
						|
	ADDQ  R14, AX
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_ll_update_zero:
 | 
						|
	MOVQ AX, 24(SP)
 | 
						|
 | 
						|
	// Fill bitreader for state updates
 | 
						|
	MOVQ    R13, (SP)
 | 
						|
	MOVQ    R9, AX
 | 
						|
	SHRQ    $0x08, AX
 | 
						|
	MOVBQZX AL, AX
 | 
						|
	MOVQ    ctx+16(FP), CX
 | 
						|
	CMPQ    96(CX), $0x00
 | 
						|
	JZ      sequenceDecs_decodeSync_amd64_skip_update
 | 
						|
 | 
						|
	// Update Literal Length State
 | 
						|
	MOVBQZX DI, R13
 | 
						|
	SHRL    $0x10, DI
 | 
						|
	LEAQ    (BX)(R13*1), CX
 | 
						|
	MOVQ    DX, R14
 | 
						|
	MOVQ    CX, BX
 | 
						|
	ROLQ    CL, R14
 | 
						|
	MOVL    $0x00000001, R15
 | 
						|
	MOVB    R13, CL
 | 
						|
	SHLL    CL, R15
 | 
						|
	DECL    R15
 | 
						|
	ANDQ    R15, R14
 | 
						|
	ADDQ    R14, DI
 | 
						|
 | 
						|
	// Load ctx.llTable
 | 
						|
	MOVQ ctx+16(FP), CX
 | 
						|
	MOVQ (CX), CX
 | 
						|
	MOVQ (CX)(DI*8), DI
 | 
						|
 | 
						|
	// Update Match Length State
 | 
						|
	MOVBQZX R8, R13
 | 
						|
	SHRL    $0x10, R8
 | 
						|
	LEAQ    (BX)(R13*1), CX
 | 
						|
	MOVQ    DX, R14
 | 
						|
	MOVQ    CX, BX
 | 
						|
	ROLQ    CL, R14
 | 
						|
	MOVL    $0x00000001, R15
 | 
						|
	MOVB    R13, CL
 | 
						|
	SHLL    CL, R15
 | 
						|
	DECL    R15
 | 
						|
	ANDQ    R15, R14
 | 
						|
	ADDQ    R14, R8
 | 
						|
 | 
						|
	// Load ctx.mlTable
 | 
						|
	MOVQ ctx+16(FP), CX
 | 
						|
	MOVQ 24(CX), CX
 | 
						|
	MOVQ (CX)(R8*8), R8
 | 
						|
 | 
						|
	// Update Offset State
 | 
						|
	MOVBQZX R9, R13
 | 
						|
	SHRL    $0x10, R9
 | 
						|
	LEAQ    (BX)(R13*1), CX
 | 
						|
	MOVQ    DX, R14
 | 
						|
	MOVQ    CX, BX
 | 
						|
	ROLQ    CL, R14
 | 
						|
	MOVL    $0x00000001, R15
 | 
						|
	MOVB    R13, CL
 | 
						|
	SHLL    CL, R15
 | 
						|
	DECL    R15
 | 
						|
	ANDQ    R15, R14
 | 
						|
	ADDQ    R14, R9
 | 
						|
 | 
						|
	// Load ctx.ofTable
 | 
						|
	MOVQ ctx+16(FP), CX
 | 
						|
	MOVQ 48(CX), CX
 | 
						|
	MOVQ (CX)(R9*8), R9
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_skip_update:
 | 
						|
	// Adjust offset
 | 
						|
	MOVQ   s+0(FP), CX
 | 
						|
	MOVQ   8(SP), R13
 | 
						|
	CMPQ   AX, $0x01
 | 
						|
	JBE    sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
 | 
						|
	MOVUPS 144(CX), X0
 | 
						|
	MOVQ   R13, 144(CX)
 | 
						|
	MOVUPS X0, 152(CX)
 | 
						|
	JMP    sequenceDecs_decodeSync_amd64_after_adjust
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
 | 
						|
	CMPQ 24(SP), $0x00000000
 | 
						|
	JNE  sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
 | 
						|
	INCQ R13
 | 
						|
	JMP  sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
 | 
						|
	TESTQ R13, R13
 | 
						|
	JNZ   sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
 | 
						|
	MOVQ  144(CX), R13
 | 
						|
	JMP   sequenceDecs_decodeSync_amd64_after_adjust
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
 | 
						|
	MOVQ    R13, AX
 | 
						|
	XORQ    R14, R14
 | 
						|
	MOVQ    $-1, R15
 | 
						|
	CMPQ    R13, $0x03
 | 
						|
	CMOVQEQ R14, AX
 | 
						|
	CMOVQEQ R15, R14
 | 
						|
	ADDQ    144(CX)(AX*8), R14
 | 
						|
	JNZ     sequenceDecs_decodeSync_amd64_adjust_temp_valid
 | 
						|
	MOVQ    $0x00000001, R14
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_adjust_temp_valid:
 | 
						|
	CMPQ R13, $0x01
 | 
						|
	JZ   sequenceDecs_decodeSync_amd64_adjust_skip
 | 
						|
	MOVQ 152(CX), AX
 | 
						|
	MOVQ AX, 160(CX)
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_adjust_skip:
 | 
						|
	MOVQ 144(CX), AX
 | 
						|
	MOVQ AX, 152(CX)
 | 
						|
	MOVQ R14, 144(CX)
 | 
						|
	MOVQ R14, R13
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_after_adjust:
 | 
						|
	MOVQ R13, 8(SP)
 | 
						|
 | 
						|
	// Check values
 | 
						|
	MOVQ  16(SP), AX
 | 
						|
	MOVQ  24(SP), CX
 | 
						|
	LEAQ  (AX)(CX*1), R14
 | 
						|
	MOVQ  s+0(FP), R15
 | 
						|
	ADDQ  R14, 256(R15)
 | 
						|
	MOVQ  ctx+16(FP), R14
 | 
						|
	SUBQ  CX, 104(R14)
 | 
						|
	JS    error_not_enough_literals
 | 
						|
	CMPQ  AX, $0x00020002
 | 
						|
	JA    sequenceDecs_decodeSync_amd64_error_match_len_too_big
 | 
						|
	TESTQ R13, R13
 | 
						|
	JNZ   sequenceDecs_decodeSync_amd64_match_len_ofs_ok
 | 
						|
	TESTQ AX, AX
 | 
						|
	JNZ   sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
 | 
						|
 | 
						|
sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
 | 
						|
	MOVQ 24(SP), AX
 | 
						|
	MOVQ 8(SP), CX
 | 
						|
	MOVQ 16(SP), R13
 | 
						|
 | 
						|
	// Check if we have enough space in s.out
 | 
						|
	LEAQ (AX)(R13*1), R14
 | 
						|
	ADDQ R10, R14
 | 
						|
	CMPQ R14, 32(SP)
 | 
						|
	JA   error_not_enough_space
 | 
						|
 | 
						|
	// Copy literals
 | 
						|
	TESTQ AX, AX
 | 
						|
	JZ    check_offset
 | 
						|
	XORQ  R14, R14
 | 
						|
 | 
						|
copy_1:
 | 
						|
	MOVUPS (R11)(R14*1), X0
 | 
						|
	MOVUPS X0, (R10)(R14*1)
 | 
						|
	ADDQ   $0x10, R14
 | 
						|
	CMPQ   R14, AX
 | 
						|
	JB     copy_1
 | 
						|
	ADDQ   AX, R11
 | 
						|
	ADDQ   AX, R10
 | 
						|
	ADDQ   AX, R12
 | 
						|
 | 
						|
	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
 | 
						|
check_offset:
 | 
						|
	MOVQ R12, AX
 | 
						|
	ADDQ 40(SP), AX
 | 
						|
	CMPQ CX, AX
 | 
						|
	JG   error_match_off_too_big
 | 
						|
	CMPQ CX, 56(SP)
 | 
						|
	JG   error_match_off_too_big
 | 
						|
 | 
						|
	// Copy match from history
 | 
						|
	MOVQ CX, AX
 | 
						|
	SUBQ R12, AX
 | 
						|
	JLS  copy_match
 | 
						|
	MOVQ 48(SP), R14
 | 
						|
	SUBQ AX, R14
 | 
						|
	CMPQ R13, AX
 | 
						|
	JG   copy_all_from_history
 | 
						|
	MOVQ R13, AX
 | 
						|
	SUBQ $0x10, AX
 | 
						|
	JB   copy_4_small
 | 
						|
 | 
						|
copy_4_loop:
 | 
						|
	MOVUPS (R14), X0
 | 
						|
	MOVUPS X0, (R10)
 | 
						|
	ADDQ   $0x10, R14
 | 
						|
	ADDQ   $0x10, R10
 | 
						|
	SUBQ   $0x10, AX
 | 
						|
	JAE    copy_4_loop
 | 
						|
	LEAQ   16(R14)(AX*1), R14
 | 
						|
	LEAQ   16(R10)(AX*1), R10
 | 
						|
	MOVUPS -16(R14), X0
 | 
						|
	MOVUPS X0, -16(R10)
 | 
						|
	JMP    copy_4_end
 | 
						|
 | 
						|
copy_4_small:
 | 
						|
	CMPQ R13, $0x03
 | 
						|
	JE   copy_4_move_3
 | 
						|
	CMPQ R13, $0x08
 | 
						|
	JB   copy_4_move_4through7
 | 
						|
	JMP  copy_4_move_8through16
 | 
						|
 | 
						|
copy_4_move_3:
 | 
						|
	MOVW (R14), AX
 | 
						|
	MOVB 2(R14), CL
 | 
						|
	MOVW AX, (R10)
 | 
						|
	MOVB CL, 2(R10)
 | 
						|
	ADDQ R13, R14
 | 
						|
	ADDQ R13, R10
 | 
						|
	JMP  copy_4_end
 | 
						|
 | 
						|
copy_4_move_4through7:
 | 
						|
	MOVL (R14), AX
 | 
						|
	MOVL -4(R14)(R13*1), CX
 | 
						|
	MOVL AX, (R10)
 | 
						|
	MOVL CX, -4(R10)(R13*1)
 | 
						|
	ADDQ R13, R14
 | 
						|
	ADDQ R13, R10
 | 
						|
	JMP  copy_4_end
 | 
						|
 | 
						|
copy_4_move_8through16:
 | 
						|
	MOVQ (R14), AX
 | 
						|
	MOVQ -8(R14)(R13*1), CX
 | 
						|
	MOVQ AX, (R10)
 | 
						|
	MOVQ CX, -8(R10)(R13*1)
 | 
						|
	ADDQ R13, R14
 | 
						|
	ADDQ R13, R10
 | 
						|
 | 
						|
copy_4_end:
 | 
						|
	ADDQ R13, R12
 | 
						|
	JMP  handle_loop
 | 
						|
	JMP loop_finished
 | 
						|
 | 
						|
copy_all_from_history:
 | 
						|
	MOVQ AX, R15
 | 
						|
	SUBQ $0x10, R15
 | 
						|
	JB   copy_5_small
 | 
						|
 | 
						|
copy_5_loop:
 | 
						|
	MOVUPS (R14), X0
 | 
						|
	MOVUPS X0, (R10)
 | 
						|
	ADDQ   $0x10, R14
 | 
						|
	ADDQ   $0x10, R10
 | 
						|
	SUBQ   $0x10, R15
 | 
						|
	JAE    copy_5_loop
 | 
						|
	LEAQ   16(R14)(R15*1), R14
 | 
						|
	LEAQ   16(R10)(R15*1), R10
 | 
						|
	MOVUPS -16(R14), X0
 | 
						|
	MOVUPS X0, -16(R10)
 | 
						|
	JMP    copy_5_end
 | 
						|
 | 
						|
copy_5_small:
 | 
						|
	CMPQ AX, $0x03
 | 
						|
	JE   copy_5_move_3
 | 
						|
	JB   copy_5_move_1or2
 | 
						|
	CMPQ AX, $0x08
 | 
						|
	JB   copy_5_move_4through7
 | 
						|
	JMP  copy_5_move_8through16
 | 
						|
 | 
						|
copy_5_move_1or2:
 | 
						|
	MOVB (R14), R15
 | 
						|
	MOVB -1(R14)(AX*1), BP
 | 
						|
	MOVB R15, (R10)
 | 
						|
	MOVB BP, -1(R10)(AX*1)
 | 
						|
	ADDQ AX, R14
 | 
						|
	ADDQ AX, R10
 | 
						|
	JMP  copy_5_end
 | 
						|
 | 
						|
copy_5_move_3:
 | 
						|
	MOVW (R14), R15
 | 
						|
	MOVB 2(R14), BP
 | 
						|
	MOVW R15, (R10)
 | 
						|
	MOVB BP, 2(R10)
 | 
						|
	ADDQ AX, R14
 | 
						|
	ADDQ AX, R10
 | 
						|
	JMP  copy_5_end
 | 
						|
 | 
						|
copy_5_move_4through7:
 | 
						|
	MOVL (R14), R15
 | 
						|
	MOVL -4(R14)(AX*1), BP
 | 
						|
	MOVL R15, (R10)
 | 
						|
	MOVL BP, -4(R10)(AX*1)
 | 
						|
	ADDQ AX, R14
 | 
						|
	ADDQ AX, R10
 | 
						|
	JMP  copy_5_end
 | 
						|
 | 
						|
copy_5_move_8through16:
 | 
						|
	MOVQ (R14), R15
 | 
						|
	MOVQ -8(R14)(AX*1), BP
 | 
						|
	MOVQ R15, (R10)
 | 
						|
	MOVQ BP, -8(R10)(AX*1)
 | 
						|
	ADDQ AX, R14
 | 
						|
	ADDQ AX, R10
 | 
						|
 | 
						|
copy_5_end:
 | 
						|
	ADDQ AX, R12
 | 
						|
	SUBQ AX, R13
 | 
						|
 | 
						|
	// Copy match from the current buffer
 | 
						|
copy_match:
 | 
						|
	MOVQ R10, AX
 | 
						|
	SUBQ CX, AX
 | 
						|
 | 
						|
	// ml <= mo
 | 
						|
	CMPQ R13, CX
 | 
						|
	JA   copy_overlapping_match
 | 
						|
 | 
						|
	// Copy non-overlapping match
 | 
						|
	ADDQ R13, R12
 | 
						|
	MOVQ R10, CX
 | 
						|
	ADDQ R13, R10
 | 
						|
 | 
						|
copy_2:
 | 
						|
	MOVUPS (AX), X0
 | 
						|
	MOVUPS X0, (CX)
 | 
						|
	ADDQ   $0x10, AX
 | 
						|
	ADDQ   $0x10, CX
 | 
						|
	SUBQ   $0x10, R13
 | 
						|
	JHI    copy_2
 | 
						|
	JMP    handle_loop
 | 
						|
 | 
						|
	// Copy overlapping match
 | 
						|
copy_overlapping_match:
 | 
						|
	ADDQ R13, R12
 | 
						|
 | 
						|
copy_slow_3:
 | 
						|
	MOVB (AX), CL
 | 
						|
	MOVB CL, (R10)
 | 
						|
	INCQ AX
 | 
						|
	INCQ R10
 | 
						|
	DECQ R13
 | 
						|
	JNZ  copy_slow_3
 | 
						|
 | 
						|
handle_loop:
 | 
						|
	MOVQ ctx+16(FP), AX
 | 
						|
	DECQ 96(AX)
 | 
						|
	JNS  sequenceDecs_decodeSync_amd64_main_loop
 | 
						|
 | 
						|
loop_finished:
 | 
						|
	MOVQ br+8(FP), AX
 | 
						|
	MOVQ DX, 24(AX)
 | 
						|
	MOVB BL, 40(AX)
 | 
						|
	MOVQ SI, 32(AX)
 | 
						|
 | 
						|
	// Update the context
 | 
						|
	MOVQ ctx+16(FP), AX
 | 
						|
	MOVQ R12, 136(AX)
 | 
						|
	MOVQ 144(AX), CX
 | 
						|
	SUBQ CX, R11
 | 
						|
	MOVQ R11, 168(AX)
 | 
						|
 | 
						|
	// Return success
 | 
						|
	MOVQ $0x00000000, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with match length error
 | 
						|
sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
 | 
						|
	MOVQ 16(SP), AX
 | 
						|
	MOVQ ctx+16(FP), CX
 | 
						|
	MOVQ AX, 216(CX)
 | 
						|
	MOVQ $0x00000001, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with match too long error
 | 
						|
sequenceDecs_decodeSync_amd64_error_match_len_too_big:
 | 
						|
	MOVQ ctx+16(FP), AX
 | 
						|
	MOVQ 16(SP), CX
 | 
						|
	MOVQ CX, 216(AX)
 | 
						|
	MOVQ $0x00000002, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with match offset too long error
 | 
						|
error_match_off_too_big:
 | 
						|
	MOVQ ctx+16(FP), AX
 | 
						|
	MOVQ 8(SP), CX
 | 
						|
	MOVQ CX, 224(AX)
 | 
						|
	MOVQ R12, 136(AX)
 | 
						|
	MOVQ $0x00000003, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with not enough literals error
 | 
						|
error_not_enough_literals:
 | 
						|
	MOVQ ctx+16(FP), AX
 | 
						|
	MOVQ 24(SP), CX
 | 
						|
	MOVQ CX, 208(AX)
 | 
						|
	MOVQ $0x00000004, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with overread error
 | 
						|
error_overread:
 | 
						|
	MOVQ $0x00000006, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with not enough output space error
 | 
						|
error_not_enough_space:
 | 
						|
	MOVQ ctx+16(FP), AX
 | 
						|
	MOVQ 24(SP), CX
 | 
						|
	MOVQ CX, 208(AX)
 | 
						|
	MOVQ 16(SP), CX
 | 
						|
	MOVQ CX, 216(AX)
 | 
						|
	MOVQ R12, 136(AX)
 | 
						|
	MOVQ $0x00000005, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
 | 
						|
// Requires: BMI, BMI2, CMOV, SSE
 | 
						|
TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
 | 
						|
	MOVQ    br+8(FP), BX
 | 
						|
	MOVQ    24(BX), AX
 | 
						|
	MOVBQZX 40(BX), DX
 | 
						|
	MOVQ    (BX), CX
 | 
						|
	MOVQ    32(BX), BX
 | 
						|
	ADDQ    BX, CX
 | 
						|
	MOVQ    CX, (SP)
 | 
						|
	MOVQ    ctx+16(FP), CX
 | 
						|
	MOVQ    72(CX), SI
 | 
						|
	MOVQ    80(CX), DI
 | 
						|
	MOVQ    88(CX), R8
 | 
						|
	XORQ    R9, R9
 | 
						|
	MOVQ    R9, 8(SP)
 | 
						|
	MOVQ    R9, 16(SP)
 | 
						|
	MOVQ    R9, 24(SP)
 | 
						|
	MOVQ    112(CX), R9
 | 
						|
	MOVQ    128(CX), R10
 | 
						|
	MOVQ    R10, 32(SP)
 | 
						|
	MOVQ    144(CX), R10
 | 
						|
	MOVQ    136(CX), R11
 | 
						|
	MOVQ    200(CX), R12
 | 
						|
	MOVQ    R12, 56(SP)
 | 
						|
	MOVQ    176(CX), R12
 | 
						|
	MOVQ    R12, 48(SP)
 | 
						|
	MOVQ    184(CX), CX
 | 
						|
	MOVQ    CX, 40(SP)
 | 
						|
	MOVQ    40(SP), CX
 | 
						|
	ADDQ    CX, 48(SP)
 | 
						|
 | 
						|
	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
 | 
						|
	ADDQ R9, 32(SP)
 | 
						|
 | 
						|
	// outBase += outPosition
 | 
						|
	ADDQ R11, R9
 | 
						|
 | 
						|
sequenceDecs_decodeSync_bmi2_main_loop:
 | 
						|
	MOVQ (SP), R12
 | 
						|
 | 
						|
	// Fill bitreader to have enough for the offset and match length.
 | 
						|
	CMPQ BX, $0x08
 | 
						|
	JL   sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
 | 
						|
	MOVQ DX, CX
 | 
						|
	SHRQ $0x03, CX
 | 
						|
	SUBQ CX, R12
 | 
						|
	MOVQ (R12), AX
 | 
						|
	SUBQ CX, BX
 | 
						|
	ANDQ $0x07, DX
 | 
						|
	JMP  sequenceDecs_decodeSync_bmi2_fill_end
 | 
						|
 | 
						|
sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
 | 
						|
	CMPQ    BX, $0x00
 | 
						|
	JLE     sequenceDecs_decodeSync_bmi2_fill_check_overread
 | 
						|
	CMPQ    DX, $0x07
 | 
						|
	JLE     sequenceDecs_decodeSync_bmi2_fill_end
 | 
						|
	SHLQ    $0x08, AX
 | 
						|
	SUBQ    $0x01, R12
 | 
						|
	SUBQ    $0x01, BX
 | 
						|
	SUBQ    $0x08, DX
 | 
						|
	MOVBQZX (R12), CX
 | 
						|
	ORQ     CX, AX
 | 
						|
	JMP     sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
 | 
						|
 | 
						|
sequenceDecs_decodeSync_bmi2_fill_check_overread:
 | 
						|
	CMPQ DX, $0x40
 | 
						|
	JA   error_overread
 | 
						|
 | 
						|
sequenceDecs_decodeSync_bmi2_fill_end:
 | 
						|
	// Update offset
 | 
						|
	MOVQ   $0x00000808, CX
 | 
						|
	BEXTRQ CX, R8, R13
 | 
						|
	MOVQ   AX, R14
 | 
						|
	LEAQ   (DX)(R13*1), CX
 | 
						|
	ROLQ   CL, R14
 | 
						|
	BZHIQ  R13, R14, R14
 | 
						|
	MOVQ   CX, DX
 | 
						|
	MOVQ   R8, CX
 | 
						|
	SHRQ   $0x20, CX
 | 
						|
	ADDQ   R14, CX
 | 
						|
	MOVQ   CX, 8(SP)
 | 
						|
 | 
						|
	// Update match length
 | 
						|
	MOVQ   $0x00000808, CX
 | 
						|
	BEXTRQ CX, DI, R13
 | 
						|
	MOVQ   AX, R14
 | 
						|
	LEAQ   (DX)(R13*1), CX
 | 
						|
	ROLQ   CL, R14
 | 
						|
	BZHIQ  R13, R14, R14
 | 
						|
	MOVQ   CX, DX
 | 
						|
	MOVQ   DI, CX
 | 
						|
	SHRQ   $0x20, CX
 | 
						|
	ADDQ   R14, CX
 | 
						|
	MOVQ   CX, 16(SP)
 | 
						|
 | 
						|
	// Fill bitreader to have enough for the remaining
 | 
						|
	CMPQ BX, $0x08
 | 
						|
	JL   sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
 | 
						|
	MOVQ DX, CX
 | 
						|
	SHRQ $0x03, CX
 | 
						|
	SUBQ CX, R12
 | 
						|
	MOVQ (R12), AX
 | 
						|
	SUBQ CX, BX
 | 
						|
	ANDQ $0x07, DX
 | 
						|
	JMP  sequenceDecs_decodeSync_bmi2_fill_2_end
 | 
						|
 | 
						|
sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
 | 
						|
	CMPQ    BX, $0x00
 | 
						|
	JLE     sequenceDecs_decodeSync_bmi2_fill_2_check_overread
 | 
						|
	CMPQ    DX, $0x07
 | 
						|
	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
 | 
						|
	SHLQ    $0x08, AX
 | 
						|
	SUBQ    $0x01, R12
 | 
						|
	SUBQ    $0x01, BX
 | 
						|
	SUBQ    $0x08, DX
 | 
						|
	MOVBQZX (R12), CX
 | 
						|
	ORQ     CX, AX
 | 
						|
	JMP     sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
 | 
						|
 | 
						|
sequenceDecs_decodeSync_bmi2_fill_2_check_overread:
 | 
						|
	CMPQ DX, $0x40
 | 
						|
	JA   error_overread
 | 
						|
 | 
						|
sequenceDecs_decodeSync_bmi2_fill_2_end:
 | 
						|
	// Update literal length
 | 
						|
	MOVQ   $0x00000808, CX
 | 
						|
	BEXTRQ CX, SI, R13
 | 
						|
	MOVQ   AX, R14
 | 
						|
	LEAQ   (DX)(R13*1), CX
 | 
						|
	ROLQ   CL, R14
 | 
						|
	BZHIQ  R13, R14, R14
 | 
						|
	MOVQ   CX, DX
 | 
						|
	MOVQ   SI, CX
 | 
						|
	SHRQ   $0x20, CX
 | 
						|
	ADDQ   R14, CX
 | 
						|
	MOVQ   CX, 24(SP)
 | 
						|
 | 
						|
	// Fill bitreader for state updates
 | 
						|
	MOVQ    R12, (SP)
 | 
						|
	MOVQ    $0x00000808, CX
 | 
						|
	BEXTRQ  CX, R8, R12
 | 
						|
	MOVQ    ctx+16(FP), CX
 | 
						|
	CMPQ    96(CX), $0x00
 | 
						|
	JZ      sequenceDecs_decodeSync_bmi2_skip_update
 | 
						|
	LEAQ    (SI)(DI*1), R13
 | 
						|
	ADDQ    R8, R13
 | 
						|
	MOVBQZX R13, R13
 | 
						|
	LEAQ    (DX)(R13*1), CX
 | 
						|
	MOVQ    AX, R14
 | 
						|
	MOVQ    CX, DX
 | 
						|
	ROLQ    CL, R14
 | 
						|
	BZHIQ   R13, R14, R14
 | 
						|
 | 
						|
	// Update Offset State
 | 
						|
	BZHIQ R8, R14, CX
 | 
						|
	SHRXQ R8, R14, R14
 | 
						|
	SHRL  $0x10, R8
 | 
						|
	ADDQ  CX, R8
 | 
						|
 | 
						|
	// Load ctx.ofTable
 | 
						|
	MOVQ ctx+16(FP), CX
 | 
						|
	MOVQ 48(CX), CX
 | 
						|
	MOVQ (CX)(R8*8), R8
 | 
						|
 | 
						|
	// Update Match Length State
 | 
						|
	BZHIQ DI, R14, CX
 | 
						|
	SHRXQ DI, R14, R14
 | 
						|
	SHRL  $0x10, DI
 | 
						|
	ADDQ  CX, DI
 | 
						|
 | 
						|
	// Load ctx.mlTable
 | 
						|
	MOVQ ctx+16(FP), CX
 | 
						|
	MOVQ 24(CX), CX
 | 
						|
	MOVQ (CX)(DI*8), DI
 | 
						|
 | 
						|
	// Update Literal Length State
 | 
						|
	BZHIQ SI, R14, CX
 | 
						|
	SHRL  $0x10, SI
 | 
						|
	ADDQ  CX, SI
 | 
						|
 | 
						|
	// Load ctx.llTable
 | 
						|
	MOVQ ctx+16(FP), CX
 | 
						|
	MOVQ (CX), CX
 | 
						|
	MOVQ (CX)(SI*8), SI
 | 
						|
 | 
						|
sequenceDecs_decodeSync_bmi2_skip_update:
 | 
						|
	// Adjust offset
 | 
						|
	MOVQ   s+0(FP), CX
 | 
						|
	MOVQ   8(SP), R13
 | 
						|
	CMPQ   R12, $0x01
 | 
						|
	JBE    sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
 | 
						|
	MOVUPS 144(CX), X0
 | 
						|
	MOVQ   R13, 144(CX)
 | 
						|
	MOVUPS X0, 152(CX)
 | 
						|
	JMP    sequenceDecs_decodeSync_bmi2_after_adjust
 | 
						|
 | 
						|
sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
 | 
						|
	CMPQ 24(SP), $0x00000000
 | 
						|
	JNE  sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
 | 
						|
	INCQ R13
 | 
						|
	JMP  sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
 | 
						|
 | 
						|
sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
 | 
						|
	TESTQ R13, R13
 | 
						|
	JNZ   sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
 | 
						|
	MOVQ  144(CX), R13
 | 
						|
	JMP   sequenceDecs_decodeSync_bmi2_after_adjust
 | 
						|
 | 
						|
sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
 | 
						|
	MOVQ    R13, R12
 | 
						|
	XORQ    R14, R14
 | 
						|
	MOVQ    $-1, R15
 | 
						|
	CMPQ    R13, $0x03
 | 
						|
	CMOVQEQ R14, R12
 | 
						|
	CMOVQEQ R15, R14
 | 
						|
	ADDQ    144(CX)(R12*8), R14
 | 
						|
	JNZ     sequenceDecs_decodeSync_bmi2_adjust_temp_valid
 | 
						|
	MOVQ    $0x00000001, R14
 | 
						|
 | 
						|
sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
 | 
						|
	CMPQ R13, $0x01
 | 
						|
	JZ   sequenceDecs_decodeSync_bmi2_adjust_skip
 | 
						|
	MOVQ 152(CX), R12
 | 
						|
	MOVQ R12, 160(CX)
 | 
						|
 | 
						|
sequenceDecs_decodeSync_bmi2_adjust_skip:
 | 
						|
	MOVQ 144(CX), R12
 | 
						|
	MOVQ R12, 152(CX)
 | 
						|
	MOVQ R14, 144(CX)
 | 
						|
	MOVQ R14, R13
 | 
						|
 | 
						|
sequenceDecs_decodeSync_bmi2_after_adjust:
 | 
						|
	MOVQ R13, 8(SP)
 | 
						|
 | 
						|
	// Check values
 | 
						|
	MOVQ  16(SP), CX
 | 
						|
	MOVQ  24(SP), R12
 | 
						|
	LEAQ  (CX)(R12*1), R14
 | 
						|
	MOVQ  s+0(FP), R15
 | 
						|
	ADDQ  R14, 256(R15)
 | 
						|
	MOVQ  ctx+16(FP), R14
 | 
						|
	SUBQ  R12, 104(R14)
 | 
						|
	JS    error_not_enough_literals
 | 
						|
	CMPQ  CX, $0x00020002
 | 
						|
	JA    sequenceDecs_decodeSync_bmi2_error_match_len_too_big
 | 
						|
	TESTQ R13, R13
 | 
						|
	JNZ   sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
 | 
						|
	TESTQ CX, CX
 | 
						|
	JNZ   sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
 | 
						|
 | 
						|
sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
 | 
						|
	MOVQ 24(SP), CX
 | 
						|
	MOVQ 8(SP), R12
 | 
						|
	MOVQ 16(SP), R13
 | 
						|
 | 
						|
	// Check if we have enough space in s.out
 | 
						|
	LEAQ (CX)(R13*1), R14
 | 
						|
	ADDQ R9, R14
 | 
						|
	CMPQ R14, 32(SP)
 | 
						|
	JA   error_not_enough_space
 | 
						|
 | 
						|
	// Copy literals
 | 
						|
	TESTQ CX, CX
 | 
						|
	JZ    check_offset
 | 
						|
	XORQ  R14, R14
 | 
						|
 | 
						|
copy_1:
 | 
						|
	MOVUPS (R10)(R14*1), X0
 | 
						|
	MOVUPS X0, (R9)(R14*1)
 | 
						|
	ADDQ   $0x10, R14
 | 
						|
	CMPQ   R14, CX
 | 
						|
	JB     copy_1
 | 
						|
	ADDQ   CX, R10
 | 
						|
	ADDQ   CX, R9
 | 
						|
	ADDQ   CX, R11
 | 
						|
 | 
						|
	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
 | 
						|
check_offset:
 | 
						|
	MOVQ R11, CX
 | 
						|
	ADDQ 40(SP), CX
 | 
						|
	CMPQ R12, CX
 | 
						|
	JG   error_match_off_too_big
 | 
						|
	CMPQ R12, 56(SP)
 | 
						|
	JG   error_match_off_too_big
 | 
						|
 | 
						|
	// Copy match from history
 | 
						|
	MOVQ R12, CX
 | 
						|
	SUBQ R11, CX
 | 
						|
	JLS  copy_match
 | 
						|
	MOVQ 48(SP), R14
 | 
						|
	SUBQ CX, R14
 | 
						|
	CMPQ R13, CX
 | 
						|
	JG   copy_all_from_history
 | 
						|
	MOVQ R13, CX
 | 
						|
	SUBQ $0x10, CX
 | 
						|
	JB   copy_4_small
 | 
						|
 | 
						|
copy_4_loop:
 | 
						|
	MOVUPS (R14), X0
 | 
						|
	MOVUPS X0, (R9)
 | 
						|
	ADDQ   $0x10, R14
 | 
						|
	ADDQ   $0x10, R9
 | 
						|
	SUBQ   $0x10, CX
 | 
						|
	JAE    copy_4_loop
 | 
						|
	LEAQ   16(R14)(CX*1), R14
 | 
						|
	LEAQ   16(R9)(CX*1), R9
 | 
						|
	MOVUPS -16(R14), X0
 | 
						|
	MOVUPS X0, -16(R9)
 | 
						|
	JMP    copy_4_end
 | 
						|
 | 
						|
copy_4_small:
 | 
						|
	CMPQ R13, $0x03
 | 
						|
	JE   copy_4_move_3
 | 
						|
	CMPQ R13, $0x08
 | 
						|
	JB   copy_4_move_4through7
 | 
						|
	JMP  copy_4_move_8through16
 | 
						|
 | 
						|
copy_4_move_3:
 | 
						|
	MOVW (R14), CX
 | 
						|
	MOVB 2(R14), R12
 | 
						|
	MOVW CX, (R9)
 | 
						|
	MOVB R12, 2(R9)
 | 
						|
	ADDQ R13, R14
 | 
						|
	ADDQ R13, R9
 | 
						|
	JMP  copy_4_end
 | 
						|
 | 
						|
copy_4_move_4through7:
 | 
						|
	MOVL (R14), CX
 | 
						|
	MOVL -4(R14)(R13*1), R12
 | 
						|
	MOVL CX, (R9)
 | 
						|
	MOVL R12, -4(R9)(R13*1)
 | 
						|
	ADDQ R13, R14
 | 
						|
	ADDQ R13, R9
 | 
						|
	JMP  copy_4_end
 | 
						|
 | 
						|
copy_4_move_8through16:
 | 
						|
	MOVQ (R14), CX
 | 
						|
	MOVQ -8(R14)(R13*1), R12
 | 
						|
	MOVQ CX, (R9)
 | 
						|
	MOVQ R12, -8(R9)(R13*1)
 | 
						|
	ADDQ R13, R14
 | 
						|
	ADDQ R13, R9
 | 
						|
 | 
						|
copy_4_end:
 | 
						|
	ADDQ R13, R11
 | 
						|
	JMP  handle_loop
 | 
						|
	JMP loop_finished
 | 
						|
 | 
						|
copy_all_from_history:
 | 
						|
	MOVQ CX, R15
 | 
						|
	SUBQ $0x10, R15
 | 
						|
	JB   copy_5_small
 | 
						|
 | 
						|
copy_5_loop:
 | 
						|
	MOVUPS (R14), X0
 | 
						|
	MOVUPS X0, (R9)
 | 
						|
	ADDQ   $0x10, R14
 | 
						|
	ADDQ   $0x10, R9
 | 
						|
	SUBQ   $0x10, R15
 | 
						|
	JAE    copy_5_loop
 | 
						|
	LEAQ   16(R14)(R15*1), R14
 | 
						|
	LEAQ   16(R9)(R15*1), R9
 | 
						|
	MOVUPS -16(R14), X0
 | 
						|
	MOVUPS X0, -16(R9)
 | 
						|
	JMP    copy_5_end
 | 
						|
 | 
						|
copy_5_small:
 | 
						|
	CMPQ CX, $0x03
 | 
						|
	JE   copy_5_move_3
 | 
						|
	JB   copy_5_move_1or2
 | 
						|
	CMPQ CX, $0x08
 | 
						|
	JB   copy_5_move_4through7
 | 
						|
	JMP  copy_5_move_8through16
 | 
						|
 | 
						|
copy_5_move_1or2:
 | 
						|
	MOVB (R14), R15
 | 
						|
	MOVB -1(R14)(CX*1), BP
 | 
						|
	MOVB R15, (R9)
 | 
						|
	MOVB BP, -1(R9)(CX*1)
 | 
						|
	ADDQ CX, R14
 | 
						|
	ADDQ CX, R9
 | 
						|
	JMP  copy_5_end
 | 
						|
 | 
						|
copy_5_move_3:
 | 
						|
	MOVW (R14), R15
 | 
						|
	MOVB 2(R14), BP
 | 
						|
	MOVW R15, (R9)
 | 
						|
	MOVB BP, 2(R9)
 | 
						|
	ADDQ CX, R14
 | 
						|
	ADDQ CX, R9
 | 
						|
	JMP  copy_5_end
 | 
						|
 | 
						|
copy_5_move_4through7:
 | 
						|
	MOVL (R14), R15
 | 
						|
	MOVL -4(R14)(CX*1), BP
 | 
						|
	MOVL R15, (R9)
 | 
						|
	MOVL BP, -4(R9)(CX*1)
 | 
						|
	ADDQ CX, R14
 | 
						|
	ADDQ CX, R9
 | 
						|
	JMP  copy_5_end
 | 
						|
 | 
						|
copy_5_move_8through16:
 | 
						|
	MOVQ (R14), R15
 | 
						|
	MOVQ -8(R14)(CX*1), BP
 | 
						|
	MOVQ R15, (R9)
 | 
						|
	MOVQ BP, -8(R9)(CX*1)
 | 
						|
	ADDQ CX, R14
 | 
						|
	ADDQ CX, R9
 | 
						|
 | 
						|
copy_5_end:
 | 
						|
	ADDQ CX, R11
 | 
						|
	SUBQ CX, R13
 | 
						|
 | 
						|
	// Copy match from the current buffer
 | 
						|
copy_match:
 | 
						|
	MOVQ R9, CX
 | 
						|
	SUBQ R12, CX
 | 
						|
 | 
						|
	// ml <= mo
 | 
						|
	CMPQ R13, R12
 | 
						|
	JA   copy_overlapping_match
 | 
						|
 | 
						|
	// Copy non-overlapping match
 | 
						|
	ADDQ R13, R11
 | 
						|
	MOVQ R9, R12
 | 
						|
	ADDQ R13, R9
 | 
						|
 | 
						|
copy_2:
 | 
						|
	MOVUPS (CX), X0
 | 
						|
	MOVUPS X0, (R12)
 | 
						|
	ADDQ   $0x10, CX
 | 
						|
	ADDQ   $0x10, R12
 | 
						|
	SUBQ   $0x10, R13
 | 
						|
	JHI    copy_2
 | 
						|
	JMP    handle_loop
 | 
						|
 | 
						|
	// Copy overlapping match
 | 
						|
copy_overlapping_match:
 | 
						|
	ADDQ R13, R11
 | 
						|
 | 
						|
copy_slow_3:
 | 
						|
	MOVB (CX), R12
 | 
						|
	MOVB R12, (R9)
 | 
						|
	INCQ CX
 | 
						|
	INCQ R9
 | 
						|
	DECQ R13
 | 
						|
	JNZ  copy_slow_3
 | 
						|
 | 
						|
handle_loop:
 | 
						|
	MOVQ ctx+16(FP), CX
 | 
						|
	DECQ 96(CX)
 | 
						|
	JNS  sequenceDecs_decodeSync_bmi2_main_loop
 | 
						|
 | 
						|
loop_finished:
 | 
						|
	MOVQ br+8(FP), CX
 | 
						|
	MOVQ AX, 24(CX)
 | 
						|
	MOVB DL, 40(CX)
 | 
						|
	MOVQ BX, 32(CX)
 | 
						|
 | 
						|
	// Update the context
 | 
						|
	MOVQ ctx+16(FP), AX
 | 
						|
	MOVQ R11, 136(AX)
 | 
						|
	MOVQ 144(AX), CX
 | 
						|
	SUBQ CX, R10
 | 
						|
	MOVQ R10, 168(AX)
 | 
						|
 | 
						|
	// Return success
 | 
						|
	MOVQ $0x00000000, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with match length error
 | 
						|
sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
 | 
						|
	MOVQ 16(SP), AX
 | 
						|
	MOVQ ctx+16(FP), CX
 | 
						|
	MOVQ AX, 216(CX)
 | 
						|
	MOVQ $0x00000001, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with match too long error
 | 
						|
sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
 | 
						|
	MOVQ ctx+16(FP), AX
 | 
						|
	MOVQ 16(SP), CX
 | 
						|
	MOVQ CX, 216(AX)
 | 
						|
	MOVQ $0x00000002, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with match offset too long error
 | 
						|
error_match_off_too_big:
 | 
						|
	MOVQ ctx+16(FP), AX
 | 
						|
	MOVQ 8(SP), CX
 | 
						|
	MOVQ CX, 224(AX)
 | 
						|
	MOVQ R11, 136(AX)
 | 
						|
	MOVQ $0x00000003, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with not enough literals error
 | 
						|
error_not_enough_literals:
 | 
						|
	MOVQ ctx+16(FP), AX
 | 
						|
	MOVQ 24(SP), CX
 | 
						|
	MOVQ CX, 208(AX)
 | 
						|
	MOVQ $0x00000004, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with overread error
 | 
						|
error_overread:
 | 
						|
	MOVQ $0x00000006, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with not enough output space error
 | 
						|
error_not_enough_space:
 | 
						|
	MOVQ ctx+16(FP), AX
 | 
						|
	MOVQ 24(SP), CX
 | 
						|
	MOVQ CX, 208(AX)
 | 
						|
	MOVQ 16(SP), CX
 | 
						|
	MOVQ CX, 216(AX)
 | 
						|
	MOVQ R11, 136(AX)
 | 
						|
	MOVQ $0x00000005, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
//
// Decodes zstd sequences and executes them (copies literals and matches into
// s.out) in a single pass. This is the "safe" variant: every copy is
// bounds-checked against the output capacity before it is performed.
//
// Return code written to ret+24(FP):
//   0 = success
//   1 = zero match length paired with non-zero offset (mismatch)
//   2 = match length too big
//   3 = match offset too big
//   4 = not enough literals
//   5 = not enough output space
//   6 = bitreader overread (bitsRead > 64)
//
// Register roles in the main loop (established by the loads below):
//   DX  = bitreader value (bit buffer)        BX = bitsRead
//   SI  = input bytes remaining               DI = literal-length FSE state
//   R8  = match-length FSE state              R9 = offset FSE state
//   R10 = output write pointer                R11 = literals read pointer
//   R12 = output position counter             R13/R14/R15 = scratch
//
// Stack frame (64 bytes):
//   (SP)   = bitreader input cursor
//   8(SP)  = decoded offset                16(SP) = decoded match length
//   24(SP) = decoded literal length        32(SP) = past-end pointer of s.out
//   40(SP) = history length (from ctx)     48(SP) = past-end pointer of history
//   56(SP) = window size limit (from ctx)  — NOTE(review): 40/48/56(SP) field
//            names inferred from their use in check_offset; confirm in ctx struct.
TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
	// Load the bitReader fields: value, bitsRead, and the cursor
	// (base pointer + remaining offset) saved to (SP).
	MOVQ    br+8(FP), CX
	MOVQ    24(CX), DX
	MOVBQZX 40(CX), BX
	MOVQ    (CX), AX
	MOVQ    32(CX), SI
	ADDQ    SI, AX
	MOVQ    AX, (SP)
	// Load decoder context: the three FSE states and output bookkeeping.
	MOVQ    ctx+16(FP), AX
	MOVQ    72(AX), DI
	MOVQ    80(AX), R8
	MOVQ    88(AX), R9
	// Zero the per-sequence scratch slots (offset / matchLen / litLen).
	XORQ    CX, CX
	MOVQ    CX, 8(SP)
	MOVQ    CX, 16(SP)
	MOVQ    CX, 24(SP)
	MOVQ    112(AX), R10
	MOVQ    128(AX), CX
	MOVQ    CX, 32(SP)
	MOVQ    144(AX), R11
	MOVQ    136(AX), R12
	MOVQ    200(AX), CX
	MOVQ    CX, 56(SP)
	MOVQ    176(AX), CX
	MOVQ    CX, 48(SP)
	MOVQ    184(AX), AX
	MOVQ    AX, 40(SP)
	MOVQ    40(SP), AX
	ADDQ    AX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R10, 32(SP)

	// outBase += outPosition
	ADDQ R12, R10

sequenceDecs_decodeSync_safe_amd64_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	// Fast path: >= 8 input bytes left, refill DX with a whole-word load.
	CMPQ SI, $0x08
	JL   sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP  sequenceDecs_decodeSync_safe_amd64_fill_end

sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
	// Slow path: refill one byte at a time until < 8 bits are pending
	// or the input is exhausted.
	CMPQ    SI, $0x00
	JLE     sequenceDecs_decodeSync_safe_amd64_fill_check_overread
	CMPQ    BX, $0x07
	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
	SHLQ    $0x08, DX
	SUBQ    $0x01, R13
	SUBQ    $0x01, SI
	SUBQ    $0x08, BX
	MOVBQZX (R13), AX
	ORQ     AX, DX
	JMP     sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte

sequenceDecs_decodeSync_safe_amd64_fill_check_overread:
	// Input exhausted: bitsRead > 64 means we consumed past the stream end.
	CMPQ BX, $0x40
	JA   error_overread

sequenceDecs_decodeSync_safe_amd64_fill_end:
	// Update offset
	// The packed state in R9 holds: byte 1 = number of extra bits to read,
	// upper 32 bits = baseline value. Extra bits (if any) are taken from DX
	// and added to the baseline; result goes to 8(SP).
	MOVQ  R9, AX
	MOVQ  BX, CX
	MOVQ  DX, R14
	SHLQ  CL, R14
	MOVB  AH, CL
	SHRQ  $0x20, AX
	TESTQ CX, CX
	JZ    sequenceDecs_decodeSync_safe_amd64_of_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decodeSync_safe_amd64_of_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decodeSync_safe_amd64_of_update_zero
	NEGQ  CX
	SHRQ  CL, R14
	ADDQ  R14, AX

sequenceDecs_decodeSync_safe_amd64_of_update_zero:
	MOVQ AX, 8(SP)

	// Update match length
	// Same extra-bits extraction as the offset above, on the R8 state.
	MOVQ  R8, AX
	MOVQ  BX, CX
	MOVQ  DX, R14
	SHLQ  CL, R14
	MOVB  AH, CL
	SHRQ  $0x20, AX
	TESTQ CX, CX
	JZ    sequenceDecs_decodeSync_safe_amd64_ml_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decodeSync_safe_amd64_ml_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decodeSync_safe_amd64_ml_update_zero
	NEGQ  CX
	SHRQ  CL, R14
	ADDQ  R14, AX

sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
	MOVQ AX, 16(SP)

	// Fill bitreader to have enough for the remaining
	CMPQ SI, $0x08
	JL   sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP  sequenceDecs_decodeSync_safe_amd64_fill_2_end

sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
	CMPQ    SI, $0x00
	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread
	CMPQ    BX, $0x07
	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
	SHLQ    $0x08, DX
	SUBQ    $0x01, R13
	SUBQ    $0x01, SI
	SUBQ    $0x08, BX
	MOVBQZX (R13), AX
	ORQ     AX, DX
	JMP     sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte

sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread:
	CMPQ BX, $0x40
	JA   error_overread

sequenceDecs_decodeSync_safe_amd64_fill_2_end:
	// Update literal length
	// Same extra-bits extraction, on the DI state; result goes to 24(SP).
	MOVQ  DI, AX
	MOVQ  BX, CX
	MOVQ  DX, R14
	SHLQ  CL, R14
	MOVB  AH, CL
	SHRQ  $0x20, AX
	TESTQ CX, CX
	JZ    sequenceDecs_decodeSync_safe_amd64_ll_update_zero
	ADDQ  CX, BX
	CMPQ  BX, $0x40
	JA    sequenceDecs_decodeSync_safe_amd64_ll_update_zero
	CMPQ  CX, $0x40
	JAE   sequenceDecs_decodeSync_safe_amd64_ll_update_zero
	NEGQ  CX
	SHRQ  CL, R14
	ADDQ  R14, AX

sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
	MOVQ AX, 24(SP)

	// Fill bitreader for state updates
	// AX = offset code's bit count (byte 1 of R9), reused below to decide
	// how repeat-offset adjustment applies. Skip FSE state advancement on
	// the final sequence (ctx iteration counter at 96(ctx) == 0).
	MOVQ    R13, (SP)
	MOVQ    R9, AX
	SHRQ    $0x08, AX
	MOVBQZX AL, AX
	MOVQ    ctx+16(FP), CX
	CMPQ    96(CX), $0x00
	JZ      sequenceDecs_decodeSync_safe_amd64_skip_update

	// Update Literal Length State
	// Low byte of the state = number of fresh bits; consume them from DX
	// and index the next table entry.
	MOVBQZX DI, R13
	SHRL    $0x10, DI
	LEAQ    (BX)(R13*1), CX
	MOVQ    DX, R14
	MOVQ    CX, BX
	ROLQ    CL, R14
	MOVL    $0x00000001, R15
	MOVB    R13, CL
	SHLL    CL, R15
	DECL    R15
	ANDQ    R15, R14
	ADDQ    R14, DI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R13
	SHRL    $0x10, R8
	LEAQ    (BX)(R13*1), CX
	MOVQ    DX, R14
	MOVQ    CX, BX
	ROLQ    CL, R14
	MOVL    $0x00000001, R15
	MOVB    R13, CL
	SHLL    CL, R15
	DECL    R15
	ANDQ    R15, R14
	ADDQ    R14, R8

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R13
	SHRL    $0x10, R9
	LEAQ    (BX)(R13*1), CX
	MOVQ    DX, R14
	MOVQ    CX, BX
	ROLQ    CL, R14
	MOVL    $0x00000001, R15
	MOVB    R13, CL
	SHLL    CL, R15
	DECL    R15
	ANDQ    R15, R14
	ADDQ    R14, R9

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decodeSync_safe_amd64_skip_update:
	// Adjust offset
	// If the offset code used > 1 bit it is a real offset: shift the
	// repeat-offset history (prevOffset[0..2] at 144..160(s)) down and
	// install the new value.
	MOVQ   s+0(FP), CX
	MOVQ   8(SP), R13
	CMPQ   AX, $0x01
	JBE    sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
	MOVUPS 144(CX), X0
	MOVQ   R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP    sequenceDecs_decodeSync_safe_amd64_after_adjust

sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
	// Offset code <= 1 bit: repeat-offset handling. litLen == 0 shifts the
	// repeat index by one (per the zstd spec).
	CMPQ 24(SP), $0x00000000
	JNE  sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
	INCQ R13
	JMP  sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero

sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
	MOVQ  144(CX), R13
	JMP   sequenceDecs_decodeSync_safe_amd64_after_adjust

sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
	// Branchless select: index 3 means prevOffset[0]-1 (CMOVs set AX=0,
	// R14=-1). The JNZ consumes the flags of the ADDQ: a zero result falls
	// through and is clamped to 1.
	MOVQ    R13, AX
	XORQ    R14, R14
	MOVQ    $-1, R15
	CMPQ    R13, $0x03
	CMOVQEQ R14, AX
	CMOVQEQ R15, R14
	ADDQ    144(CX)(AX*8), R14
	JNZ     sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
	MOVQ    $0x00000001, R14

sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
	// Rotate the repeat-offset history; skip prevOffset[2] update when the
	// selected index was 1.
	CMPQ R13, $0x01
	JZ   sequenceDecs_decodeSync_safe_amd64_adjust_skip
	MOVQ 152(CX), AX
	MOVQ AX, 160(CX)

sequenceDecs_decodeSync_safe_amd64_adjust_skip:
	MOVQ 144(CX), AX
	MOVQ AX, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_safe_amd64_after_adjust:
	MOVQ R13, 8(SP)

	// Check values
	// s.seqSize (256(s)) += litLen + matchLen; remaining-literals counter
	// (104(ctx)) -= litLen; reject impossible matchLen/offset combinations.
	MOVQ  16(SP), AX
	MOVQ  24(SP), CX
	LEAQ  (AX)(CX*1), R14
	MOVQ  s+0(FP), R15
	ADDQ  R14, 256(R15)
	MOVQ  ctx+16(FP), R14
	SUBQ  CX, 104(R14)
	JS    error_not_enough_literals
	CMPQ  AX, $0x00020002
	JA    sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ   sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
	// AX = litLen, CX = offset, R13 = matchLen for the execute phase.
	MOVQ 24(SP), AX
	MOVQ 8(SP), CX
	MOVQ 16(SP), R13

	// Check if we have enough space in s.out
	LEAQ (AX)(R13*1), R14
	ADDQ R10, R14
	CMPQ R14, 32(SP)
	JA   error_not_enough_space

	// Copy literals
	// copy_1_*: AX bytes from the literals buffer (R11) to output (R10).
	// Bulk 16-byte SSE loop with an overlapping tail store; sizes < 16 use
	// exact-width move pairs.
	TESTQ AX, AX
	JZ    check_offset
	MOVQ  AX, R14
	SUBQ  $0x10, R14
	JB    copy_1_small

copy_1_loop:
	MOVUPS (R11), X0
	MOVUPS X0, (R10)
	ADDQ   $0x10, R11
	ADDQ   $0x10, R10
	SUBQ   $0x10, R14
	JAE    copy_1_loop
	LEAQ   16(R11)(R14*1), R11
	LEAQ   16(R10)(R14*1), R10
	MOVUPS -16(R11), X0
	MOVUPS X0, -16(R10)
	JMP    copy_1_end

copy_1_small:
	CMPQ AX, $0x03
	JE   copy_1_move_3
	JB   copy_1_move_1or2
	CMPQ AX, $0x08
	JB   copy_1_move_4through7
	JMP  copy_1_move_8through16

copy_1_move_1or2:
	MOVB (R11), R14
	MOVB -1(R11)(AX*1), R15
	MOVB R14, (R10)
	MOVB R15, -1(R10)(AX*1)
	ADDQ AX, R11
	ADDQ AX, R10
	JMP  copy_1_end

copy_1_move_3:
	MOVW (R11), R14
	MOVB 2(R11), R15
	MOVW R14, (R10)
	MOVB R15, 2(R10)
	ADDQ AX, R11
	ADDQ AX, R10
	JMP  copy_1_end

copy_1_move_4through7:
	MOVL (R11), R14
	MOVL -4(R11)(AX*1), R15
	MOVL R14, (R10)
	MOVL R15, -4(R10)(AX*1)
	ADDQ AX, R11
	ADDQ AX, R10
	JMP  copy_1_end

copy_1_move_8through16:
	MOVQ (R11), R14
	MOVQ -8(R11)(AX*1), R15
	MOVQ R14, (R10)
	MOVQ R15, -8(R10)(AX*1)
	ADDQ AX, R11
	ADDQ AX, R10

copy_1_end:
	ADDQ AX, R12

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
check_offset:
	MOVQ R12, AX
	ADDQ 40(SP), AX
	CMPQ CX, AX
	JG   error_match_off_too_big
	CMPQ CX, 56(SP)
	JG   error_match_off_too_big

	// Copy match from history
	// AX = offset - outPosition; if <= 0 the whole match is inside the
	// current output buffer (copy_match). Otherwise AX bytes come from the
	// history buffer first.
	MOVQ CX, AX
	SUBQ R12, AX
	JLS  copy_match
	MOVQ 48(SP), R14
	SUBQ AX, R14
	CMPQ R13, AX
	JG   copy_all_from_history
	MOVQ R13, AX
	SUBQ $0x10, AX
	JB   copy_4_small

copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R10)
	ADDQ   $0x10, R14
	ADDQ   $0x10, R10
	SUBQ   $0x10, AX
	JAE    copy_4_loop
	LEAQ   16(R14)(AX*1), R14
	LEAQ   16(R10)(AX*1), R10
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R10)
	JMP    copy_4_end

copy_4_small:
	CMPQ R13, $0x03
	JE   copy_4_move_3
	CMPQ R13, $0x08
	JB   copy_4_move_4through7
	JMP  copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), AX
	MOVB 2(R14), CL
	MOVW AX, (R10)
	MOVB CL, 2(R10)
	ADDQ R13, R14
	ADDQ R13, R10
	JMP  copy_4_end

copy_4_move_4through7:
	MOVL (R14), AX
	MOVL -4(R14)(R13*1), CX
	MOVL AX, (R10)
	MOVL CX, -4(R10)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R10
	JMP  copy_4_end

copy_4_move_8through16:
	MOVQ (R14), AX
	MOVQ -8(R14)(R13*1), CX
	MOVQ AX, (R10)
	MOVQ CX, -8(R10)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R10

copy_4_end:
	ADDQ R13, R12
	JMP  handle_loop
	JMP loop_finished // unreachable: preceded by an unconditional JMP (generator artifact)

copy_all_from_history:
	// The match spans the history/output boundary: copy the AX history
	// bytes here, then fall through to copy_match for the remainder.
	MOVQ AX, R15
	SUBQ $0x10, R15
	JB   copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R10)
	ADDQ   $0x10, R14
	ADDQ   $0x10, R10
	SUBQ   $0x10, R15
	JAE    copy_5_loop
	LEAQ   16(R14)(R15*1), R14
	LEAQ   16(R10)(R15*1), R10
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R10)
	JMP    copy_5_end

copy_5_small:
	CMPQ AX, $0x03
	JE   copy_5_move_3
	JB   copy_5_move_1or2
	CMPQ AX, $0x08
	JB   copy_5_move_4through7
	JMP  copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(AX*1), BP
	MOVB R15, (R10)
	MOVB BP, -1(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP  copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (R10)
	MOVB BP, 2(R10)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP  copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(AX*1), BP
	MOVL R15, (R10)
	MOVL BP, -4(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP  copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(AX*1), BP
	MOVQ R15, (R10)
	MOVQ BP, -8(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10

copy_5_end:
	ADDQ AX, R12
	SUBQ AX, R13

	// Copy match from the current buffer
copy_match:
	MOVQ R10, AX
	SUBQ CX, AX

	// ml <= mo
	CMPQ R13, CX
	JA   copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, R12
	MOVQ R13, CX
	SUBQ $0x10, CX
	JB   copy_2_small

copy_2_loop:
	MOVUPS (AX), X0
	MOVUPS X0, (R10)
	ADDQ   $0x10, AX
	ADDQ   $0x10, R10
	SUBQ   $0x10, CX
	JAE    copy_2_loop
	LEAQ   16(AX)(CX*1), AX
	LEAQ   16(R10)(CX*1), R10
	MOVUPS -16(AX), X0
	MOVUPS X0, -16(R10)
	JMP    copy_2_end

copy_2_small:
	CMPQ R13, $0x03
	JE   copy_2_move_3
	JB   copy_2_move_1or2
	CMPQ R13, $0x08
	JB   copy_2_move_4through7
	JMP  copy_2_move_8through16

copy_2_move_1or2:
	MOVB (AX), CL
	MOVB -1(AX)(R13*1), R14
	MOVB CL, (R10)
	MOVB R14, -1(R10)(R13*1)
	ADDQ R13, AX
	ADDQ R13, R10
	JMP  copy_2_end

copy_2_move_3:
	MOVW (AX), CX
	MOVB 2(AX), R14
	MOVW CX, (R10)
	MOVB R14, 2(R10)
	ADDQ R13, AX
	ADDQ R13, R10
	JMP  copy_2_end

copy_2_move_4through7:
	MOVL (AX), CX
	MOVL -4(AX)(R13*1), R14
	MOVL CX, (R10)
	MOVL R14, -4(R10)(R13*1)
	ADDQ R13, AX
	ADDQ R13, R10
	JMP  copy_2_end

copy_2_move_8through16:
	MOVQ (AX), CX
	MOVQ -8(AX)(R13*1), R14
	MOVQ CX, (R10)
	MOVQ R14, -8(R10)(R13*1)
	ADDQ R13, AX
	ADDQ R13, R10

copy_2_end:
	JMP handle_loop

	// Copy overlapping match
	// Source and destination overlap (ml > mo), so a byte-at-a-time copy
	// is required to reproduce the zstd "rolling" copy semantics.
copy_overlapping_match:
	ADDQ R13, R12

copy_slow_3:
	MOVB (AX), CL
	MOVB CL, (R10)
	INCQ AX
	INCQ R10
	DECQ R13
	JNZ  copy_slow_3

handle_loop:
	// Decrement the sequence counter; loop while sequences remain.
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS  sequenceDecs_decodeSync_safe_amd64_main_loop

loop_finished:
	// Store the bitreader state back into *br.
	MOVQ br+8(FP), AX
	MOVQ DX, 24(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 32(AX)

	// Update the context
	MOVQ ctx+16(FP), AX
	MOVQ R12, 136(AX)
	MOVQ 144(AX), CX
	SUBQ CX, R11
	MOVQ R11, 168(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET
 | 
						|
 | 
						|
// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
 | 
						|
// Requires: BMI, BMI2, CMOV, SSE
 | 
						|
TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
 | 
						|
	MOVQ    br+8(FP), BX
 | 
						|
	MOVQ    24(BX), AX
 | 
						|
	MOVBQZX 40(BX), DX
 | 
						|
	MOVQ    (BX), CX
 | 
						|
	MOVQ    32(BX), BX
 | 
						|
	ADDQ    BX, CX
 | 
						|
	MOVQ    CX, (SP)
 | 
						|
	MOVQ    ctx+16(FP), CX
 | 
						|
	MOVQ    72(CX), SI
 | 
						|
	MOVQ    80(CX), DI
 | 
						|
	MOVQ    88(CX), R8
 | 
						|
	XORQ    R9, R9
 | 
						|
	MOVQ    R9, 8(SP)
 | 
						|
	MOVQ    R9, 16(SP)
 | 
						|
	MOVQ    R9, 24(SP)
 | 
						|
	MOVQ    112(CX), R9
 | 
						|
	MOVQ    128(CX), R10
 | 
						|
	MOVQ    R10, 32(SP)
 | 
						|
	MOVQ    144(CX), R10
 | 
						|
	MOVQ    136(CX), R11
 | 
						|
	MOVQ    200(CX), R12
 | 
						|
	MOVQ    R12, 56(SP)
 | 
						|
	MOVQ    176(CX), R12
 | 
						|
	MOVQ    R12, 48(SP)
 | 
						|
	MOVQ    184(CX), CX
 | 
						|
	MOVQ    CX, 40(SP)
 | 
						|
	MOVQ    40(SP), CX
 | 
						|
	ADDQ    CX, 48(SP)
 | 
						|
 | 
						|
	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
 | 
						|
	ADDQ R9, 32(SP)
 | 
						|
 | 
						|
	// outBase += outPosition
 | 
						|
	ADDQ R11, R9
 | 
						|
 | 
						|
sequenceDecs_decodeSync_safe_bmi2_main_loop:
 | 
						|
	MOVQ (SP), R12
 | 
						|
 | 
						|
	// Fill bitreader to have enough for the offset and match length.
 | 
						|
	CMPQ BX, $0x08
 | 
						|
	JL   sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
 | 
						|
	MOVQ DX, CX
 | 
						|
	SHRQ $0x03, CX
 | 
						|
	SUBQ CX, R12
 | 
						|
	MOVQ (R12), AX
 | 
						|
	SUBQ CX, BX
 | 
						|
	ANDQ $0x07, DX
 | 
						|
	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_end
 | 
						|
 | 
						|
sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
 | 
						|
	CMPQ    BX, $0x00
 | 
						|
	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_check_overread
 | 
						|
	CMPQ    DX, $0x07
 | 
						|
	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
 | 
						|
	SHLQ    $0x08, AX
 | 
						|
	SUBQ    $0x01, R12
 | 
						|
	SUBQ    $0x01, BX
 | 
						|
	SUBQ    $0x08, DX
 | 
						|
	MOVBQZX (R12), CX
 | 
						|
	ORQ     CX, AX
 | 
						|
	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
 | 
						|
 | 
						|
sequenceDecs_decodeSync_safe_bmi2_fill_check_overread:
 | 
						|
	CMPQ DX, $0x40
 | 
						|
	JA   error_overread
 | 
						|
 | 
						|
sequenceDecs_decodeSync_safe_bmi2_fill_end:
 | 
						|
	// Update offset
 | 
						|
	MOVQ   $0x00000808, CX
 | 
						|
	BEXTRQ CX, R8, R13
 | 
						|
	MOVQ   AX, R14
 | 
						|
	LEAQ   (DX)(R13*1), CX
 | 
						|
	ROLQ   CL, R14
 | 
						|
	BZHIQ  R13, R14, R14
 | 
						|
	MOVQ   CX, DX
 | 
						|
	MOVQ   R8, CX
 | 
						|
	SHRQ   $0x20, CX
 | 
						|
	ADDQ   R14, CX
 | 
						|
	MOVQ   CX, 8(SP)
 | 
						|
 | 
						|
	// Update match length
 | 
						|
	MOVQ   $0x00000808, CX
 | 
						|
	BEXTRQ CX, DI, R13
 | 
						|
	MOVQ   AX, R14
 | 
						|
	LEAQ   (DX)(R13*1), CX
 | 
						|
	ROLQ   CL, R14
 | 
						|
	BZHIQ  R13, R14, R14
 | 
						|
	MOVQ   CX, DX
 | 
						|
	MOVQ   DI, CX
 | 
						|
	SHRQ   $0x20, CX
 | 
						|
	ADDQ   R14, CX
 | 
						|
	MOVQ   CX, 16(SP)
 | 
						|
 | 
						|
	// Fill bitreader to have enough for the remaining
 | 
						|
	CMPQ BX, $0x08
 | 
						|
	JL   sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
 | 
						|
	MOVQ DX, CX
 | 
						|
	SHRQ $0x03, CX
 | 
						|
	SUBQ CX, R12
 | 
						|
	MOVQ (R12), AX
 | 
						|
	SUBQ CX, BX
 | 
						|
	ANDQ $0x07, DX
 | 
						|
	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_2_end
 | 
						|
 | 
						|
sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
 | 
						|
	CMPQ    BX, $0x00
 | 
						|
	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread
 | 
						|
	CMPQ    DX, $0x07
 | 
						|
	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
 | 
						|
	SHLQ    $0x08, AX
 | 
						|
	SUBQ    $0x01, R12
 | 
						|
	SUBQ    $0x01, BX
 | 
						|
	SUBQ    $0x08, DX
 | 
						|
	MOVBQZX (R12), CX
 | 
						|
	ORQ     CX, AX
 | 
						|
	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
 | 
						|
 | 
						|
sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread:
 | 
						|
	CMPQ DX, $0x40
 | 
						|
	JA   error_overread
 | 
						|
 | 
						|
sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
 | 
						|
	// Update literal length
 | 
						|
	MOVQ   $0x00000808, CX
 | 
						|
	BEXTRQ CX, SI, R13
 | 
						|
	MOVQ   AX, R14
 | 
						|
	LEAQ   (DX)(R13*1), CX
 | 
						|
	ROLQ   CL, R14
 | 
						|
	BZHIQ  R13, R14, R14
 | 
						|
	MOVQ   CX, DX
 | 
						|
	MOVQ   SI, CX
 | 
						|
	SHRQ   $0x20, CX
 | 
						|
	ADDQ   R14, CX
 | 
						|
	MOVQ   CX, 24(SP)
 | 
						|
 | 
						|
	// Fill bitreader for state updates
 | 
						|
	MOVQ    R12, (SP)
 | 
						|
	MOVQ    $0x00000808, CX
 | 
						|
	BEXTRQ  CX, R8, R12
 | 
						|
	MOVQ    ctx+16(FP), CX
 | 
						|
	CMPQ    96(CX), $0x00
 | 
						|
	JZ      sequenceDecs_decodeSync_safe_bmi2_skip_update
 | 
						|
	LEAQ    (SI)(DI*1), R13
 | 
						|
	ADDQ    R8, R13
 | 
						|
	MOVBQZX R13, R13
 | 
						|
	LEAQ    (DX)(R13*1), CX
 | 
						|
	MOVQ    AX, R14
 | 
						|
	MOVQ    CX, DX
 | 
						|
	ROLQ    CL, R14
 | 
						|
	BZHIQ   R13, R14, R14
 | 
						|
 | 
						|
	// Update Offset State
 | 
						|
	BZHIQ R8, R14, CX
 | 
						|
	SHRXQ R8, R14, R14
 | 
						|
	SHRL  $0x10, R8
 | 
						|
	ADDQ  CX, R8
 | 
						|
 | 
						|
	// Load ctx.ofTable
 | 
						|
	MOVQ ctx+16(FP), CX
 | 
						|
	MOVQ 48(CX), CX
 | 
						|
	MOVQ (CX)(R8*8), R8
 | 
						|
 | 
						|
	// Update Match Length State
 | 
						|
	BZHIQ DI, R14, CX
 | 
						|
	SHRXQ DI, R14, R14
 | 
						|
	SHRL  $0x10, DI
 | 
						|
	ADDQ  CX, DI
 | 
						|
 | 
						|
	// Load ctx.mlTable
 | 
						|
	MOVQ ctx+16(FP), CX
 | 
						|
	MOVQ 24(CX), CX
 | 
						|
	MOVQ (CX)(DI*8), DI
 | 
						|
 | 
						|
	// Update Literal Length State
 | 
						|
	BZHIQ SI, R14, CX
 | 
						|
	SHRL  $0x10, SI
 | 
						|
	ADDQ  CX, SI
 | 
						|
 | 
						|
	// Load ctx.llTable
 | 
						|
	MOVQ ctx+16(FP), CX
 | 
						|
	MOVQ (CX), CX
 | 
						|
	MOVQ (CX)(SI*8), SI
 | 
						|
 | 
						|
sequenceDecs_decodeSync_safe_bmi2_skip_update:
 | 
						|
	// Adjust offset
 | 
						|
	MOVQ   s+0(FP), CX
 | 
						|
	MOVQ   8(SP), R13
 | 
						|
	CMPQ   R12, $0x01
 | 
						|
	JBE    sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
 | 
						|
	MOVUPS 144(CX), X0
 | 
						|
	MOVQ   R13, 144(CX)
 | 
						|
	MOVUPS X0, 152(CX)
 | 
						|
	JMP    sequenceDecs_decodeSync_safe_bmi2_after_adjust
 | 
						|
 | 
						|
sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
 | 
						|
	CMPQ 24(SP), $0x00000000
 | 
						|
	JNE  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
 | 
						|
	INCQ R13
 | 
						|
	JMP  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
 | 
						|
 | 
						|
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
 | 
						|
	TESTQ R13, R13
 | 
						|
	JNZ   sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
 | 
						|
	MOVQ  144(CX), R13
 | 
						|
	JMP   sequenceDecs_decodeSync_safe_bmi2_after_adjust
 | 
						|
 | 
						|
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
 | 
						|
	MOVQ    R13, R12
 | 
						|
	XORQ    R14, R14
 | 
						|
	MOVQ    $-1, R15
 | 
						|
	CMPQ    R13, $0x03
 | 
						|
	CMOVQEQ R14, R12
 | 
						|
	CMOVQEQ R15, R14
 | 
						|
	ADDQ    144(CX)(R12*8), R14
 | 
						|
	JNZ     sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
 | 
						|
	MOVQ    $0x00000001, R14
 | 
						|
 | 
						|
sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
 | 
						|
	CMPQ R13, $0x01
 | 
						|
	JZ   sequenceDecs_decodeSync_safe_bmi2_adjust_skip
 | 
						|
	MOVQ 152(CX), R12
 | 
						|
	MOVQ R12, 160(CX)
 | 
						|
 | 
						|
sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
 | 
						|
	MOVQ 144(CX), R12
 | 
						|
	MOVQ R12, 152(CX)
 | 
						|
	MOVQ R14, 144(CX)
 | 
						|
	MOVQ R14, R13
 | 
						|
 | 
						|
sequenceDecs_decodeSync_safe_bmi2_after_adjust:
 | 
						|
	MOVQ R13, 8(SP)
 | 
						|
 | 
						|
	// Check values
 | 
						|
	MOVQ  16(SP), CX
 | 
						|
	MOVQ  24(SP), R12
 | 
						|
	LEAQ  (CX)(R12*1), R14
 | 
						|
	MOVQ  s+0(FP), R15
 | 
						|
	ADDQ  R14, 256(R15)
 | 
						|
	MOVQ  ctx+16(FP), R14
 | 
						|
	SUBQ  R12, 104(R14)
 | 
						|
	JS    error_not_enough_literals
 | 
						|
	CMPQ  CX, $0x00020002
 | 
						|
	JA    sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
 | 
						|
	TESTQ R13, R13
 | 
						|
	JNZ   sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
 | 
						|
	TESTQ CX, CX
 | 
						|
	JNZ   sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
 | 
						|
 | 
						|
sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
 | 
						|
	MOVQ 24(SP), CX
 | 
						|
	MOVQ 8(SP), R12
 | 
						|
	MOVQ 16(SP), R13
 | 
						|
 | 
						|
	// Check if we have enough space in s.out
 | 
						|
	LEAQ (CX)(R13*1), R14
 | 
						|
	ADDQ R9, R14
 | 
						|
	CMPQ R14, 32(SP)
 | 
						|
	JA   error_not_enough_space
 | 
						|
 | 
						|
	// Copy literals
 | 
						|
	TESTQ CX, CX
 | 
						|
	JZ    check_offset
 | 
						|
	MOVQ  CX, R14
 | 
						|
	SUBQ  $0x10, R14
 | 
						|
	JB    copy_1_small
 | 
						|
 | 
						|
copy_1_loop:
 | 
						|
	MOVUPS (R10), X0
 | 
						|
	MOVUPS X0, (R9)
 | 
						|
	ADDQ   $0x10, R10
 | 
						|
	ADDQ   $0x10, R9
 | 
						|
	SUBQ   $0x10, R14
 | 
						|
	JAE    copy_1_loop
 | 
						|
	LEAQ   16(R10)(R14*1), R10
 | 
						|
	LEAQ   16(R9)(R14*1), R9
 | 
						|
	MOVUPS -16(R10), X0
 | 
						|
	MOVUPS X0, -16(R9)
 | 
						|
	JMP    copy_1_end
 | 
						|
 | 
						|
copy_1_small:
 | 
						|
	CMPQ CX, $0x03
 | 
						|
	JE   copy_1_move_3
 | 
						|
	JB   copy_1_move_1or2
 | 
						|
	CMPQ CX, $0x08
 | 
						|
	JB   copy_1_move_4through7
 | 
						|
	JMP  copy_1_move_8through16
 | 
						|
 | 
						|
copy_1_move_1or2:
 | 
						|
	MOVB (R10), R14
 | 
						|
	MOVB -1(R10)(CX*1), R15
 | 
						|
	MOVB R14, (R9)
 | 
						|
	MOVB R15, -1(R9)(CX*1)
 | 
						|
	ADDQ CX, R10
 | 
						|
	ADDQ CX, R9
 | 
						|
	JMP  copy_1_end
 | 
						|
 | 
						|
copy_1_move_3:
 | 
						|
	MOVW (R10), R14
 | 
						|
	MOVB 2(R10), R15
 | 
						|
	MOVW R14, (R9)
 | 
						|
	MOVB R15, 2(R9)
 | 
						|
	ADDQ CX, R10
 | 
						|
	ADDQ CX, R9
 | 
						|
	JMP  copy_1_end
 | 
						|
 | 
						|
copy_1_move_4through7:
 | 
						|
	MOVL (R10), R14
 | 
						|
	MOVL -4(R10)(CX*1), R15
 | 
						|
	MOVL R14, (R9)
 | 
						|
	MOVL R15, -4(R9)(CX*1)
 | 
						|
	ADDQ CX, R10
 | 
						|
	ADDQ CX, R9
 | 
						|
	JMP  copy_1_end
 | 
						|
 | 
						|
copy_1_move_8through16:
 | 
						|
	MOVQ (R10), R14
 | 
						|
	MOVQ -8(R10)(CX*1), R15
 | 
						|
	MOVQ R14, (R9)
 | 
						|
	MOVQ R15, -8(R9)(CX*1)
 | 
						|
	ADDQ CX, R10
 | 
						|
	ADDQ CX, R9
 | 
						|
 | 
						|
copy_1_end:
 | 
						|
	ADDQ CX, R11
 | 
						|
 | 
						|
	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
 | 
						|
check_offset:
 | 
						|
	MOVQ R11, CX
 | 
						|
	ADDQ 40(SP), CX
 | 
						|
	CMPQ R12, CX
 | 
						|
	JG   error_match_off_too_big
 | 
						|
	CMPQ R12, 56(SP)
 | 
						|
	JG   error_match_off_too_big
 | 
						|
 | 
						|
	// Copy match from history
 | 
						|
	MOVQ R12, CX
 | 
						|
	SUBQ R11, CX
 | 
						|
	JLS  copy_match
 | 
						|
	MOVQ 48(SP), R14
 | 
						|
	SUBQ CX, R14
 | 
						|
	CMPQ R13, CX
 | 
						|
	JG   copy_all_from_history
 | 
						|
	MOVQ R13, CX
 | 
						|
	SUBQ $0x10, CX
 | 
						|
	JB   copy_4_small
 | 
						|
 | 
						|
copy_4_loop:
 | 
						|
	MOVUPS (R14), X0
 | 
						|
	MOVUPS X0, (R9)
 | 
						|
	ADDQ   $0x10, R14
 | 
						|
	ADDQ   $0x10, R9
 | 
						|
	SUBQ   $0x10, CX
 | 
						|
	JAE    copy_4_loop
 | 
						|
	LEAQ   16(R14)(CX*1), R14
 | 
						|
	LEAQ   16(R9)(CX*1), R9
 | 
						|
	MOVUPS -16(R14), X0
 | 
						|
	MOVUPS X0, -16(R9)
 | 
						|
	JMP    copy_4_end
 | 
						|
 | 
						|
copy_4_small:
 | 
						|
	CMPQ R13, $0x03
 | 
						|
	JE   copy_4_move_3
 | 
						|
	CMPQ R13, $0x08
 | 
						|
	JB   copy_4_move_4through7
 | 
						|
	JMP  copy_4_move_8through16
 | 
						|
 | 
						|
copy_4_move_3:
 | 
						|
	MOVW (R14), CX
 | 
						|
	MOVB 2(R14), R12
 | 
						|
	MOVW CX, (R9)
 | 
						|
	MOVB R12, 2(R9)
 | 
						|
	ADDQ R13, R14
 | 
						|
	ADDQ R13, R9
 | 
						|
	JMP  copy_4_end
 | 
						|
 | 
						|
copy_4_move_4through7:
 | 
						|
	MOVL (R14), CX
 | 
						|
	MOVL -4(R14)(R13*1), R12
 | 
						|
	MOVL CX, (R9)
 | 
						|
	MOVL R12, -4(R9)(R13*1)
 | 
						|
	ADDQ R13, R14
 | 
						|
	ADDQ R13, R9
 | 
						|
	JMP  copy_4_end
 | 
						|
 | 
						|
copy_4_move_8through16:
 | 
						|
	MOVQ (R14), CX
 | 
						|
	MOVQ -8(R14)(R13*1), R12
 | 
						|
	MOVQ CX, (R9)
 | 
						|
	MOVQ R12, -8(R9)(R13*1)
 | 
						|
	ADDQ R13, R14
 | 
						|
	ADDQ R13, R9
 | 
						|
 | 
						|
copy_4_end:
 | 
						|
	ADDQ R13, R11
 | 
						|
	JMP  handle_loop
 | 
						|
	JMP loop_finished
 | 
						|
 | 
						|
copy_all_from_history:
 | 
						|
	MOVQ CX, R15
 | 
						|
	SUBQ $0x10, R15
 | 
						|
	JB   copy_5_small
 | 
						|
 | 
						|
copy_5_loop:
 | 
						|
	MOVUPS (R14), X0
 | 
						|
	MOVUPS X0, (R9)
 | 
						|
	ADDQ   $0x10, R14
 | 
						|
	ADDQ   $0x10, R9
 | 
						|
	SUBQ   $0x10, R15
 | 
						|
	JAE    copy_5_loop
 | 
						|
	LEAQ   16(R14)(R15*1), R14
 | 
						|
	LEAQ   16(R9)(R15*1), R9
 | 
						|
	MOVUPS -16(R14), X0
 | 
						|
	MOVUPS X0, -16(R9)
 | 
						|
	JMP    copy_5_end
 | 
						|
 | 
						|
copy_5_small:
 | 
						|
	CMPQ CX, $0x03
 | 
						|
	JE   copy_5_move_3
 | 
						|
	JB   copy_5_move_1or2
 | 
						|
	CMPQ CX, $0x08
 | 
						|
	JB   copy_5_move_4through7
 | 
						|
	JMP  copy_5_move_8through16
 | 
						|
 | 
						|
copy_5_move_1or2:
 | 
						|
	MOVB (R14), R15
 | 
						|
	MOVB -1(R14)(CX*1), BP
 | 
						|
	MOVB R15, (R9)
 | 
						|
	MOVB BP, -1(R9)(CX*1)
 | 
						|
	ADDQ CX, R14
 | 
						|
	ADDQ CX, R9
 | 
						|
	JMP  copy_5_end
 | 
						|
 | 
						|
copy_5_move_3:
 | 
						|
	MOVW (R14), R15
 | 
						|
	MOVB 2(R14), BP
 | 
						|
	MOVW R15, (R9)
 | 
						|
	MOVB BP, 2(R9)
 | 
						|
	ADDQ CX, R14
 | 
						|
	ADDQ CX, R9
 | 
						|
	JMP  copy_5_end
 | 
						|
 | 
						|
copy_5_move_4through7:
 | 
						|
	MOVL (R14), R15
 | 
						|
	MOVL -4(R14)(CX*1), BP
 | 
						|
	MOVL R15, (R9)
 | 
						|
	MOVL BP, -4(R9)(CX*1)
 | 
						|
	ADDQ CX, R14
 | 
						|
	ADDQ CX, R9
 | 
						|
	JMP  copy_5_end
 | 
						|
 | 
						|
copy_5_move_8through16:
 | 
						|
	MOVQ (R14), R15
 | 
						|
	MOVQ -8(R14)(CX*1), BP
 | 
						|
	MOVQ R15, (R9)
 | 
						|
	MOVQ BP, -8(R9)(CX*1)
 | 
						|
	ADDQ CX, R14
 | 
						|
	ADDQ CX, R9
 | 
						|
 | 
						|
copy_5_end:
 | 
						|
	ADDQ CX, R11
 | 
						|
	SUBQ CX, R13
 | 
						|
 | 
						|
	// Copy match from the current buffer
 | 
						|
copy_match:
 | 
						|
	MOVQ R9, CX
 | 
						|
	SUBQ R12, CX
 | 
						|
 | 
						|
	// ml <= mo
 | 
						|
	CMPQ R13, R12
 | 
						|
	JA   copy_overlapping_match
 | 
						|
 | 
						|
	// Copy non-overlapping match
 | 
						|
	ADDQ R13, R11
 | 
						|
	MOVQ R13, R12
 | 
						|
	SUBQ $0x10, R12
 | 
						|
	JB   copy_2_small
 | 
						|
 | 
						|
copy_2_loop:
 | 
						|
	MOVUPS (CX), X0
 | 
						|
	MOVUPS X0, (R9)
 | 
						|
	ADDQ   $0x10, CX
 | 
						|
	ADDQ   $0x10, R9
 | 
						|
	SUBQ   $0x10, R12
 | 
						|
	JAE    copy_2_loop
 | 
						|
	LEAQ   16(CX)(R12*1), CX
 | 
						|
	LEAQ   16(R9)(R12*1), R9
 | 
						|
	MOVUPS -16(CX), X0
 | 
						|
	MOVUPS X0, -16(R9)
 | 
						|
	JMP    copy_2_end
 | 
						|
 | 
						|
copy_2_small:
 | 
						|
	CMPQ R13, $0x03
 | 
						|
	JE   copy_2_move_3
 | 
						|
	JB   copy_2_move_1or2
 | 
						|
	CMPQ R13, $0x08
 | 
						|
	JB   copy_2_move_4through7
 | 
						|
	JMP  copy_2_move_8through16
 | 
						|
 | 
						|
copy_2_move_1or2:
 | 
						|
	MOVB (CX), R12
 | 
						|
	MOVB -1(CX)(R13*1), R14
 | 
						|
	MOVB R12, (R9)
 | 
						|
	MOVB R14, -1(R9)(R13*1)
 | 
						|
	ADDQ R13, CX
 | 
						|
	ADDQ R13, R9
 | 
						|
	JMP  copy_2_end
 | 
						|
 | 
						|
copy_2_move_3:
 | 
						|
	MOVW (CX), R12
 | 
						|
	MOVB 2(CX), R14
 | 
						|
	MOVW R12, (R9)
 | 
						|
	MOVB R14, 2(R9)
 | 
						|
	ADDQ R13, CX
 | 
						|
	ADDQ R13, R9
 | 
						|
	JMP  copy_2_end
 | 
						|
 | 
						|
copy_2_move_4through7:
 | 
						|
	MOVL (CX), R12
 | 
						|
	MOVL -4(CX)(R13*1), R14
 | 
						|
	MOVL R12, (R9)
 | 
						|
	MOVL R14, -4(R9)(R13*1)
 | 
						|
	ADDQ R13, CX
 | 
						|
	ADDQ R13, R9
 | 
						|
	JMP  copy_2_end
 | 
						|
 | 
						|
copy_2_move_8through16:
 | 
						|
	MOVQ (CX), R12
 | 
						|
	MOVQ -8(CX)(R13*1), R14
 | 
						|
	MOVQ R12, (R9)
 | 
						|
	MOVQ R14, -8(R9)(R13*1)
 | 
						|
	ADDQ R13, CX
 | 
						|
	ADDQ R13, R9
 | 
						|
 | 
						|
copy_2_end:
 | 
						|
	JMP handle_loop
 | 
						|
 | 
						|
	// Copy overlapping match
 | 
						|
copy_overlapping_match:
 | 
						|
	ADDQ R13, R11
 | 
						|
 | 
						|
copy_slow_3:
 | 
						|
	MOVB (CX), R12
 | 
						|
	MOVB R12, (R9)
 | 
						|
	INCQ CX
 | 
						|
	INCQ R9
 | 
						|
	DECQ R13
 | 
						|
	JNZ  copy_slow_3
 | 
						|
 | 
						|
handle_loop:
 | 
						|
	MOVQ ctx+16(FP), CX
 | 
						|
	DECQ 96(CX)
 | 
						|
	JNS  sequenceDecs_decodeSync_safe_bmi2_main_loop
 | 
						|
 | 
						|
loop_finished:
 | 
						|
	MOVQ br+8(FP), CX
 | 
						|
	MOVQ AX, 24(CX)
 | 
						|
	MOVB DL, 40(CX)
 | 
						|
	MOVQ BX, 32(CX)
 | 
						|
 | 
						|
	// Update the context
 | 
						|
	MOVQ ctx+16(FP), AX
 | 
						|
	MOVQ R11, 136(AX)
 | 
						|
	MOVQ 144(AX), CX
 | 
						|
	SUBQ CX, R10
 | 
						|
	MOVQ R10, 168(AX)
 | 
						|
 | 
						|
	// Return success
 | 
						|
	MOVQ $0x00000000, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with match length error
 | 
						|
sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
 | 
						|
	MOVQ 16(SP), AX
 | 
						|
	MOVQ ctx+16(FP), CX
 | 
						|
	MOVQ AX, 216(CX)
 | 
						|
	MOVQ $0x00000001, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with match too long error
 | 
						|
sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
 | 
						|
	MOVQ ctx+16(FP), AX
 | 
						|
	MOVQ 16(SP), CX
 | 
						|
	MOVQ CX, 216(AX)
 | 
						|
	MOVQ $0x00000002, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with match offset too long error
 | 
						|
error_match_off_too_big:
 | 
						|
	MOVQ ctx+16(FP), AX
 | 
						|
	MOVQ 8(SP), CX
 | 
						|
	MOVQ CX, 224(AX)
 | 
						|
	MOVQ R11, 136(AX)
 | 
						|
	MOVQ $0x00000003, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with not enough literals error
 | 
						|
error_not_enough_literals:
 | 
						|
	MOVQ ctx+16(FP), AX
 | 
						|
	MOVQ 24(SP), CX
 | 
						|
	MOVQ CX, 208(AX)
 | 
						|
	MOVQ $0x00000004, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with overread error
 | 
						|
error_overread:
 | 
						|
	MOVQ $0x00000006, ret+24(FP)
 | 
						|
	RET
 | 
						|
 | 
						|
	// Return with not enough output space error
 | 
						|
error_not_enough_space:
 | 
						|
	MOVQ ctx+16(FP), AX
 | 
						|
	MOVQ 24(SP), CX
 | 
						|
	MOVQ CX, 208(AX)
 | 
						|
	MOVQ 16(SP), CX
 | 
						|
	MOVQ CX, 216(AX)
 | 
						|
	MOVQ R11, 136(AX)
 | 
						|
	MOVQ $0x00000005, ret+24(FP)
 | 
						|
	RET
 |